def test_set_for_rank_to_scores(self, data_structure, feature_values, config_path_learning):
    """Build the test-set feature file for ranking and attach learner confidence scores.

    For every (dataset, language pair, segment) it forms all unordered pairs of
    systems that translated that segment, writes one combined feature vector per
    pair to ``x_<dataset>`` in the configured output dir, then asks the learner
    for confidence scores and aligns them back to the pairs by write order.

    :param data_structure: iterable of (data_set, lang_pair, system_name, phrase_number)
        tuples describing every sentence in the test set.
    :param feature_values: 2-D table of per-sentence feature values, indexed by the
        sentence index returned by ``get_sentence_idx``.
    :param config_path_learning: path to the learning config passed through to
        ``get_confidence_scores``.
    :return: dict mapping (data_set, lang_pair, phrase_number) ->
        list of [sys1, sys2, confidence_score].
    """
    combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
    data_set_name = self.config.get('WMT', 'dataset')
    output_dir = os.path.expanduser(self.config.get('WMT', 'output_dir'))

    # Group the names of the systems that translated each segment.
    sentences_systems = defaultdict(list)
    for data_set, lang_pair, system_name, phrase_number in data_structure:
        sentences_systems[data_set, lang_pair, phrase_number].append(system_name)

    # One combined feature vector per unordered system pair, in deterministic
    # (sorted key, sorted pair) order — the same order used again below so the
    # confidence scores line up with the written rows.
    meta_data = defaultdict(list)
    with open(output_dir + '/' + 'x_' + data_set_name, 'w') as f_features:
        for key in sorted(sentences_systems.keys()):
            data_set, lang_pair, phrase_number = key
            for sys1, sys2 in sorted(combinations(sentences_systems[key], 2)):
                idx_sys1, idx_sys2 = self.get_sentence_idx(
                    data_set, lang_pair, data_structure, phrase_number, sys1, sys2)
                # Combine the two systems' values feature-by-feature using the
                # per-feature combination method from the config.
                combined_features = [
                    self.combine_feature_values(
                        combination_methods[i],
                        feature_values[idx_sys1][i],
                        feature_values[idx_sys2][i])
                    for i in range(len(feature_values[0]))
                ]
                f_features.write('\t'.join(combined_features) + '\n')
                meta_data[key].append([sys1, sys2])

    # Re-walk the pairs in the identical order and zip each with the next
    # confidence score produced by the learner.
    confidence_scores = self.get_confidence_scores(config_path_learning)
    results = defaultdict(list)
    count = 0
    for key in sorted(meta_data.keys()):
        for sys1, sys2 in sorted(meta_data[key]):
            results[key].append([sys1, sys2, confidence_scores[count]])
            count += 1
    return results
def training_set_for_rank_direct(self, data_structure, human_rankings, feature_values, ignore_ties=True):
    """Write per-language-pair training files (features, labels, metadata) for ranking.

    For each (dataset, lang_pair) in ``human_rankings`` it produces three
    tab-separated files in the configured output dir:

    - ``x_…``    one combined feature vector per human comparison,
    - ``y_…``    the corresponding class label from ``signs_to_labels``,
    - ``meta_…`` the sentence indices of the two compared system outputs.

    Comparisons whose label is ``None`` (e.g. ties when ``ignore_ties`` is
    true) are skipped entirely and produce no row in any of the three files.

    :param data_structure: sentence index structure consumed by ``get_sentence_idx``.
    :param human_rankings: dict (dataset, lang_pair) -> list of human comparison
        objects with ``sign``, ``phrase``, ``sys1``, ``sys2`` attributes.
    :param feature_values: 2-D table of per-sentence feature values.
    :param ignore_ties: forwarded to ``signs_to_labels``; ties are dropped when true.
    """
    combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
    data_set_name = self.config.get('WMT', 'dataset')
    feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")
    output_dir = os.path.expanduser(self.config.get('WMT', 'output_dir'))

    for dataset, lang_pair in sorted(human_rankings.keys()):
        # All three files share the same suffix; only the x_/y_/meta_ prefix differs.
        suffix = data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv'
        # Context managers guarantee the files are closed even if a write fails.
        with open(output_dir + '/' + 'x_' + suffix, 'w') as f_features, \
                open(output_dir + '/' + 'y_' + suffix, 'w') as f_objective, \
                open(output_dir + '/' + 'meta_' + suffix, 'w') as f_meta_data:
            for human_comparison in human_rankings[dataset, lang_pair]:
                label = self.signs_to_labels(human_comparison.sign, ignore_ties=ignore_ties)
                if label is None:
                    # Dropped comparison (e.g. a tie): emit no row anywhere.
                    continue
                f_objective.write(label + '\n')
                idx_sys1, idx_sys2 = self.get_sentence_idx(
                    dataset, lang_pair, data_structure,
                    human_comparison.phrase, human_comparison.sys1, human_comparison.sys2)
                f_meta_data.write(str(idx_sys1) + '\t' + str(idx_sys2) + '\n')
                # Combine the two systems' values feature-by-feature using the
                # per-feature combination method from the config.
                combined_features = [
                    self.combine_feature_values(
                        combination_methods[i],
                        feature_values[idx_sys1][i],
                        feature_values[idx_sys2][i])
                    for i in range(len(feature_values[0]))
                ]
                f_features.write('\t'.join(combined_features) + '\n')
def average_feature_values():
    """Print the column-wise mean of every feature in a fixed feature file.

    Reads the WMT config from ``<cwd>/config/wmt.cfg``, loads a hard-coded
    feature file from the experimenter's Dropbox directory, averages each
    feature column with numpy, and prints ``name<TAB>mean`` per feature.

    NOTE(review): the feature-file path is hard-coded to one machine's
    Dropbox layout — this is a one-off analysis helper, not library code.
    """
    config_path = os.getcwd() + "/" + "config" + "/" + "wmt.cfg"
    config = ConfigParser()
    # with-block ensures the config file handle is closed (the original
    # readfp(open(...)) leaked it).
    with open(config_path) as config_file:
        config.readfp(config_file)

    my_dir = os.path.expanduser("~/Dropbox/experiments_fluency/test_learn_to_rank")
    feature_file = my_dir + "/" + "x_newstest2015.cobalt_comb_min_fluency_features_all.cs-en.tsv"
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config)
    feature_values = read_features_file(feature_file, "\t")
    averages = np.mean(feature_values, axis=0)

    # One column per feature name; the previously commented-out 'both'
    # duplication branch was dead code (both branches appended the same name),
    # so the names align 1:1 with the averaged columns — TODO confirm if the
    # 'both' combination strategy ever produces two columns per feature.
    for name, avg in zip(feature_names, averages):
        print(name + "\t" + str(avg))