def training_set_for_learn_to_rank_from_feature_file(config_learning, config):
    """Build a pairwise learn-to-rank training set from pre-extracted features.

    Reads the feature file and the human-ranking reference file named in
    `config_learning`, and for each judged pair emits a single difference
    vector (winner minus loser features) with a binary label:
    label 2 in the input means "first system wins" -> output label 1,
    anything else -> output label 0 with the subtraction reversed.

    Writes `x_<dataset>.learn_rank.tsv` (feature differences) and
    `y_<dataset>.learn_rank.tsv` (binary labels) into the configured
    output directory.

    :param config_learning: dict-like learning config with 'x_train'/'y_train' paths
    :param config: ConfigParser with [WMT] 'dataset' and 'output_dir' options
    """
    data_set_name = config.get('WMT', 'dataset')
    feature_values = read_features_file(config_learning.get("x_train", None), '\t')
    human_rankings = read_reference_file(config_learning.get("y_train", None), '\t')
    new_features = []
    new_labels = []
    # Hoist the expanded output directory instead of expanding it twice.
    output_dir = os.path.expanduser(config.get('WMT', 'output_dir'))
    path_features = output_dir + '/' + 'x_' + data_set_name + ".learn_rank.tsv"
    path_objective = output_dir + '/' + 'y_' + data_set_name + ".learn_rank.tsv"
    for i, label in enumerate(human_rankings):
        # split_list separates the concatenated feature row into the two
        # systems' feature vectors; compute it once per instance.
        first_half, second_half = RankingTask.split_list(feature_values[i])[0], \
            RankingTask.split_list(feature_values[i])[1]
        if label == 2:
            features = np.subtract(first_half, second_half)
            new_label = 1
        else:
            features = np.subtract(second_half, first_half)
            new_label = 0
        new_features.append(features)
        new_labels.append(new_label)
    # BUG FIX: the original wrote the feature vectors to the "y_" path and
    # the labels to the "x_" path.  Features belong in path_features ("x_")
    # and labels in path_objective ("y_").
    write_feature_file(path_features, new_features)
    write_reference_file(path_objective, new_labels)
def round_robin(self, config_path_learning, config_path, feature_set, lps):
    """Leave-one-language-pair-out (round-robin) evaluation.

    For each language pair in `lps`: concatenates the per-pair feature and
    reference files of all *other* pairs into temporary train files, copies
    the held-out pair's files as test files, trains and predicts via
    ScoringTask, evaluates the correlation against the gold standard, and
    appends one result line per fold to results.txt.  The temporary
    train/test files are removed after each fold.

    :param config_path_learning: path to the YAML learning configuration
    :param config_path: path to the INI experiment configuration
    :param feature_set: feature-set name embedded in the per-pair file names
    :param lps: iterable of language-pair identifiers
    """
    config = ConfigParser()
    # Close the config file handle instead of leaking it.
    with open(config_path) as cfg:
        config.readfp(cfg)
    with open(config_path_learning, "r") as cfg_file:
        # NOTE(review): yaml.load without an explicit Loader executes
        # arbitrary tags; prefer yaml.safe_load if the config uses none.
        config_learning = yaml.load(cfg_file.read())

    output_dir = os.path.expanduser(self.config.get('Data', 'output_dir'))
    dataset = self.config.get("Settings", "dataset")

    def data_path(prefix, *suffix_parts):
        # Builds e.g. <output_dir>/x_<dataset>.<feature_set>.<lp>.tsv,
        # replacing the six near-identical inline concatenations.
        return output_dir + "/" + prefix + "_" + ".".join((dataset,) + suffix_parts + ("tsv",))

    # BUG FIX: the original opened results.txt and never closed it;
    # 'with' guarantees the handle is released even if a fold fails.
    with open("results.txt", "w") as f_results:
        for test_lp in sorted(lps):
            x_train = data_path("x", "train")
            y_train = data_path("y", "train")
            x_test = data_path("x", "test")
            y_test = data_path("y", "test")
            train_lps = ScoringTask.get_train_lps(lps, test_lp)
            train_feature_values = []
            train_reference_values = []
            test_feature_values = read_features_file(data_path("x", feature_set, test_lp), "\t")
            test_reference_values = read_reference_file(data_path("y", feature_set, test_lp), "\t")
            for train_lp in sorted(train_lps):
                feature_values = read_features_file(data_path("x", feature_set, train_lp), "\t")
                reference_values = read_reference_file(data_path("y", feature_set, train_lp), "\t")
                train_feature_values += list(feature_values)
                train_reference_values += list(reference_values)
            write_feature_file(x_train, train_feature_values)
            write_reference_file(y_train, train_reference_values)
            write_feature_file(x_test, test_feature_values)
            write_reference_file(y_test, test_reference_values)
            gold_standard = test_reference_values
            predictions = ScoringTask.train_predict(config_path_learning)
            correlation = ScoringTask.evaluate_predicted(predictions, gold_standard)
            f_results.write(test_lp + " " + str(correlation) + " with " + feature_set + "\n")
            # Clean up the temporary fold files.
            os.remove(x_train)
            os.remove(x_test)
            os.remove(y_train)
            os.remove(y_test)
def learn_to_rank(feature_values, human_comparisons, path_x, path_y):
    """Write a pairwise learn-to-rank dataset to path_x / path_y.

    Every non-tied human comparison contributes two mirrored training
    instances: (winner, loser) features labelled 1 and (loser, winner)
    features labelled 0.  Tied comparisons (sign '=') are skipped.

    :param feature_values: indexable collection of per-sentence feature rows
    :param human_comparisons: mapping of (dataset, lp) -> list of comparisons
    :param path_x: output path for the feature instances
    :param path_y: output path for the binary labels
    """
    instances = []
    targets = []
    for group_key in sorted(human_comparisons.keys()):
        for judgement in human_comparisons[group_key]:
            # Ties carry no ranking signal for a pairwise classifier.
            if judgement.sign == '=':
                continue
            winner, loser = find_winner_loser_index(judgement)
            instances.extend([
                make_instance(feature_values[winner], feature_values[loser]),
                make_instance(feature_values[loser], feature_values[winner]),
            ])
            targets.extend([1, 0])
    write_feature_file(path_x, instances)
    write_reference_file(path_y, targets)
def clean_dataset(config_learning, human_comparisons):
    """Write de-duplicated copies of the learn-to-rank training files.

    Reads the x/y training files named in `config_learning`, drops
    instances whose duplicated human judgements were contradictory, and
    rewrites surviving instances (with possibly-corrected labels) to
    '<x_train>.clean' / '<y_train>.clean'.

    :param config_learning: dict-like config with 'x_train'/'y_train' paths
    :param human_comparisons: mapping of (dataset, lang_pair) -> comparisons
    """
    feature_values = read_features_file(config_learning.get('x_train'), '\t')
    labels = read_reference_file(config_learning.get('y_train'), '\t')
    new_feature_values = []
    new_labels = []
    # Remove tied judgements up front, then deduplicate what remains.
    human_comparisons = RankingTask.eliminate_ties(human_comparisons)
    comparisons_untied_phrases = defaultdict(list)
    comparisons_untied_signs = defaultdict(list)
    deduplicated_phrases, deduplicated_signs = HumanRanking.deduplicate(human_comparisons)
    # First pass: collect the untied comparisons as (phrase, sys1, sys2)
    # triples and their signs, in parallel per-group lists.
    for dataset, lang_pair in sorted(human_comparisons.keys()):
        for comparison in human_comparisons[dataset, lang_pair]:
            if comparison.sign == "=":
                continue
            else:
                comparisons_untied_phrases[dataset, lang_pair].append([comparison.phrase, comparison.sys1, comparison.sys2])
                comparisons_untied_signs[dataset, lang_pair].append(comparison.sign)
    # Second pass: keep each training instance unless its deduplicated
    # sign is None (contradictory duplicates); otherwise take the
    # deduplicated sign as the (possibly corrected) label.
    for dataset, lang_pair in sorted(human_comparisons.keys()):
        for i, comparison in enumerate(comparisons_untied_phrases[dataset, lang_pair]):
            # NOTE(review): `i` restarts at 0 for every (dataset, lang_pair)
            # group yet indexes the flat feature_values/labels lists — this
            # only lines up if there is a single group (or per-group files);
            # verify against the callers.
            features = feature_values[i]
            label = labels[i]
            if comparison in deduplicated_phrases[dataset, lang_pair]:
                # Presumably a None sign marks duplicates that disagreed;
                # such instances are dropped entirely — confirm in
                # HumanRanking.deduplicate.
                if deduplicated_signs[dataset, lang_pair][deduplicated_phrases[dataset, lang_pair].index(comparison)] is None:
                    continue
                label = RankingTask.signs_to_labels(deduplicated_signs[dataset, lang_pair][deduplicated_phrases[dataset, lang_pair].index(comparison)])
            new_feature_values.append(features)
            new_labels.append(label)
    write_feature_file(config_learning.get('x_train') + "." + "clean", new_feature_values)
    write_reference_file(config_learning.get('y_train') + "." + "clean", new_labels)
def feature_extraction(config_features_path):
    """Extract features for the WMT dataset and write all output files.

    Runs the configured text processors over target/reference sentences,
    extracts the configured features, then writes:
      * one combined feature file x_<dataset>.tsv,
      * one feature file per (dataset, language pair) x_<dataset>_<lp>.tsv,
      * pairwise learn-to-rank files via learn_to_rank().

    :param config_features_path: path to the INI feature-extraction config
    """
    config = ConfigParser()
    config.readfp(open(config_features_path))
    wd = config.get('WMT', 'working_directory')
    if not os.path.exists(wd):
        os.mkdir(wd)
    data = RankingData(config)
    data.read_dataset()
    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()
    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(feature_names, sentences_tgt, sentences_ref)
    write_feature_file(wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values)
    # Split the feature rows into one file per (dataset, language pair);
    # data.plain is assumed grouped by (dataset, lp) — rows are streamed in
    # order and a new file is started whenever the key changes.
    my_dataset = data.plain[0].dataset
    my_lp = data.plain[0].lp
    f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
    f_file = open(f_path, 'w')
    for i, instance in enumerate(data.plain):
        if instance.dataset != my_dataset or instance.lp != my_lp:
            # Key changed: finish the previous file and start the next one.
            f_file.close()
            my_dataset = instance.dataset
            my_lp = instance.lp
            f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
            f_file = open(f_path, 'w')
        # BUG FIX: the original only wrote rows in the "same key" branch, so
        # the row that triggered each file switch was silently dropped —
        # every per-lp file after the first was missing its first line.
        f_file.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
    # BUG FIX: the last per-lp file was never closed.
    f_file.close()
    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)
    learn_to_rank(feature_values, human_rankings,
                  wd + '/' + 'x_learn_to_rank.tsv',
                  wd + '/' + 'y_learn_to_rank.tsv')