def get_data(self):
    """Prepare the WMT data, load human rankings and extract features.

    Steps, in order:
      1. Build both WMT data structures and print the plain data set.
      2. If 'Parse' is among the configured processors, also print the
         parse-typed data set.
      3. Load human judgements, limited to the configured maximum number
         of comparisons.
      4. Run the processors and extract the configured features.

    Returns:
        A 3-tuple ``(data_structure2, human_rankings, feature_values)``
        where ``feature_values`` is ``extractor.vals`` after extraction.
    """
    cfg = self.config

    # Plain data set: build both structures first, then print the first one.
    wmt = PrepareWmt()
    plain_structure = wmt.get_data_structure(cfg)
    sentence_structure = wmt.get_data_structure2(cfg)
    wmt.print_data_set(cfg, plain_structure)

    # The parse-typed data set is only needed when the 'Parse' processor runs.
    configured_processors = loads(cfg.get("Resources", "processors"))
    if 'Parse' in configured_processors:
        wmt_parse = PrepareWmt(data_type='parse')
        parse_structure = wmt_parse.get_data_structure(cfg)
        wmt_parse.print_data_set(cfg, parse_structure)

    # Human judgements.
    judgements_path = cfg.get('WMT', 'human_ranking')
    comparison_limit = int(cfg.get('WMT', 'maximum_comparisons'))
    rankings = HumanRanking()
    rankings.add_human_data(judgements_path, cfg, max_comparisons=comparison_limit)

    # Processed target/reference sentences feed the feature extractor.
    target_sentences, reference_sentences = Process(cfg).run_processors()

    feature_extractor = FeatureExtractor(cfg)
    feature_names = FeatureExtractor.read_feature_names(cfg)
    feature_extractor.extract_features(feature_names, target_sentences, reference_sentences)

    return sentence_structure, rankings, feature_extractor.vals
def prepare_feature_files(self):
    """Extract features and write them, with metadata, to TSV files.

    The WMT data sets are prepared and printed (including the parse-typed
    variant when 'Parse' is among the configured processors), the
    processors are run, and the configured features are extracted.

    Files written under the expanded ``[WMT] output_dir``:
      * ``x_<dataset>.<feature_set>.all.tsv``    - feature rows for all
        matching entries,
      * ``meta_<dataset>.<feature_set>.all.tsv`` - the corresponding
        ``data_structure2`` entries,
      * ``x_<dataset>.<feature_set>.<lp>.tsv``   - feature rows for one
        (dataset, language pair) combination.

    Every file is opened in a ``with`` block so that all handles, including
    the metadata file, are closed even if a write fails.

    Returns:
        None.
    """
    process_wmt = PrepareWmt()
    data_structure1 = process_wmt.get_data_structure(self.config)
    data_structure2 = process_wmt.get_data_structure2(self.config)
    process_wmt.print_data_set(self.config, data_structure1)

    if 'Parse' in loads(self.config.get("Resources", "processors")):
        process_wmt_parse = PrepareWmt(data_type='parse')
        data_structure_parse = process_wmt_parse.get_data_structure(self.config)
        process_wmt_parse.print_data_set(self.config, data_structure_parse)

    process = Process(self.config)
    sents_tgt, sents_ref = process.run_processors()

    extractor = FeatureExtractor(self.config)
    features_to_extract = FeatureExtractor.read_feature_names(self.config)
    extractor.extract_features(features_to_extract, sents_tgt, sents_ref)
    feature_values = extractor.vals

    # The first two fields of each entry identify (dataset, language pair).
    datasets_language_pairs = {(x[0], x[1]) for x in data_structure2}

    dataset_for_all = self.config.get('WMT', 'dataset')
    feature_set_name = os.path.basename(
        self.config.get('Features', 'feature_set')).replace(".txt", "")
    output_dir = os.path.expanduser(self.config.get('WMT', 'output_dir'))

    def output_path(prefix, dataset_name, suffix):
        # Plain '/' concatenation is kept so the produced paths are exactly
        # the ones existing consumers of these files expect.
        return (output_dir + '/' + prefix + dataset_name + '.' +
                feature_set_name + '.' + suffix + '.tsv')

    with open(output_path('x_', dataset_for_all, 'all'), 'w') as f_features_all:
        with open(output_path('meta_', dataset_for_all, 'all'), 'w') as f_meta_data_all:
            for dataset, lp in sorted(datasets_language_pairs):
                with open(output_path('x_', dataset, lp), 'w') as f_features:
                    for i, sentence_data in enumerate(data_structure2):
                        # Membership test (not positional comparison) is the
                        # original selection rule and is kept as is.
                        if dataset in sentence_data and lp in sentence_data:
                            feature_row = '\t'.join([str(x) for x in feature_values[i]]) + "\n"
                            f_features_all.write(feature_row)
                            f_meta_data_all.write('\t'.join([str(x) for x in sentence_data]) + "\n")
                            f_features.write(feature_row)
# NOTE(review): yaml.load() without an explicit Loader can construct arbitrary
# objects from untrusted input, and PyYAML 6 requires the Loader argument.
# Consider yaml.safe_load() if the learning config is plain YAML - confirm
# before changing.
config_learning = yaml.load(cfg_file.read())

# Prepare feature files.
# This needs to be done for both training and testing data, changing the
# names of the datasets in the configuration file.
prepare_wmt = PrepareWmt()
ranking_task = RankingTask(config_path)
ranking_task.prepare_feature_files()

# Create training set for learn to rank.
# Comment out the prepare_feature_files() call above for this step
# (presumably once the feature files have already been written).
dataset_for_all = config.get('WMT', 'dataset')
feature_set_name = os.path.basename(config.get('Features', 'feature_set')).replace(".txt", "")
data_structure2 = prepare_wmt.get_data_structure2(config)

f_judgements = config.get('WMT', 'human_ranking')
human_rankings = HumanRanking()
human_rankings.add_human_data(f_judgements, config)

# Read back the combined ("all") feature file for this dataset and feature set.
features_path = (os.path.expanduser(config.get('WMT', 'output_dir')) + '/' + 'x_' +
                 dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv')
feature_values = read_features_file(features_path, "\t")

ranking_task.training_set_for_learn_to_rank(data_structure2, human_rankings, feature_values)
ranking_task.train_save(config_learning, config)

# Run the trained model on the test feature file and produce the output in
# WMT format.
predictions = ranking_task.test_learn_to_rank_coefficients(config_learning, config)
data_structure = prepare_wmt.get_data_structure(config)
prepare_wmt.wmt_format(config, feature_set_name, dataset_for_all, predictions, data_structure)