def load_get_coefficients(config_learning, config_data):
    """Load a trained linear model and dump its coefficients, one per feature column."""
    output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "coefficients.txt", "w")
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
    combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)
    learning_config = config_learning.get("learning", None)
    method_name = learning_config.get("method", None)
    estimator = joblib.load(os.path.expanduser(config_data.get("Learner", "models")) + "/" + method_name + ".pkl")
    coefficients = estimator.coef_

    # Features combined with method 'both' occupy two columns, so list their name twice.
    feature_list = []
    for i, feature_name in enumerate(feature_names):
        if combination_methods[i] == 'both':
            feature_list.append(feature_name)
            feature_list.append(feature_name)
        else:
            feature_list.append(feature_name)

    for i, name in enumerate(feature_list):
        output.write(name + "\t" + str(coefficients[0][i]) + "\n")
    output.close()
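The function above assumes a linear estimator persisted with joblib whose coef_ attribute is two-dimensional. A minimal sketch of producing such a pickle, with illustrative paths and synthetic data rather than the project's actual training step:

# Sketch: persisting a linear model so that a coefficient dump like the one
# above can read coef_. Paths and data are illustrative only.
import os
import numpy as np
import joblib  # older scikit-learn bundled this as sklearn.externals.joblib
from sklearn.linear_model import LogisticRegression

X = np.random.rand(100, 5)        # 100 instances, 5 feature columns
y = np.random.randint(0, 2, 100)  # binary labels

clf = LogisticRegression()
clf.fit(X, y)                     # clf.coef_ has shape (1, 5)

os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/logistic_regression.pkl")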
def get_data(self):
    process_wmt = PrepareWmt()
    data_structure1 = process_wmt.get_data_structure(self.config)
    data_structure2 = process_wmt.get_data_structure2(self.config)
    process_wmt.print_data_set(self.config, data_structure1)

    if 'Parse' in loads(self.config.get("Resources", "processors")):
        process_wmt_parse = PrepareWmt(data_type='parse')
        data_structure_parse = process_wmt_parse.get_data_structure(self.config)
        process_wmt_parse.print_data_set(self.config, data_structure_parse)

    f_judgements = self.config.get('WMT', 'human_ranking')
    maximum_comparisons = int(self.config.get('WMT', 'maximum_comparisons'))
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, self.config, max_comparisons=maximum_comparisons)

    process = Process(self.config)
    sents_tgt, sents_ref = process.run_processors()

    extractor = FeatureExtractor(self.config)
    features_to_extract = FeatureExtractor.read_feature_names(self.config)
    extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

    return data_structure2, human_rankings, extractor.vals
def get_data(self):
    human_scores = read_reference_file(os.path.expanduser(self.config.get('Data', 'human_scores')), '\t')
    process = Process(self.config)
    sents_tgt, sents_ref = process.run_processors()
    extractor = FeatureExtractor(self.config)
    features_to_extract = FeatureExtractor.read_feature_names(self.config)
    extractor.extract_features(features_to_extract, sents_tgt, sents_ref)
    return extractor.vals, human_scores
def recursive_feature_elimination(config_learning, config_data, number_features):
    output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)

    x_train = read_features_file(config_learning.get('x_train'), '\t')
    y_train = read_reference_file(config_learning.get('y_train'), '\t')
    x_test = read_features_file(config_learning.get('x_test'), '\t')
    estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

    scale = config_learning.get("scale", True)
    if scale:
        x_train, x_test = scale_datasets(x_train, x_test)

    rfe = RFE(estimator, number_features, step=1)
    rfe.fit(x_train, y_train)

    # Write and echo the elimination rank of every feature (rank 1 = selected).
    for i, name in enumerate(feature_names):
        output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
        print(name + "\t" + str(rfe.ranking_[i]))

    predictions = rfe.predict(x_test)
    output.close()
    return predictions
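For reference, a self-contained sketch of scikit-learn's RFE on synthetic data; ranking_ assigns 1 to every selected feature and higher ranks to features eliminated earlier. The feature names here are illustrative, not the project's:

# Self-contained RFE sketch on synthetic data (illustrative only).
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X = np.random.rand(50, 8)
y = np.random.rand(50)

# Newer scikit-learn requires the keyword form n_features_to_select=...;
# older versions also accepted it positionally, as in the function above.
rfe = RFE(SVR(kernel="linear"), n_features_to_select=3, step=1)
rfe.fit(X, y)

# ranking_[i] == 1 means feature i survived elimination; larger is worse.
for i, rank in enumerate(rfe.ranking_):
    print("feature_%d\t%d" % (i, rank))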
def main(args):
    feature_extractor = FeatureExtractor(get_resources(), args.model)
    train_feats, train_labels, _, _ = feature_extractor.get_features(
        args.train, args.train_topk)
    test_feats, _, test_segs, test_gold_truths = feature_extractor.get_features(
        args.test, args.test_topk)
    epochs, lr1, lr2 = 100, 0.01, 0.05

    # Initialize model
    model = None
    if args.model == "mse":
        model = mse_ranker.MSERanker(epochs, lr1)
    elif args.model == "mr":
        model = mr_ranker.MRRanker(epochs, lr1)
    elif args.model == "mse_multi":
        model = mse_multi_ranker.MSEMultiRanker(epochs, lr1, lr2)
    elif args.model == "mr_multi":
        model = mr_multi_ranker.MRMultiRanker(epochs, lr1, lr2)

    # Train model
    model.train(train_feats, train_labels)

    # Rerank top-k segmentations
    top_segmentations = []
    for segs_feats, segs, gds in zip(test_feats, test_segs, test_gold_truths):
        if len(segs) == 1:
            # append, not extend: every entry must stay a list of segmentations,
            # otherwise the output and evaluation code below indexes into a bare string
            top_segmentations.append(segs)
        else:
            reranked_segs = rerank(segs, segs_feats, model, args.model)
            top_segmentations.append(reranked_segs)

    if args.output is not None:
        fp = open(args.output, 'w')
        for segs in top_segmentations:
            target = "".join(segs[0].split())
            fp.write(target + "\t" + "\t".join([seg.strip() for seg in segs]) + "\n")
        fp.close()

    # Evaluate metrics
    print("MRR:", mean_reciprocal_rank(test_gold_truths, top_segmentations))
    print("Accuracy@1:", accuracy(1, test_gold_truths, top_segmentations))
    print("Accuracy@2:", accuracy(2, test_gold_truths, top_segmentations))
    print("Fscore@1:", fscore(1, test_gold_truths, top_segmentations))
    print("Fscore@2:", fscore(2, test_gold_truths, top_segmentations))
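mean_reciprocal_rank is a project helper not shown in this section. As an assumption about what it computes, a typical implementation over gold truths and ranked candidate lists would look like the hypothetical sketch below (the name and the one-gold-per-list assumption are mine, not the project's):

# Hypothetical MRR sketch; the project's mean_reciprocal_rank may differ.
def mean_reciprocal_rank_sketch(gold_truths, ranked_lists):
    total = 0.0
    for gold, ranked in zip(gold_truths, ranked_lists):
        # reciprocal of the 1-based rank of the gold answer, 0 if absent
        rr = 0.0
        for rank, candidate in enumerate(ranked, start=1):
            if candidate == gold:
                rr = 1.0 / rank
                break
        total += rr
    return total / len(gold_truths)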
def get_writing_features(image_path):
    # Pre-processing
    gray_img = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
    gray_img, bin_img = PreProcessor.process(gray_img)
    gray_lines, bin_lines = LineSegmentor(gray_img, bin_img).segment()

    # Feature extraction
    return FeatureExtractor(gray_lines, bin_lines).extract()
def prepare_feature_files(self):
    process_wmt = PrepareWmt()
    data_structure1 = process_wmt.get_data_structure(self.config)
    data_structure2 = process_wmt.get_data_structure2(self.config)
    process_wmt.print_data_set(self.config, data_structure1)

    if 'Parse' in loads(self.config.get("Resources", "processors")):
        process_wmt_parse = PrepareWmt(data_type='parse')
        data_structure_parse = process_wmt_parse.get_data_structure(self.config)
        process_wmt_parse.print_data_set(self.config, data_structure_parse)

    process = Process(self.config)
    sents_tgt, sents_ref = process.run_processors()

    extractor = FeatureExtractor(self.config)
    features_to_extract = FeatureExtractor.read_feature_names(self.config)
    extractor.extract_features(features_to_extract, sents_tgt, sents_ref)
    feature_values = extractor.vals

    datasets_language_pairs = set((x[0], x[1]) for x in data_structure2)
    dataset_for_all = self.config.get('WMT', 'dataset')
    feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")

    f_features_all = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', 'w')
    f_meta_data_all = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'meta_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', 'w')

    # One feature file per (dataset, language pair), plus the aggregate files above.
    for dataset, lp in sorted(datasets_language_pairs):
        f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset + '.' + feature_set_name + '.' + lp + '.tsv', 'w')
        for i, sentence_data in enumerate(data_structure2):
            if dataset in sentence_data and lp in sentence_data:
                f_features_all.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
                f_meta_data_all.write('\t'.join([str(x) for x in sentence_data]) + "\n")
                f_features.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
        f_features.close()

    f_features_all.close()
    f_meta_data_all.close()
def feature_extraction(config_features_path):
    config = ConfigParser()
    config.readfp(open(config_features_path))

    wd = config.get('WMT', 'working_directory')
    if not os.path.exists(wd):
        os.mkdir(wd)

    data = RankingData(config)
    data.read_dataset()

    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()
    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(feature_names, sentences_tgt, sentences_ref)
    write_feature_file(wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values)

    # Split feature values into one file per (dataset, language pair) group.
    my_dataset = data.plain[0].dataset
    my_lp = data.plain[0].lp
    f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
    f_file = open(f_path, 'w')

    for i, instance in enumerate(data.plain):
        if not (instance.dataset == my_dataset and instance.lp == my_lp):
            # Group boundary: close the current file and start the next one.
            f_file.close()
            my_dataset = instance.dataset
            my_lp = instance.lp
            f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
            f_file = open(f_path, 'w')
        # Write unconditionally so the row that opens a new group is not dropped.
        f_file.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
    f_file.close()

    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)

    learn_to_rank(feature_values, human_rankings, wd + '/' + 'x_learn_to_rank.tsv', wd + '/' + 'y_learn_to_rank.tsv')
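read_features_file, read_reference_file and write_feature_file are project helpers that are not shown in this section. A plausible minimal implementation, assuming plain TSV files of numbers (the bodies below are my assumption, kept under the same names for illustration):

# Hypothetical sketches of the TSV helpers used throughout this section; the
# project's own implementations may differ in dtype handling and validation.
import numpy as np

def read_features_file(path, sep):
    # one instance per line, feature values separated by `sep`
    return np.loadtxt(path, delimiter=sep, ndmin=2)

def read_reference_file(path, sep):
    # one reference value (score or label) per line
    return np.loadtxt(path, delimiter=sep)

def write_feature_file(path, feature_values):
    with open(path, 'w') as f:
        for row in feature_values:
            f.write('\t'.join(str(x) for x in row) + '\n')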
def get_features(path):
    # Read and pre-process the image
    gray_img = cv.imread(path, cv.IMREAD_GRAYSCALE)
    gray_img, bin_img = PreProcessor.process(gray_img)
    gray_lines, bin_lines = LineSegmentor(gray_img, bin_img).segment()

    # Extract features of every line separately
    x = []
    for g, b in zip(gray_lines, bin_lines):
        f = FeatureExtractor([g], [b]).extract()
        x.append(f)

    # Return list of features for every line in the image
    return x
def recursive_feature_elimination_cv(config_learning, config_data):
    output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
    combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

    x_train = read_features_file(config_learning.get('x_train'), '\t')
    y_train = read_reference_file(config_learning.get('y_train'), '\t')
    x_test = read_features_file(config_learning.get('x_test'), '\t')
    estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

    scale = config_learning.get("scale", True)
    if scale:
        x_train, x_test = scale_datasets(x_train, x_test)

    # StratifiedKFold(y_train, 2) is the pre-0.18 scikit-learn signature;
    # current versions take StratifiedKFold(n_splits=2) and split during fit.
    rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(y_train, 2), scoring='accuracy')
    rfecv.fit(x_train, y_train)

    # Features combined with method 'both' occupy two columns, so list their name twice.
    feature_list = []
    for i, feature_name in enumerate(feature_names):
        if combination_methods[i] == 'both':
            feature_list.append(feature_name)
            feature_list.append(feature_name)
        else:
            feature_list.append(feature_name)

    for i, name in enumerate(feature_list):
        output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")
    output.close()

    predictions = rfecv.predict(x_test)
    return predictions
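For comparison, the same cross-validated elimination with the current scikit-learn API, on synthetic data (illustrative only; the estimator choice is mine):

# RFECV with the modern scikit-learn API, on synthetic data.
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(60, 8)
y = np.random.randint(0, 2, 60)

rfecv = RFECV(estimator=LogisticRegression(), step=1,
              cv=StratifiedKFold(n_splits=2), scoring='accuracy')
rfecv.fit(X, y)
print(rfecv.ranking_)  # rank 1 marks columns kept by cross-validated elimination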
def average_feature_values():
    config_path = os.getcwd() + "/" + "config" + "/" + "wmt.cfg"
    config = ConfigParser()
    config.readfp(open(config_path))

    my_dir = os.path.expanduser("~/Dropbox/experiments_fluency/test_learn_to_rank")
    feature_file = my_dir + "/" + "x_newstest2015.cobalt_comb_min_fluency_features_all.cs-en.tsv"
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config)
    strategies = FeatureExtractor.get_combinations_from_config_file(config)

    feature_values = read_features_file(feature_file, "\t")
    averages = np.mean(feature_values, axis=0)

    feature_list = []
    for i, feature_name in enumerate(feature_names):
        # if strategies[i] == 'both':
        #     feature_list.append(feature_name)
        #     feature_list.append(feature_name)
        # else:
        feature_list.append(feature_name)

    for i, name in enumerate(feature_list):
        print(name + "\t" + str(averages[i]))
def test_feature_sets():
    cfg = ConfigParser()
    cfg.readfp(open(os.getcwd() + '/config/system.cfg'))

    group_name = FE.get_features_group_name(cfg)
    features_to_test = FE.read_feature_names(cfg)

    if os.path.exists(cfg.get('Data', 'output') + '/' + group_name + '.' + 'summary'):
        print("Path exists!")
        return

    output_file = open(cfg.get('Data', 'output') + '/' + group_name + '.' + 'summary', 'w')

    # Baseline: correlation of the full feature set.
    name0 = group_name + '_' + 'all'
    corr0 = corr_feature_set(features_to_test, name0)
    output_file.write(name0 + '\t' + str(corr0) + '\n')

    # Ablations: each feature alone, then the set with that feature removed.
    for feat in features_to_test:
        name1 = group_name + '_' + feat + '_' + 'only'
        corr1 = corr_feature_set(feat, name1)
        output_file.write(name1 + '\t' + str(corr1) + '\n')

        name2 = group_name + '_' + feat + '_' + 'excluded'
        excluding = []
        for ffeat in features_to_test:
            if ffeat == feat:
                continue
            excluding.append(ffeat)
        corr2 = corr_feature_set(excluding, name2)
        output_file.write(name2 + '\t' + str(corr2) + '\n')

    output_file.close()
def extract_features(txt_io, feat_io, cword_io, train=False, factor_files={}):
    csets = CSetPair(config['source-cset'], config['target-cset'])
    extractor = FeatureExtractor(csets, config['features'], config['costs'])
    check_factor_requirements(extractor.required_factors(), factor_files)

    finder = CWordFinder(csets, train)
    if config['nulls-ngrams']:
        null_finder = NullFinder(csets.src, config['nulls-ngrams'])
        finder.add_extra_finder(null_finder)
    reader = CWordReader(cword_io)

    log.info("Extract features from {}".format(txt_io.name))
    count = 0
    for sid, line, fact_sent in each_factorized_input(txt_io, factor_files):
        for cword in finder.find_confusion_words(line, fact_sent):
            feat_str = extractor.extract_features(cword, fact_sent)
            feat_io.write(feat_str)
            reader.format(sid, cword)
            count += 1
    log.info("Found {} confusion words".format(count))
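The function takes open file objects rather than paths, and reads its settings from a module-level config dict. A hedged usage sketch, assuming that config is already populated; the file names are placeholders, not the project's defaults:

# Illustrative call; file names are placeholders.
with open("input.txt") as txt_io, \
     open("features.txt", "w") as feat_io, \
     open("cwords.txt", "w") as cword_io:
    extract_features(txt_io, feat_io, cword_io, train=True)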
def test_set_for_rank_to_scores(self, data_structure, feature_values, config_path_learning):
    sentences_systems = defaultdict(list)
    combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
    data_set_name = self.config.get('WMT', 'dataset')
    f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + data_set_name, 'w')
    meta_data = defaultdict(list)

    # Group system names by (dataset, language pair, sentence).
    for data_set, lang_pair, system_name, phrase_number in data_structure:
        sentences_systems[data_set, lang_pair, phrase_number].append(system_name)

    # Write one combined feature vector per pair of competing systems.
    for data_set, lang_pair, phrase_number in sorted(sentences_systems.keys()):
        system_pairs = list(combinations(sentences_systems[data_set, lang_pair, phrase_number], 2))
        for sys1, sys2 in sorted(system_pairs):
            idx_sys1, idx_sys2 = self.get_sentence_idx(data_set, lang_pair, data_structure, phrase_number, sys1, sys2)
            combined_features = []
            for i in range(len(feature_values[0])):
                combined_feature = self.combine_feature_values(combination_methods[i], feature_values[idx_sys1][i], feature_values[idx_sys2][i])
                combined_features.append(combined_feature)
            f_features.write('\t'.join([str(val) for val in combined_features]) + '\n')
            meta_data[data_set, lang_pair, phrase_number].append([sys1, sys2])
    f_features.close()

    # Attach one confidence score to each system pair, in writing order.
    results = defaultdict(list)
    confidence_scores = self.get_confidence_scores(config_path_learning)
    count = 0
    for data_set, lang_pair, phrase_number in sorted(meta_data.keys()):
        for sys1, sys2 in sorted(meta_data[data_set, lang_pair, phrase_number]):
            results[data_set, lang_pair, phrase_number].append([sys1, sys2, confidence_scores[count]])
            count += 1
    return results
def get_writer_features(path, writer_id):
    # All lines of the writer
    total_gray_lines, total_bin_lines = [], []

    # Read and append all lines of the writer
    for root, dirs, files in os.walk(path):
        for filename in files:
            gray_img = cv.imread(os.path.join(root, filename), cv.IMREAD_GRAYSCALE)
            gray_img, bin_img = PreProcessor.process(gray_img)
            gray_lines, bin_lines = LineSegmentor(gray_img, bin_img).segment()
            total_gray_lines.extend(gray_lines)
            total_bin_lines.extend(bin_lines)
        break  # only the top level of `path`; skip sub-directories

    # Extract features of every line separately
    x, y = [], []
    for g, b in zip(total_gray_lines, total_bin_lines):
        f = FeatureExtractor([g], [b]).extract()
        x.append(f)
        y.append(writer_id)

    return x, y
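A hedged sketch of how per-writer features like these could feed a classifier: the directory layout (one sub-folder per writer) and the use of an SVM are assumptions for illustration, not the project's actual pipeline.

# Assumed layout: data/writers/<writer_id>/*.png, one folder per writer.
import os
from sklearn.svm import SVC

def build_training_set(writers_root):
    X, Y = [], []
    for writer_id in os.listdir(writers_root):
        x, y = get_writer_features(os.path.join(writers_root, writer_id), writer_id)
        X.extend(x)
        Y.extend(y)
    return X, Y

X, Y = build_training_set("data/writers/")
clf = SVC(gamma='scale')
clf.fit(X, Y)  # one prediction per line; aggregate per image, e.g. by majority vote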
def training_set_for_rank_direct(self, data_structure, human_rankings, feature_values, ignore_ties=True):
    combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
    data_set_name = self.config.get('WMT', 'dataset')
    feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")

    for dataset, lang_pair in sorted(human_rankings.keys()):
        f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv', 'w')
        f_objective = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'y_' + data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv', 'w')
        f_meta_data = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'meta_' + data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv', 'w')

        for human_comparison in human_rankings[dataset, lang_pair]:
            label = self.signs_to_labels(human_comparison.sign, ignore_ties=ignore_ties)
            if label is None:
                continue
            f_objective.write(label + '\n')

            seg_id = human_comparison.phrase
            sys1 = human_comparison.sys1
            sys2 = human_comparison.sys2
            idx_sys1, idx_sys2 = self.get_sentence_idx(dataset, lang_pair, data_structure, seg_id, sys1, sys2)
            f_meta_data.write(str(idx_sys1) + '\t' + str(idx_sys2) + '\n')

            # Combine the two systems' feature vectors into one training instance.
            combined_features = []
            for i in range(len(feature_values[0])):
                combined_feature = self.combine_feature_values(combination_methods[i], feature_values[idx_sys1][i], feature_values[idx_sys2][i])
                combined_features.append(combined_feature)
            f_features.write('\t'.join([str(val) for val in combined_features]) + '\n')

        f_features.close()
        f_objective.close()
        f_meta_data.close()
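combine_feature_values itself is not shown in this section. A hypothetical sketch consistent with the rest of the code, where a 'both' combination yields two output columns (which is why the coefficient and ranking dumps above list 'both' feature names twice); the method names and bodies here are my assumptions:

# Hypothetical sketch of combine_feature_values; the project's version may differ.
def combine_feature_values_sketch(method, val1, val2):
    if method == 'both':
        # keep both systems' values: two columns in the output row
        return str(val1) + '\t' + str(val2)
    if method == 'ratio':
        return str(val1 / val2) if val2 != 0 else str(0.0)
    # assumed default: signed difference between the two systems
    return str(val1 - val2)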
def __init__(self):
    FeatureExtractor.__init__(self, 'sentence_score')
def __init__(self):
    FeatureExtractor.__init__(self, 'word_size')
def __init__(self):
    FeatureExtractor.__init__(self, 'num_lines')
def __init__(self):
    FeatureExtractor.__init__(self, 'repetition_score')
    self.stemmer = SnowballStemmer("english")
def __init__(self):
    FeatureExtractor.__init__(self, 'obscurity_score')
    self.common = self.get_wordlist(1000)
import sys
import os
from configparser import ConfigParser

from utils.ranking_data import RankingData
from utils.write_parsed import write_parsed
from utils.human_ranking import HumanRanking
from processors.process import Process
from features.feature_extractor import FeatureExtractor
from utils.wmt import write_wmt_format
from utils.process_semeval import process_semeval
from nltk.corpus import stopwords

# Read configuration file
config = ConfigParser()
config.readfp(open('test.cfg'))

# Prepare dataset
ranking_data = RankingData(config)
ranking_data.read_dataset()
ranking_data.write_dataset()
write_parsed(config.get('Data', 'input_dir').replace('plain', 'parse'), config.get('Data', 'working_dir'), ['cs-en'])

# Process dataset
process = Process(config)
sentences_target, sentences_reference = process.run_processors()
cobalt_scores = FeatureExtractor.extract_features_static(['cobalt'], sentences_target, sentences_reference)
# print(str(cobalt_scores[-1][0]))
ranking_data.write_scores_wmt_format(cobalt_scores, metric='cobalt', output_path='output/cobalt.scores')
def __init__(self):
    FeatureExtractor.__init__(self, 'width_in_char')
# Choose statistical features and physical parameters
features_list = [fc.get_min, fc.get_max, fc.get_median]
params_name_list = ['TOTUSJH', 'TOTBSQ', 'TOTPOT', 'TOTUSJZ']

# Input and output path
# path_to_root = os.path.join('..', CONST.IN_PATH_TO_MVTS_FL)
# path_to_dest = os.path.join('..', CONST.OUT_PATH_TO_RAW_FEATURES)
# path_to_root = CONST.IN_PATH_TO_MVTS_FL
path_to_root = CONST.IN_PATH_TO_MVTS_FL2
path_to_dest = CONST.OUT_PATH_TO_RAW_FEATURES
output_filename = 'raw_features_p2_FL.csv'

# Extract features
pc = FeatureExtractor(path_to_root, path_to_dest, output_filename)
pc.calculate_all(features_list, params_name_list=params_name_list)

path_to_root = CONST.IN_PATH_TO_MVTS_NF2
path_to_dest = CONST.OUT_PATH_TO_RAW_FEATURES
output_filename = 'raw_features_p2_NF.csv'

# Extract features
pc = FeatureExtractor(path_to_root, path_to_dest, output_filename)
pc.calculate_all(features_list, params_name_list=params_name_list)

path_to_root = CONST.IN_PATH_TO_MVTS_FL3
path_to_dest = CONST.OUT_PATH_TO_RAW_FEATURES
output_filename = 'raw_features_p3_FL.csv'

# Extract features
pc = FeatureExtractor(path_to_root, path_to_dest, output_filename)
pc.calculate_all(features_list, params_name_list=params_name_list)
def __init__(self):
    FeatureExtractor.__init__(self, 'num_words')