def test_load_from_config(self):
    generator = build_object(self.config['representations']['training'][1])
    data_obj = generator.generate()
    self.assertTrue('target' in data_obj)
    self.assertTrue('source' in data_obj)
    self.assertTrue('tags' in data_obj)
    self.assertTrue(len(data_obj['target']) == len(data_obj['source']) == len(data_obj['tags']))
    self.assertTrue(len(data_obj['target']) == len(data_obj['tags']))
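# A minimal sketch of the configuration shape the test above relies on: build_object()
# receives the second entry of config['representations']['training'] and must return a
# generator whose generate() yields a dict with parallel 'target', 'source' and 'tags'
# lists. Apart from that nesting, everything below is an assumption for illustration;
# the 'module'/'args' layout and all class and file names are placeholders, not the
# project's actual schema.
example_config = {
    'representations': {
        'training': [
            {'module': 'mypackage.generators.PlainTextGenerator',  # hypothetical class path
             'args': ['train.src', 'train.tgt']},
            {'module': 'mypackage.generators.WordTagGenerator',    # hypothetical class path
             'args': ['train.src', 'train.tgt', 'train.tags']},
        ],
    },
}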
def main(config, stamp): # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = config['data_type'] if 'data_type' in config else ( config['contexts'] if 'contexts' in config else 'plain') bad_tagging = config[ 'bad_tagging'] if 'bad_tagging' in config else 'pessimistic' logger.info("data_type -- {}, bad_tagging -- {}".format( data_type, bad_tagging)) # time_stamp = str(time.time()) time_stamp = stamp workers = config['workers'] tmp_dir = config['tmp_dir'] # one generator train_data_generator = build_object(config['datasets']['training'][0]) train_data = train_data_generator.generate() # test test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() logger.info("Train data keys: {}".format(train_data.keys())) logger.info("Train data sequences: {}".format(len(train_data['target']))) logger.info("Sample sequence: {}".format( [w.encode('utf-8') for w in train_data['target'][0]])) # additional representations if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) test_data = r.generate(test_data) borders = config['borders'] if 'borders' in config else False logger.info('here are the keys in your representations: {}'.format( train_data.keys())) bad_tagging = config[ 'bad_tagging'] if 'bad_tagging' in config else 'pessimistic' # test_contexts = create_contexts_ngram(test_data, data_type=data_type, test=True, bad_tagging=bad_tagging) test_contexts = create_contexts_ngram(test_data, data_type=data_type, test=True, bad_tagging=bad_tagging, tags_format=config['tags_format']) print("Objects in the train data: {}".format(len(train_data['target']))) print("UNAMBIGUOUS: ", config['unambiguous']) # train_contexts = create_contexts_ngram(train_data, data_type=data_type, bad_tagging=bad_tagging, unambiguous=config['unambiguous']) train_contexts = create_contexts_ngram(train_data, data_type=data_type, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=config['tags_format']) #print("Train contexts: {}".format(len(train_contexts))) #print("1st context:", train_contexts[0]) # the list of context objects' 'target' field lengths # to restore the word-level tags from the phrase-level #test_context_correspondence = get_contexts_words_number(test_contexts) if data_type == 'sequential': test_context_correspondence = flatten( [get_contexts_words_number(cont) for cont in test_contexts]) #print(test_context_correspondence) for idx, cont in enumerate(test_contexts): get_cont = get_contexts_words_number(cont) count_cont = [len(c['token']) for c in cont] assert (all([ get_cont[i] == count_cont[i] for i in range(len(cont)) ])), "Sum doesn't match at line {}:\n{}\n{}".format( idx, ' '.join([str(c) for c in get_cont]), ' '.join([str(c) for c in count_cont])) assert (sum(test_context_correspondence) == sum([ len(c['token']) for cont in test_contexts for c in cont ])), "Sums don't match: {} and {}".format( sum(test_context_correspondence) == sum( [len(c['token']) for cont in test_contexts for c in cont])) else: test_context_correspondence = get_contexts_words_number(test_contexts) assert (sum(test_context_correspondence) == sum([ len(c['token']) for c in test_contexts ])), "Sums don't match: {} and {}".format( sum(test_context_correspondence), sum([len(c['token']) for c in test_contexts])) # print("Token lengths:", sum([len(c['token']) for c in 
test_contexts])) # assert(sum(test_context_correspondence) == 9613), "GOLAKTEKO OPASNOSTE!!!, {}".format(sum(test_context_correspondence)) # sys.exit() # if data_type == 'sequential': # test_context_correspondence = flatten(test_context_correspondence) logger.info('Vocabulary comparison -- coverage for each dataset: ') logger.info(compare_vocabulary([train_data['target'], test_data['target']])) # END REPRESENTATION GENERATION # FEATURE EXTRACTION train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type) test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type) test_tags_true = test_data['tags'] tag_idx = 0 seg_idx = 0 # test_context_correspondence_seq = [get_contexts_words_number(cont) for cont in test_contexts] # for idx, (tag_seq, phr_seq) in enumerate(zip(test_data['tags'], test_context_correspondence_seq)): # assert(len(tag_seq) == sum(phr_seq)),"Something wrong in line {}:\n{}\n{}".format(idx, ' '.join(tag_seq), ' '.join([str(p) for p in phr_seq])) # tag_idx = 0 # for d in phr_seq: # first_tag = tag_seq[tag_idx] # assert(all([t == first_tag for t in tag_seq[tag_idx:tag_idx+d]])), "Something wrong in line {}:\n{}\n{}".format(idx, ' '.join(tag_seq), ' '.join([str(p) for p in phr_seq])) # try: # indicator = [t == first_tag for t in test_data['tags'][seg_idx][tag_idx:tag_idx+d]] # assert(all(indicator)) # tags_cnt += d # if tags_cnt == len(test_data['tags'][seg_idx]): # tags_cnt = 0 # seg_idx += 1 # elif tags_cnt > len(test_data['tags'][seg_idx]): # raise # except: # print("No correspondence in line {}, tag {}: \n{}\n{}".format(seg_idx, tag_idx, ' '.join(test_data['tags'][seg_idx]), d)) # sys.exit() #assert(sum(test_context_correspondence) == len(flatten(test_data['tags']))), "Sums don't match for phrase contexts and test data object: {} and {}".format(sum(test_context_correspondence), len(flatten(test_data['tags']))) # flat_cont = flatten(test_contexts) # flat_tags = flatten(test_data['tags']) # for ii in range(len(flat_cont)): if data_type == 'plain': assert ( len(test_context_correspondence) == len(test_tags) ), "Lengths don't match for phrase contexts and test tags: {} and {}".format( len(test_context_correspondence), len(test_tags)) # test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential') logger.info('creating feature extractors...') feature_extractors = build_objects(config['feature_extractors']) logger.info('mapping the feature extractors over the contexts for test...') test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info( 'mapping the feature extractors over the contexts for train...') train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('number of training instances: {}'.format(len(train_features))) logger.info('number of testing instances: {}'.format(len(test_features))) logger.info( 'All of your features now exist in their raw representation, but they may not be numbers yet' ) # END FEATURE EXTRACTION from sklearn.metrics import f1_score, precision_score, recall_score from sklearn.cross_validation import permutation_test_score import numpy as np tag_map = {u'OK': 1, u'BAD': 0} if data_type == 'sequential': # TODO: save features for CRFSuite, call it logger.info('training sequential model...') experiment_datasets = [{ 'name': 'test', 'features': test_features, 'tags': test_tags }, { 'name': 
'train', 'features': train_features, 'tags': train_tags }] feature_names = [ f for extractor in feature_extractors for f in extractor.get_feature_names() ] print("FEATURE NAMES: ", feature_names) persist_dir = tmp_dir logger.info('persisting your features to: {}'.format(persist_dir)) # for each dataset, write a file and persist the features if 'persist_format' not in config: config['persist_format'] = 'crf_suite' for dataset_obj in experiment_datasets: persist_features(dataset_obj['name'] + time_stamp, dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=config['persist_format']) feature_num = len(train_features[0][0]) train_file = os.path.join(tmp_dir, 'train' + time_stamp + '.crf') test_file = os.path.join(tmp_dir, 'test' + time_stamp + '.crf') if config['persist_format'] == 'crf++': # generate a template for CRF++ feature extractor generate_crf_template(feature_num, 'template', tmp_dir) # train a CRF++ model call([ 'crf_learn', os.path.join(tmp_dir, 'template'), train_file, os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp) ]) # tag a test set call([ 'crf_test', '-m', os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp), '-o', test_file + '.tagged', test_file ]) elif config['persist_format'] == 'crf_suite': crfsuite_algorithm = config[ 'crfsuite_algorithm'] if 'crfsuite_algorithm' in config else 'arow' call([ 'crfsuite', 'learn', '-a', crfsuite_algorithm, '-m', os.path.join(tmp_dir, 'crfsuite_model_file' + time_stamp), train_file ]) test_out = open(test_file + '.tagged', 'w') call([ 'crfsuite', 'tag', '-tr', '-m', os.path.join(tmp_dir, 'crfsuite_model_file' + time_stamp), test_file ], stdout=test_out) test_out.close() else: print("Unknown persist format: {}".format( config['persist_format'])) sys.exit() sequential_true = [[]] sequential_predictions = [[]] flat_true = [] flat_predictions = [] for line in open(test_file + '.tagged'): # end of tagging, statistics reported if line.startswith('Performance'): break if line == '\n': sequential_predictions.append([]) continue chunks = line[:-1].decode('utf-8').split() flat_true.append(chunks[-2]) sequential_true[-1].append(chunks[-2]) flat_predictions.append(chunks[-1]) sequential_predictions[-1].append(chunks[-1]) # restoring the word-level tags test_predictions_word, test_tags_word = [], [] for idx, n in enumerate(test_context_correspondence): for i in range(n): test_predictions_word.append(flat_predictions[idx]) test_tags_word.append(flat_true[idx]) print(f1_score(test_predictions_word, test_tags_word, average=None)) print( f1_score(test_predictions_word, test_tags_word, average='weighted', pos_label=None)) print("Precision: {}, recall: {}".format( precision_score(test_predictions_word, test_tags_word, average=None), recall_score(test_predictions_word, test_tags_word, average=None))) else: train_tags = [tag_map[tag] for tag in train_tags] #print(test_tags) test_tags = [tag_map[tag] for tag in test_tags] #print(test_tags) #sys.exit() # data_type is 'token' or 'plain' logger.info('start training...') classifier_type = import_class( config['learning']['classifier']['module']) # train the classifier(s) classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type) logger.info('classifying the test instances') test_predictions = predict_all(test_features, classifier_map, data_type=data_type) # assert(len(test_predictions) == len(flatten(test_tags_seq))), "long predictions: {}, sequential: {}".format(len(test_predictions), len(flatten(test_tags_seq))) cnt 
= 0 test_predictions_seq = [] test_tags_seq_num = [] tag_map = {'OK': 1, 'BAD': 0, 1: 1, 0: 0} long_test = True if 'multiply_data_test' in config and ( config['multiply_data_test'] == 'ngrams' or config['multiply_data_test'] == '1ton') else False # restoring the word-level tags test_predictions_word, test_tags_word = [], [] logger.info("Test predictions length: {}".format( len(test_predictions))) for idx, n in enumerate(test_context_correspondence): for i in range(n): test_predictions_word.append(test_predictions[idx]) test_tags_word.append(test_tags[idx]) test_tags_true_flat = flatten(test_tags_true) test_tags_true_flat = [tag_map[t] for t in test_tags_true_flat] # print(f1_score(test_tags_word, test_predictions_word, average=None)) # print(f1_score(test_tags_word, test_predictions_word, average='weighted', pos_label=None)) print( f1_score(test_tags_true_flat, test_predictions_word, average=None)) print( f1_score(test_tags_true_flat, test_predictions_word, average='weighted', pos_label=None)) print("Precision: {}, recall: {}".format( precision_score(test_tags_true_flat, test_predictions_word, average=None), recall_score(test_tags_true_flat, test_predictions_word, average=None))) # TODO: remove the hard coding of the tags here bad_count = sum(1 for t in test_tags if t == u'BAD' or t == 0) good_count = sum(1 for t in test_tags if t == u'OK' or t == 1) total = len(test_tags) assert (total == bad_count + good_count), 'tag counts should be correct' percent_good = good_count / float(total) logger.info('percent good in test set: {}'.format(percent_good)) logger.info('percent bad in test set: {}'.format(1 - percent_good)) random_class_results = [] random_weighted_results = [] for i in range(20): random_tags_phrase = list( np.random.choice([1, 0], total, p=[percent_good, 1 - percent_good])) random_tags = [] for idx, n in enumerate(test_context_correspondence): for i in range(n): random_tags.append(random_tags_phrase[idx]) # random_tags = [u'GOOD' for i in range(total)] random_class_f1 = f1_score(test_tags_true_flat, random_tags, average=None) random_class_results.append(random_class_f1) logger.info('two class f1 random score ({}): {}'.format( i, random_class_f1)) # random_average_f1 = f1_score(random_tags, test_tags, average='weighted') random_average_f1 = f1_score(test_tags_true_flat, random_tags, average='weighted', pos_label=None) random_weighted_results.append(random_average_f1) # logger.info('average f1 random score ({}): {}'.format(i, random_average_f1)) avg_random_class = np.average(random_class_results, axis=0) avg_weighted = np.average(random_weighted_results) logger.info( 'two class f1 random average score: {}'.format(avg_random_class)) logger.info( 'weighted f1 random average score: {}'.format(avg_weighted)) # print("Cross-validation:") # print(permutation_test_score()) # logger.info("Sequence correlation: ") # print(sequence_correlation_weighted(test_tags_seq_num, test_predictions_seq, verbose=True)[1]) label_test_hyp_ref(test_predictions_word, test_tags_true_flat, os.path.join(tmp_dir, config['output_name']), config["output_test"])
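# The phrase-to-word expansion used twice in the function above, pulled out as a
# self-contained sketch: each phrase-level prediction is repeated once per word the
# phrase covers, so the expanded sequence lines up with the word-level gold tags.
# The function and variable names here are illustrative, not part of the pipeline.
def expand_phrase_predictions(phrase_predictions, phrase_lengths):
    """Repeat each phrase-level label n times, where n is the phrase length in words."""
    word_level = []
    for label, length in zip(phrase_predictions, phrase_lengths):
        word_level.extend([label] * length)
    return word_level

# expand_phrase_predictions(['OK', 'BAD'], [3, 2]) -> ['OK', 'OK', 'OK', 'BAD', 'BAD']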
def main(config): workers = config['workers'] tmp_dir = config['tmp_dir'] if 'tmp_dir' in config else None tmp_dir = mk_tmp_dir(tmp_dir) time_stamp = str(time.time()) # REPRESENTATION GENERATION # main representations (source, target, tags) # training train_data_generators = build_objects(config['datasets']['training']) train_data = {} for gen in train_data_generators: data = gen.generate() for key in data: if key not in train_data: train_data[key] = [] train_data[key].extend(data[key]) # test test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() logger.info("Train data keys: {}".format(train_data.keys())) logger.info("Train data sequences: {}".format(len(train_data['target']))) logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]])) # logger.info("Sample sequence: {}".format(train_data['similarity'][0])) # sys.exit() # additional representations if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) test_data = r.generate(test_data) # borders = config['borders'] if 'borders' in config else False # if 'multiply_data_train' not in config: # pass # elif config['multiply_data_train'] == 'ngrams': # train_data = multiply_data_ngrams(train_data, borders=borders) # elif config['multiply_data_train'] == '1ton': # train_data = multiply_data(train_data, borders=borders) # elif config['multiply_data_train'] == 'duplicate': # train_data = multiply_data_base(train_data) # elif config['multiply_data_train'] == 'all': # train_data = multiply_data_all(train_data, borders=borders) # else: # print("Unknown 'multiply data train' value: {}".format(config['multiply_data_train'])) # logger.info("Extended train representations: {}".format(len(train_data['target']))) # logger.info("Simple test representations: {}".format(len(test_data['target']))) # if 'multiply_data_test' not in config: # pass # elif config['multiply_data_test'] == 'ngrams': # test_data = multiply_data_ngrams(test_data, borders=borders) # elif config['multiply_data_test'] == '1ton': # test_data = multiply_data(test_data, borders=borders) # else: # print("Unknown 'multiply data test' value: {}".format(config['multiply_data_test'])) # logger.info("Extended test representations: {}".format(len(test_data['target']))) logger.info('here are the keys in your representations: {}'.format(train_data.keys())) # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = config['contexts'] if 'contexts' in config else 'plain' test_contexts = create_contexts(test_data, data_type=data_type) test_contexts_seq = create_contexts(test_data, data_type='sequential') train_contexts = create_contexts(train_data, data_type=data_type) logger.info('Vocabulary comparison -- coverage for each dataset: ') logger.info(compare_vocabulary([train_data['target'], test_data['target']])) # END REPRESENTATION GENERATION # FEATURE EXTRACTION train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type) test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type) test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential') logger.info('creating feature extractors...') feature_extractors = build_objects(config['feature_extractors']) logger.info('mapping the feature extractors over the 
contexts for test...') test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('mapping the feature extractors over the contexts for train...') train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('number of training instances: {}'.format(len(train_features))) logger.info('number of testing instances: {}'.format(len(test_features))) logger.info('All of your features now exist in their raw representation, but they may not be numbers yet') # END FEATURE EXTRACTION # BEGIN CONVERTING FEATURES TO NUMBERS logger.info('binarization flag: {}'.format(config['features']['binarize'])) # flatten so that we can properly binarize the features if config['features']['binarize'] is True: logger.info('Binarizing your features...') all_values = [] if data_type == 'sequential': all_values = flatten(train_features) elif data_type == 'plain': all_values = train_features elif data_type == 'token': all_values = flatten(train_features.values()) feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] features_num = len(feature_names) true_features_num = len(all_values[0]) logger.info('fitting binarizers...') binarizers = fit_binarizers(all_values) logger.info('binarizing test data...') test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type) logger.info('binarizing training data...') # TODO: this line hangs with alignment+w2v train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type) logger.info('All of your features are now scalars in numpy arrays') logger.info('training and test sets successfully generated') # the way that we persist depends upon the structure of the data (plain/sequence/token_dict) # TODO: remove this once we have a list containing all datasets if config['features']['persist']: if 'persist_format' in config['features']: persist_format = config['features']['persist_format'] else: persist_format = 'crf++' experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags}, {'name': 'train', 'features': train_features, 'tags': train_tags}] feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] if config['features']['persist_dir']: persist_dir = config['features']['persist_dir'] else: persist_dir = os.getcwd() logger.info('persisting your features to: {}'.format(persist_dir)) # for each dataset, write a file and persist the features for dataset_obj in experiment_datasets: persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format) # BEGIN LEARNING # TODO: different sequence learning modules need different representation, we should wrap them in a class # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different from sklearn.metrics import f1_score, precision_score, recall_score import numpy as np experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags}, {'name': 'train', 'features': train_features, 'tags': train_tags}] feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] print("FEATURE NAMES: ", feature_names) persist_dir = tmp_dir logger.info('persisting your features to: {}'.format(persist_dir)) # for each dataset,
write a file and persist the features if 'persist_format' not in config: config['persist_format'] = 'crf_suite' for dataset_obj in experiment_datasets: persist_features(dataset_obj['name']+time_stamp, dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=config['persist_format']) feature_num = len(train_features[0][0]) train_file = os.path.join(tmp_dir, 'train'+time_stamp+'.crf') test_file = os.path.join(tmp_dir, 'test'+time_stamp+'.crf') tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1} if config['persist_format'] == 'crf++': # generate a template for CRF++ feature extractor generate_crf_template(feature_num, 'template', tmp_dir) # train a CRF++ model call(['crf_learn', '-a', 'MIRA', os.path.join(tmp_dir, 'template'), train_file, os.path.join(tmp_dir, 'crfpp_model_file'+time_stamp)]) # tag a test set call(['crf_test', '-m', os.path.join(tmp_dir, 'crfpp_model_file'+time_stamp), '-o', test_file+'.tagged', test_file]) elif config['persist_format'] == 'crf_suite': crfsuite_algorithm = config['crfsuite_algorithm'] call(['crfsuite', 'learn', '-a', crfsuite_algorithm, '-m', os.path.join(tmp_dir, 'crfsuite_model_file'+time_stamp), train_file]) test_out = open(test_file+'.tagged', 'w') call(['crfsuite', 'tag', '-tr', '-m', os.path.join(tmp_dir, 'crfsuite_model_file'+time_stamp), test_file], stdout=test_out) test_out.close() else: print("Unknown persist format: {}".format(config['persist_format'])) # parse CRFSuite output flattened_ref, flattened_hyp = [], [] tag_map = {'OK': 1, 'BAD': 0} for line in open(test_file+'.tagged'): if line == "\n": continue chunks = line.strip('\n').split('\t') if len(chunks) != 2: continue try: flattened_ref.append(tag_map[chunks[-2]]) flattened_hyp.append(tag_map[chunks[-1]]) except KeyError: continue print("Ref, hyp: ", len(flattened_ref), len(flattened_hyp)) logger.info('Structured prediction f1: ') print(f1_score(flattened_ref, flattened_hyp, average=None)) print(f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None)) logger.info("Sequence correlation: ")
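# A self-contained illustration of the layout the parsing loop above assumes for the
# CRFSuite-tagged test file: one token per line, tab-separated, reference tag in the
# second-to-last column, prediction in the last, blank lines between sentences, and any
# line that does not split into exactly two fields (e.g. trailing statistics) ignored.
# The sample content is invented.
sample_tagged = "OK\tOK\nOK\tBAD\nBAD\tBAD\n\nOK\tOK\n"
tag_map = {'OK': 1, 'BAD': 0}
refs, hyps = [], []
for line in sample_tagged.splitlines():
    chunks = line.split('\t')
    if len(chunks) != 2:
        continue
    refs.append(tag_map[chunks[-2]])
    hyps.append(tag_map[chunks[-1]])
# refs == [1, 1, 0, 1], hyps == [1, 0, 0, 1]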
def main(config): workers = config['workers'] tmp_dir = config['tmp_dir'] tmp_dir = mk_tmp_dir(tmp_dir) # REPRESENTATION GENERATION # main representations (source, target, tags) # training # train_data_generators = build_objects(config['datasets']['training']) # train_data = {} # for gen in train_data_generators: # data = gen.generate() # for key in data: # if key not in train_data: # train_data[key] = [] # train_data[key].extend(data[key]) train_data_generator = build_object(config['datasets']['training'][0]) train_data = train_data_generator.generate() dev, test = False, False # test if 'test' in config['datasets']: test = True test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() # dev if 'dev' in config['datasets']: dev = True dev_data_generator = build_object(config['datasets']['dev'][0]) dev_data = dev_data_generator.generate() # additional representations # print("IN MAIN") # print(train_data['alignments_file']) # print(dev_data['alignments_file']) # print(test_data['alignments_file']) if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) if test: test_data = r.generate(test_data) if dev: dev_data = r.generate(dev_data) print("TEST DATA", test_data['alignments'][0]) logger.info("Simple representations: {}".format(len(train_data['target']))) logger.info('here are the keys in your representations: {}'.format(train_data.keys())) # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = config['data_type'] print("DATA TYPE:", data_type) # sys.exit() print("Train data: ", len(train_data['target'])) if dev: print("Dev data: ", len(dev_data['target'])) if test: print("Test data: ", len(test_data['target'])) print("In different representations: ") for rep in train_data: print(rep, len(train_data[rep])) # print('Source dependencies: {}'.format(train_data['source_dependencies'][0])) # print('Target dependencies: {}'.format(train_data['target_dependencies'][0])) # print('Source root: {}'.format(train_data['source_root'][0])) # print('Target root: {}'.format(train_data['target_root'][0])) train_contexts = create_contexts(train_data, data_type=data_type) if test: test_contexts = create_contexts(test_data, data_type=data_type) logger.info('Vocabulary comparison -- coverage for test dataset: ') logger.info(compare_vocabulary([train_data['target'], test_data['target']])) if dev: dev_contexts = create_contexts(dev_data, data_type=data_type) # print("TEST CONTEXT", test_contexts[0]) print("Train contexts: ", len(train_contexts)) if dev: print("Dev contexts: ", len(dev_contexts)) if test: print("Test contexts: ", len(test_contexts)) print('Train context example: {}'.format(train_contexts[0])) # END REPRESENTATION GENERATION # FEATURE EXTRACTION train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type) if test: test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type) if dev: dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type) print("Train tags: ", len(train_tags)) if dev: print("Dev tags: ", len(dev_tags)) if test: print("Test tags: ", len(test_tags)) logger.info('creating feature extractors...') feature_extractors = build_objects(config['feature_extractors']) if test: logger.info('mapping the feature extractors over the contexts for 
test...') test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) print("Test features sample: ", test_features[0]) if dev: logger.info('mapping the feature extractors over the contexts for dev...') dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('mapping the feature extractors over the contexts for train...') train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, 1], data_type=data_type) print("Train features sample: ", train_features[0]) logger.info('number of training instances: {}'.format(len(train_features))) if dev: logger.info('number of development instances: {}'.format(len(dev_features))) if test: logger.info('number of testing instances: {}'.format(len(test_features))) logger.info('All of your features now exist in their raw representation, but they may not be numbers yet') # END FEATURE EXTRACTION # binarizing features logger.info('binarization flag: {}'.format(config['features']['binarize'])) # flatten so that we can properly binarize the features if config['features']['binarize'] is True: logger.info('Binarizing your features...') all_values = [] if data_type == 'sequential': all_values = flatten(train_features) elif data_type == 'plain': all_values = train_features elif data_type == 'token': all_values = flatten(train_features.values()) feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] features_num = len(feature_names) true_features_num = len(all_values[0]) logger.info('fitting binarizers...') binarizers = fit_binarizers(all_values) logger.info('binarizing test data...') test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type) logger.info('binarizing training data...') # TODO: this line hangs with alignment+w2v train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type) logger.info('All of your features are now scalars in numpy arrays') logger.info('training and test sets successfully generated') # persisting features logger.info('training and test sets successfully generated') experiment_datasets = [{'name': 'train', 'features': train_features, 'tags': train_tags}] if test: experiment_datasets.append({'name': 'test', 'features': test_features, 'tags': test_tags}) if dev: experiment_datasets.append({'name': 'dev', 'features': dev_features, 'tags': dev_tags}) feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] persist_dir = config['persist_dir'] if 'persist_dir' in config else config['features']['persist_dir'] persist_dir = mk_tmp_dir(persist_dir) persist_format = config['persist_format'] if 'persist_format' in config else config['features']['persist_format'] logger.info('persisting your features to: {}'.format(persist_dir)) # for each dataset, write a file and persist the features for dataset_obj in experiment_datasets: # persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format) persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format) # generate a template for CRF++ feature extractor feature_num = len(feature_names) if persist_format == 'crf++': generate_crf_template(feature_num, 'template', persist_dir) 
logger.info('Features persisted to: {}'.format(', '.join([os.path.join(persist_dir, nn) for nn in [obj['name'] for obj in experiment_datasets]])))
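# generate_crf_template() is called above to write a CRF++ template next to the
# persisted feature files. A minimal sketch of such a helper, assuming it emits one
# standard CRF++ unigram macro per feature column; the real implementation may differ.
import os

def write_crf_template(feature_num, template_name, persist_dir):
    # U<k>:%x[0,<k>] tells CRF++ to use column <k> of the current token as a unigram feature
    with open(os.path.join(persist_dir, template_name), 'w') as template:
        for k in range(feature_num):
            template.write('U{0:02d}:%x[0,{0}]\n'.format(k))
        template.write('\nB\n')  # bigram macro over adjacent output tags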
def main(config): workers = config['workers'] tmp_dir = config['tmp_dir'] tmp_dir = mk_tmp_dir(tmp_dir) # REPRESENTATION GENERATION # main representations (source, target, tags) # training train_data_generators = build_objects(config['datasets']['training']) train_data = {} for gen in train_data_generators: data = gen.generate() for key in data: if key not in train_data: train_data[key] = [] train_data[key].extend(data[key]) dev, test = False, False # test if 'test' in config['datasets']: test = True test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() # dev if 'dev' in config['datasets']: dev = True dev_data_generator = build_object(config['datasets']['dev'][0]) dev_data = dev_data_generator.generate() # additional representations if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) if test: test_data = r.generate(test_data) if dev: dev_data = r.generate(dev_data) logger.info("Simple representations: {}".format(len(train_data['target']))) logger.info('here are the keys in your representations: {}'.format(train_data.keys())) # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = config['contexts'] print("DATA TYPE:", data_type) # sys.exit() train_contexts = create_contexts(train_data, data_type=data_type) if test: test_contexts = create_contexts(test_data, data_type=data_type) if dev: dev_contexts = create_contexts(dev_data, data_type=data_type) logger.info('Vocabulary comparison -- coverage for each dataset: ') logger.info(compare_vocabulary([train_data['target'], test_data['target']])) # END REPRESENTATION GENERATION # FEATURE EXTRACTION train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type) if test: test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type) if dev: dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type) logger.info('creating feature extractors...') feature_extractors = build_objects(config['feature_extractors']) if test: logger.info('mapping the feature extractors over the contexts for test...') test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) print("Test features sample: ", test_features[0]) if dev: logger.info('mapping the feature extractors over the contexts for dev...') dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('mapping the feature extractors over the contexts for train...') train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, 1], data_type=data_type) print("Train features sample: ", train_features[0]) logger.info('number of training instances: {}'.format(len(train_features))) logger.info('number of testing instances: {}'.format(len(test_features))) logger.info('All of your features now exist in their raw representation, but they may not be numbers yet') # END FEATURE EXTRACTION # binarizing features logger.info('binarization flag: {}'.format(config['features']['binarize'])) # flatten so that we can properly binarize the features if config['features']['binarize'] is True: logger.info('Binarizing your features...') all_values = [] if data_type == 'sequential': all_values = 
flatten(train_features) elif data_type == 'plain': all_values = train_features elif data_type == 'token': all_values = flatten(train_features.values()) feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] features_num = len(feature_names) true_features_num = len(all_values[0]) logger.info('fitting binarizers...') binarizers = fit_binarizers(all_values) logger.info('binarizing test data...') test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type) logger.info('binarizing training data...') # TODO: this line hangs with alignment+w2v train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type) logger.info('All of your features are now scalars in numpy arrays') logger.info('training and test sets successfully generated') # persisting features logger.info('training and test sets successfully generated') # experiment_datasets = [{'name': 'train', 'features': train_features, 'tags': train_tags}] # if test: # experiment_datasets.append({'name': 'test', 'features': test_features, 'tags': test_tags}) # if dev: # experiment_datasets.append({'name': 'dev', 'features': dev_features, 'tags': dev_tags}) # feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] persist_dir = config['persist_dir'] if 'persist_dir' in config else config['features']['persist_dir'] persist_dir = mk_tmp_dir(persist_dir) # train_file_name, test_file_name, inv_test_file_name = persist_to_svm_dbl(train_features, test_features, feature_names, train_tags, test_tags, persist_dir) train_file_name, test_file_name = persist_to_svm_blind(train_features, test_features, train_tags, test_tags, feature_names, persist_dir) model_name = os.path.join(persist_dir, 'model') logger.info("Start training") kernel = 0 # linear kernel (default) if 'svm_params' in config: kernel = int(config['svm_params']['kernel']) if kernel <= 4 else 0 call(['/export/tools/varvara/svm_multiclass/svm_light/svm_learn', '-t', str(kernel), train_file_name, model_name]) logger.info("Training completed, start testing") test_file = os.path.join(persist_dir, 'out') # inverse_test_file = os.path.join(persist_dir, 'out_inv') call(['/export/tools/varvara/svm_multiclass/svm_light/svm_classify', '-f', '0', test_file_name, model_name, test_file]) # call(['/export/tools/varvara/svm_multiclass/svm_light/svm_classify', '-f', '0', inv_test_file_name, model_name, inverse_test_file]) logger.info("Testing completed") # predicted = get_test_score(test_file, inverse_test_file) predicted = get_test_score_blind(test_file) tag_map = {'OK': 1, 'BAD': 0} test_tags_num = [tag_map[t] for t in test_tags] logger.info(f1_score(predicted, test_tags_num, average=None)) logger.info(f1_score(predicted, test_tags_num, average='weighted', pos_label=None))
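# svm_learn / svm_classify above read files in the standard SVM-light format: one example
# per line, a numeric label followed by sparse 1-based index:value pairs. How
# persist_to_svm_blind() maps OK/BAD to labels is not visible here, so the writer below
# is only an illustration under the assumption of a +1/-1 encoding.
def write_svmlight_file(features, tags, path):
    tag_to_label = {'OK': 1, 'BAD': -1}  # assumed mapping; the real persister may differ
    with open(path, 'w') as out:
        for feature_vector, tag in zip(features, tags):
            pairs = ['{}:{}'.format(i + 1, value)
                     for i, value in enumerate(feature_vector) if value != 0]
            out.write('{} {}\n'.format(tag_to_label[tag], ' '.join(pairs)))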
def test_build_object(self):
    testing_cc = self.config['testing']
    context_creator = import_utils.build_object(testing_cc)
    self.assertTrue(len(context_creator.get_contexts('and')) > 0)
    self.assertFalse(context_creator.get_contexts('the')[0]['token'] is None)
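# The test above only relies on two properties of the built context creator:
# get_contexts(word) returns a non-empty list, and each element is a dict with a
# non-None 'token' entry. A hypothetical context dict for the word-level case is shown
# below; every key except 'token' is an assumption added for illustration.
example_context = {
    'token': 'and',                                    # the word itself
    'index': 3,                                        # assumed: position in the sentence
    'target': ['this', 'and', 'that', 'and', 'more'],  # assumed: the full target sentence
    'tag': 'OK',                                       # assumed: quality label for the token
}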
def main(config): workers = config['workers'] tmp_dir = config['tmp_dir'] tmp_dir = mk_tmp_dir(tmp_dir) # REPRESENTATION GENERATION # main representations (source, target, tags) dev, test = False, False # training if 'training' in config['datasets']: train_data_generator = build_object(config['datasets']['training'][0]) train_data = train_data_generator.generate() # test if 'test' in config['datasets']: test = True test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() # dev if 'dev' in config['datasets']: dev = True dev_data_generator = build_object(config['datasets']['dev'][0]) dev_data = dev_data_generator.generate() # additional representations if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) if test: test_data = r.generate(test_data) if dev: dev_data = r.generate(dev_data) logger.info("Simple representations: {}".format(len(train_data['target']))) logger.info('here are the keys in your representations: {}'.format( train_data.keys())) # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = 'sequential' bad_tagging = config['bad_tagging'] tags_format = config['tags_format'] if 'tags_format' in config else 'word' train_contexts = create_contexts_ngram(train_data, data_type=data_type, test=False, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=tags_format) if test: test_contexts = create_contexts_ngram( test_data, data_type=data_type, test=True, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=tags_format) if dev: dev_contexts = create_contexts_ngram(dev_data, data_type=data_type, test=True, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=tags_format) logger.info('Vocabulary comparison -- coverage for each dataset: ') logger.info(compare_vocabulary([train_data['target'], test_data['target']])) # END REPRESENTATION GENERATION # FEATURE EXTRACTION train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type) if test: test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type) if dev: dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type) # word-level tags and phrase lengths if test: test_phrase_lengths = [ get_contexts_words_number(cont) for cont in test_contexts ] if dev: dev_phrase_lengths = [ get_contexts_words_number(cont) for cont in dev_contexts ] logger.info('creating feature extractors...') feature_extractors = build_objects(config['feature_extractors']) if test: logger.info( 'mapping the feature extractors over the contexts for test...') test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) if dev: logger.info( 'mapping the feature extractors over the contexts for dev...') dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info( 'mapping the feature extractors over the contexts for train...') train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('number of training instances: {}'.format(len(train_features))) logger.info('number of testing instances: {}'.format(len(test_features))) logger.info( 'All of your features now 
exist in their raw representation, but they may not be numbers yet' ) # END FEATURE EXTRACTION # persisting features logger.info('training and test sets successfully generated') experiment_datasets = [{ 'name': 'train', 'features': train_features, 'tags': train_tags, 'phrase_lengths': None }] if test: experiment_datasets.append({ 'name': 'test', 'features': test_features, 'tags': test_tags, 'phrase_lengths': test_phrase_lengths }) if dev: experiment_datasets.append({ 'name': 'dev', 'features': dev_features, 'tags': dev_tags, 'phrase_lengths': dev_phrase_lengths }) feature_names = [ f for extractor in feature_extractors for f in extractor.get_feature_names() ] persist_dir = config['persist_dir'] if 'persist_dir' in config else tmp_dir persist_dir = mk_tmp_dir(persist_dir) persist_format = config['persist_format'] logger.info('persisting your features to: {}'.format(persist_dir)) # for each dataset, write a file and persist the features for dataset_obj in experiment_datasets: persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, phrase_lengths=dataset_obj['phrase_lengths'], tags=None, file_format=persist_format) # generate a template for CRF++ feature extractor feature_num = len(feature_names) if persist_format == 'crf++': generate_crf_template(feature_num, 'template', persist_dir) logger.info('Features persisted to: {}'.format(', '.join([ os.path.join(persist_dir, nn) for nn in [obj['name'] for obj in experiment_datasets] ])))
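# The phrase_lengths persisted above come from get_contexts_words_number(). Judging by
# the assertions in the phrase-level experiment earlier in this section, it reports how
# many words each phrase-level context covers (the length of its 'token' list). An
# equivalent one-liner, included only to document the expected behaviour:
def phrase_word_counts(sentence_contexts):
    """Number of words covered by each phrase context of one sentence."""
    return [len(context['token']) for context in sentence_contexts]

# phrase_word_counts([{'token': ['a', 'b']}, {'token': ['c']}]) == [2, 1]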
def main(config): workers = config['workers'] tmp_dir = config['tmp_dir'] tmp_dir = mk_tmp_dir(tmp_dir) # REPRESENTATION GENERATION # main representations (source, target, tags) dev, test = False, False # training if 'training' in config['datasets']: train_data_generator = build_object(config['datasets']['training'][0]) train_data = train_data_generator.generate() # test if 'test' in config['datasets']: test = True test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() # dev if 'dev' in config['datasets']: dev = True dev_data_generator = build_object(config['datasets']['dev'][0]) dev_data = dev_data_generator.generate() # additional representations if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) if test: test_data = r.generate(test_data) if dev: dev_data = r.generate(dev_data) logger.info("Simple representations: {}".format(len(train_data['target']))) logger.info('here are the keys in your representations: {}'.format(train_data.keys())) # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = 'sequential' bad_tagging = config['bad_tagging'] tags_format = config['tags_format'] if 'tags_format' in config else 'word' train_contexts = create_contexts_ngram(train_data, data_type=data_type, test=False, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=tags_format) if test: test_contexts = create_contexts_ngram(test_data, data_type=data_type, test=True, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=tags_format) if dev: dev_contexts = create_contexts_ngram(dev_data, data_type=data_type, test=True, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=tags_format) logger.info('Vocabulary comparison -- coverage for each dataset: ') logger.info(compare_vocabulary([train_data['target'], test_data['target']])) # END REPRESENTATION GENERATION # FEATURE EXTRACTION train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type) if test: test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type) if dev: dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type) # word-level tags and phrase lengths if test: test_phrase_lengths = [get_contexts_words_number(cont) for cont in test_contexts] if dev: dev_phrase_lengths = [get_contexts_words_number(cont) for cont in dev_contexts] logger.info('creating feature extractors...') feature_extractors = build_objects(config['feature_extractors']) if test: logger.info('mapping the feature extractors over the contexts for test...') test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) if dev: logger.info('mapping the feature extractors over the contexts for dev...') dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('mapping the feature extractors over the contexts for train...') train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type) logger.info('number of training instances: {}'.format(len(train_features))) logger.info('number of testing instances: {}'.format(len(test_features))) logger.info('All of your features now exist in 
their raw representation, but they may not be numbers yet') # END FEATURE EXTRACTION # persisting features logger.info('training and test sets successfully generated') experiment_datasets = [{'name': 'train', 'features': train_features, 'tags': train_tags, 'phrase_lengths': None}] if test: experiment_datasets.append({'name': 'test', 'features': test_features, 'tags': test_tags, 'phrase_lengths': test_phrase_lengths}) if dev: experiment_datasets.append({'name': 'dev', 'features': dev_features, 'tags': dev_tags, 'phrase_lengths': dev_phrase_lengths}) feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()] persist_dir = config['persist_dir'] if 'persist_dir' in config else tmp_dir persist_dir = mk_tmp_dir(persist_dir) persist_format = config['persist_format'] logger.info('persisting your features to: {}'.format(persist_dir)) # for each dataset, write a file and persist the features for dataset_obj in experiment_datasets: persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, phrase_lengths=dataset_obj['phrase_lengths'], tags=dataset_obj['tags'], file_format=persist_format) # generate a template for CRF++ feature extractor feature_num = len(feature_names) if persist_format == 'crf++': generate_crf_template(feature_num, 'template', persist_dir) logger.info('Features persisted to: {}'.format(', '.join([os.path.join(persist_dir, nn) for nn in [obj['name'] for obj in experiment_datasets]])))
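# When persist_format is 'crf++', the files written by persist_features() are meant to be
# consumed by crf_learn together with the template generated above. CRF++ expects one
# token per line with whitespace-separated feature columns and the tag as the final
# column, and a blank line between sentences. The fragment below is an invented
# illustration of that layout; the exact feature columns persist_features() writes are
# not visible in this section.
#
#   this    DET     low     OK
#   line    NOUN    high    BAD
#
#   next    ADJ     low     OK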
def main(config, stamp): # the data_type is the format corresponding to the model of the data that the user wishes to learn data_type = config['data_type'] if 'data_type' in config else (config['contexts'] if 'contexts' in config else 'plain') bad_tagging = config['bad_tagging'] if 'bad_tagging' in config else 'pessimistic' logger.info("data_type -- {}, bad_tagging -- {}".format(data_type, bad_tagging)) # time_stamp = str(time.time()) time_stamp = stamp workers = config['workers'] tmp_dir = config['tmp_dir'] # one generator train_data_generator = build_object(config['datasets']['training'][0]) train_data = train_data_generator.generate() # test test_data_generator = build_object(config['datasets']['test'][0]) test_data = test_data_generator.generate() logger.info("Train data keys: {}".format(train_data.keys())) logger.info("Train data sequences: {}".format(len(train_data['target']))) logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]])) # additional representations if 'representations' in config: representation_generators = build_objects(config['representations']) else: representation_generators = [] for r in representation_generators: train_data = r.generate(train_data) test_data = r.generate(test_data) borders = config['borders'] if 'borders' in config else False logger.info('here are the keys in your representations: {}'.format(train_data.keys())) bad_tagging = config['bad_tagging'] if 'bad_tagging' in config else 'pessimistic' # test_contexts = create_contexts_ngram(test_data, data_type=data_type, test=True, bad_tagging=bad_tagging) test_contexts = create_contexts_ngram(test_data, data_type=data_type, test=True, bad_tagging=bad_tagging, tags_format=config['tags_format']) print("Objects in the train data: {}".format(len(train_data['target']))) print("UNAMBIGUOUS: ", config['unambiguous']) # train_contexts = create_contexts_ngram(train_data, data_type=data_type, bad_tagging=bad_tagging, unambiguous=config['unambiguous']) train_contexts = create_contexts_ngram(train_data, data_type=data_type, bad_tagging=bad_tagging, unambiguous=config['unambiguous'], tags_format=config['tags_format']) #print("Train contexts: {}".format(len(train_contexts))) #print("1st context:", train_contexts[0]) # the list of context objects' 'target' field lengths # to restore the word-level tags from the phrase-level #test_context_correspondence = get_contexts_words_number(test_contexts) if data_type == 'sequential': test_context_correspondence = flatten([get_contexts_words_number(cont) for cont in test_contexts]) #print(test_context_correspondence) for idx, cont in enumerate(test_contexts): get_cont = get_contexts_words_number(cont) count_cont = [len(c['token']) for c in cont] assert(all([get_cont[i] == count_cont[i] for i in range(len(cont))])), "Sum doesn't match at line {}:\n{}\n{}".format(idx, ' '.join([str(c) for c in get_cont]), ' '.join([str(c) for c in count_cont])) assert(sum(test_context_correspondence) == sum([len(c['token']) for cont in test_contexts for c in cont])), "Sums don't match: {} and {}".format(sum(test_context_correspondence) == sum([len(c['token']) for cont in test_contexts for c in cont])) else: test_context_correspondence = get_contexts_words_number(test_contexts) assert(sum(test_context_correspondence) == sum([len(c['token']) for c in test_contexts])), "Sums don't match: {} and {}".format(sum(test_context_correspondence), sum([len(c['token']) for c in test_contexts])) # print("Token lengths:", sum([len(c['token']) for c in test_contexts])) # 
    # dataset-specific sanity check left over from debugging (9613 was the word count of one
    # particular test set); disabled so the pipeline also runs on other data
    # assert(sum(test_context_correspondence) == 9613), "Unexpected total number of words: {}".format(sum(test_context_correspondence))
    # sys.exit()
    # if data_type == 'sequential':
    #     test_context_correspondence = flatten(test_context_correspondence)
    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    test_tags_true = test_data['tags']
    tag_idx = 0
    seg_idx = 0
    # test_context_correspondence_seq = [get_contexts_words_number(cont) for cont in test_contexts]
    # for idx, (tag_seq, phr_seq) in enumerate(zip(test_data['tags'], test_context_correspondence_seq)):
    #     assert(len(tag_seq) == sum(phr_seq)), "Something wrong in line {}:\n{}\n{}".format(idx, ' '.join(tag_seq), ' '.join([str(p) for p in phr_seq]))
    #     tag_idx = 0
    #     for d in phr_seq:
    #         first_tag = tag_seq[tag_idx]
    #         assert(all([t == first_tag for t in tag_seq[tag_idx:tag_idx+d]])), "Something wrong in line {}:\n{}\n{}".format(idx, ' '.join(tag_seq), ' '.join([str(p) for p in phr_seq]))
    #         try:
    #             indicator = [t == first_tag for t in test_data['tags'][seg_idx][tag_idx:tag_idx+d]]
    #             assert(all(indicator))
    #             tags_cnt += d
    #             if tags_cnt == len(test_data['tags'][seg_idx]):
    #                 tags_cnt = 0
    #                 seg_idx += 1
    #             elif tags_cnt > len(test_data['tags'][seg_idx]):
    #                 raise
    #         except:
    #             print("No correspondence in line {}, tag {}: \n{}\n{}".format(seg_idx, tag_idx, ' '.join(test_data['tags'][seg_idx]), d))
    #             sys.exit()
    assert(sum(test_context_correspondence) == len(flatten(test_data['tags']))), \
        "Sums don't match for phrase contexts and test data object: {} and {}".format(sum(test_context_correspondence), len(flatten(test_data['tags'])))
    # flat_cont = flatten(test_contexts)
    # flat_tags = flatten(test_data['tags'])
    # for ii in range(len(flat_cont)):
    if data_type == 'plain':
        assert(len(test_context_correspondence) == len(test_tags)), \
            "Lengths don't match for phrase contexts and test tags: {} and {}".format(len(test_context_correspondence), len(test_tags))
    # test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential')
    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.cross_validation import permutation_test_score
    import numpy as np

    tag_map = {u'OK': 1, u'BAD': 0}
    if data_type == 'sequential':
        # TODO: save features for CRFSuite, call it
        logger.info('training sequential model...')
        experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags},
                               {'name': 'train', 'features': train_features, 'tags': train_tags}]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        print("FEATURE NAMES: ", feature_names)
        persist_dir = tmp_dir
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        if 'persist_format' not in config:
            config['persist_format'] = 'crf_suite'
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'] + time_stamp, dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=config['persist_format'])
        feature_num = len(train_features[0][0])
        train_file = os.path.join(tmp_dir, 'train' + time_stamp + '.crf')
        test_file = os.path.join(tmp_dir, 'test' + time_stamp + '.crf')
        if config['persist_format'] == 'crf++':
            # generate a template for the CRF++ feature extractor
            generate_crf_template(feature_num, 'template', tmp_dir)
            # train a CRF++ model
            call(['crf_learn', os.path.join(tmp_dir, 'template'), train_file, os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp)])
            # tag a test set
            call(['crf_test', '-m', os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp), '-o', test_file + '.tagged', test_file])
        elif config['persist_format'] == 'crf_suite':
            crfsuite_algorithm = config['crfsuite_algorithm'] if 'crfsuite_algorithm' in config else 'arow'
            call(['crfsuite', 'learn', '-a', crfsuite_algorithm, '-m', os.path.join(tmp_dir, 'crfsuite_model_file' + time_stamp), train_file])
            test_out = open(test_file + '.tagged', 'w')
            call(['crfsuite', 'tag', '-tr', '-m', os.path.join(tmp_dir, 'crfsuite_model_file' + time_stamp), test_file], stdout=test_out)
            test_out.close()
        else:
            print("Unknown persist format: {}".format(config['persist_format']))

        # parse the tagger output: the last two columns of each line are the reference and predicted tags
        sequential_true = [[]]
        sequential_predictions = [[]]
        flat_true = []
        flat_predictions = []
        for line in open(test_file + '.tagged'):
            # end of tagging, statistics reported
            if line.startswith('Performance'):
                break
            if line == '\n':
                # keep reference and prediction sequences aligned on sentence boundaries
                sequential_true.append([])
                sequential_predictions.append([])
                continue
            chunks = line[:-1].decode('utf-8').split()
            flat_true.append(chunks[-2])
            sequential_true[-1].append(chunks[-2])
            flat_predictions.append(chunks[-1])
            sequential_predictions[-1].append(chunks[-1])

        # restoring the word-level tags
        test_predictions_word, test_tags_word = [], []
        for idx, n in enumerate(test_context_correspondence):
            for i in range(n):
                test_predictions_word.append(flat_predictions[idx])
                test_tags_word.append(flat_true[idx])
        # sklearn metrics expect (y_true, y_pred)
        print(f1_score(test_tags_word, test_predictions_word, average=None))
        print(f1_score(test_tags_word, test_predictions_word, average='weighted', pos_label=None))
        print("Precision: {}, recall: {}".format(precision_score(test_tags_word, test_predictions_word, average=None),
                                                 recall_score(test_tags_word, test_predictions_word, average=None)))
    else:
        train_tags = [tag_map[tag] for tag in train_tags]
        # print(test_tags)
        test_tags = [tag_map[tag] for tag in test_tags]
        # print(test_tags)
        # sys.exit()
        # data_type is 'token' or 'plain'
        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)
        # assert(len(test_predictions) == len(flatten(test_tags_seq))), "long predictions: {}, sequential: {}".format(len(test_predictions), len(flatten(test_tags_seq)))
        cnt = 0
        test_predictions_seq = []
        test_tags_seq_num = []
        tag_map = {'OK': 1, 'BAD': 0, 1: 1, 0: 0}
        long_test = True if 'multiply_data_test' in config and (config['multiply_data_test'] == 'ngrams' or config['multiply_data_test'] == '1ton') else False
        # restoring the word-level tags
        test_predictions_word, test_tags_word = [], []
        logger.info("Test predictions length: {}".format(len(test_predictions)))
        for idx, n in enumerate(test_context_correspondence):
            for i in range(n):
                test_predictions_word.append(test_predictions[idx])
                test_tags_word.append(test_tags[idx])
        test_tags_true_flat = flatten(test_tags_true)
        test_tags_true_flat = [tag_map[t] for t in test_tags_true_flat]
        # print(f1_score(test_tags_word, test_predictions_word, average=None))
        # print(f1_score(test_tags_word, test_predictions_word, average='weighted', pos_label=None))
        print(f1_score(test_tags_true_flat, test_predictions_word, average=None))
        print(f1_score(test_tags_true_flat, test_predictions_word, average='weighted', pos_label=None))
        print("Precision: {}, recall: {}".format(precision_score(test_tags_true_flat, test_predictions_word, average=None),
                                                 recall_score(test_tags_true_flat, test_predictions_word, average=None)))

        # random baseline: sample phrase-level tags with the class distribution of the test set
        # TODO: remove the hard coding of the tags here
        bad_count = sum(1 for t in test_tags if t == u'BAD' or t == 0)
        good_count = sum(1 for t in test_tags if t == u'OK' or t == 1)
        total = len(test_tags)
        assert (total == bad_count + good_count), 'tag counts should be correct'
        percent_good = float(good_count) / total
        logger.info('percent good in test set: {}'.format(percent_good))
        logger.info('percent bad in test set: {}'.format(1 - percent_good))
        random_class_results = []
        random_weighted_results = []
        for i in range(20):
            # np.random.choice takes the class probabilities via the 'p' keyword argument
            random_tags_phrase = list(np.random.choice([1, 0], total, p=[percent_good, 1 - percent_good]))
            random_tags = []
            for idx, n in enumerate(test_context_correspondence):
                for _ in range(n):
                    random_tags.append(random_tags_phrase[idx])
            # random_tags = [u'GOOD' for i in range(total)]
            random_class_f1 = f1_score(test_tags_true_flat, random_tags, average=None)
            random_class_results.append(random_class_f1)
            logger.info('two class f1 random score ({}): {}'.format(i, random_class_f1))
            # random_average_f1 = f1_score(random_tags, test_tags, average='weighted')
            random_average_f1 = f1_score(test_tags_true_flat, random_tags, average='weighted', pos_label=None)
            random_weighted_results.append(random_average_f1)
            # logger.info('average f1 random score ({}): {}'.format(i, random_average_f1))
        avg_random_class = np.average(random_class_results, axis=0)
        avg_weighted = np.average(random_weighted_results)
        logger.info('two class f1 random average score: {}'.format(avg_random_class))
        logger.info('weighted f1 random average score: {}'.format(avg_weighted))
        # print("Cross-validation:")
        # print(permutation_test_score())
        # logger.info("Sequence correlation: ")
        # print(sequence_correlation_weighted(test_tags_seq_num, test_predictions_seq, verbose=True)[1])
        label_test_hyp_ref(test_predictions_word, test_tags_true_flat, os.path.join(tmp_dir, config['output_name']), config["output_test"])
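
# Illustrative sketch (not part of the original pipeline): the word-level restore used above
# can be factored out into a small helper. 'phrase_labels' and 'words_per_phrase' are
# hypothetical names; 'words_per_phrase' plays the role of test_context_correspondence,
# i.e. the number of words covered by each phrase-level context.
def expand_phrase_labels_to_words(phrase_labels, words_per_phrase):
    """Repeat each phrase-level label once per word that the phrase covers."""
    word_labels = []
    for label, n_words in zip(phrase_labels, words_per_phrase):
        word_labels.extend([label] * n_words)
    return word_labels

# usage sketch: expand_phrase_labels_to_words([1, 0], [2, 3]) -> [1, 1, 0, 0, 0]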
def main(config):
    workers = config['workers']
    tmp_dir = config['tmp_dir']
    tmp_dir = mk_tmp_dir(tmp_dir)

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    # train_data_generators = build_objects(config['datasets']['training'])
    # train_data = {}
    # for gen in train_data_generators:
    #     data = gen.generate()
    #     for key in data:
    #         if key not in train_data:
    #             train_data[key] = []
    #         train_data[key].extend(data[key])
    train_data_generator = build_object(config['datasets']['training'][0])
    train_data = train_data_generator.generate()
    dev, test = False, False
    # test
    if 'test' in config['datasets']:
        test = True
        test_data_generator = build_object(config['datasets']['test'][0])
        test_data = test_data_generator.generate()
    # dev
    if 'dev' in config['datasets']:
        dev = True
        dev_data_generator = build_object(config['datasets']['dev'][0])
        dev_data = dev_data_generator.generate()

    # additional representations
    # print("IN MAIN")
    # print(train_data['alignments_file'])
    # print(dev_data['alignments_file'])
    # print(test_data['alignments_file'])
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        if test:
            test_data = r.generate(test_data)
        if dev:
            dev_data = r.generate(dev_data)
    if test:
        print("TEST DATA", test_data['alignments'][0])
    logger.info("Simple representations: {}".format(len(train_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts']
    print("DATA TYPE:", data_type)
    # sys.exit()
    train_contexts = create_contexts(train_data, data_type=data_type)
    if test:
        test_contexts = create_contexts(test_data, data_type=data_type)
        logger.info('Vocabulary comparison -- coverage for test dataset: ')
        logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    if dev:
        dev_contexts = create_contexts(dev_data, data_type=data_type)
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    if test:
        test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    if dev:
        dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type)
    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    if test:
        logger.info('mapping the feature extractors over the contexts for test...')
        test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
        print("Test features sample: ", test_features[0])
    if dev:
        logger.info('mapping the feature extractors over the contexts for dev...')
        dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, 1], data_type=data_type)
    print("Train features sample: ", train_features[0])
    logger.info('number of training instances: {}'.format(len(train_features)))
    if test:
        logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # binarizing features
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        features_num = len(feature_names)
        true_features_num = len(all_values[0])
        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        if test:
            logger.info('binarizing test data...')
            test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)
        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # persisting features
    experiment_datasets = [{'name': 'train', 'features': train_features, 'tags': train_tags}]
    if test:
        experiment_datasets.append({'name': 'test', 'features': test_features, 'tags': test_tags})
    if dev:
        experiment_datasets.append({'name': 'dev', 'features': dev_features, 'tags': dev_tags})
    feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
    persist_dir = config['persist_dir'] if 'persist_dir' in config else config['features']['persist_dir']
    persist_dir = mk_tmp_dir(persist_dir)
    persist_format = config['persist_format'] if 'persist_format' in config else config['features']['persist_format']
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    for dataset_obj in experiment_datasets:
        # persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)
        persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=None, file_format=persist_format)

    # generate a template for the CRF++ feature extractor
    feature_num = len(feature_names)
    if persist_format == 'crf++':
        generate_crf_template(feature_num, 'template', persist_dir)
    logger.info('Features persisted to: {}'.format(', '.join([os.path.join(persist_dir, nn) for nn in [obj['name'] for obj in experiment_datasets]])))
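
# Minimal sketch of a config dict accepted by the persistence entry point above, assembled
# from the keys it reads; all values and paths below are placeholders for illustration,
# not settings from the original experiments.
example_persist_config = {
    'workers': 1,
    'tmp_dir': 'tmp',
    'datasets': {
        'training': [...],   # dataset generator specs, built via build_object(s)
        'test': [...],        # optional
        'dev': [...],         # optional
    },
    'representations': [],     # optional additional representation generators
    'contexts': 'plain',       # data_type: 'plain', 'sequential' or 'token'
    'feature_extractors': [],  # feature extractor specs
    'features': {
        'binarize': False,
        'persist_dir': 'features_out',
        'persist_format': 'crf_suite',
    },
}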
def main(config):
    workers = config['workers']
    tmp_dir = config['tmp_dir'] if 'tmp_dir' in config else None
    tmp_dir = mk_tmp_dir(tmp_dir)
    time_stamp = str(time.time())

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generators = build_objects(config['datasets']['training'])
    train_data = {}
    for gen in train_data_generators:
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()
    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]]))
    # logger.info("Sample sequence: {}".format(train_data['similarity'][0]))
    # sys.exit()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)
    # borders = config['borders'] if 'borders' in config else False
    # if 'multiply_data_train' not in config:
    #     pass
    # elif config['multiply_data_train'] == 'ngrams':
    #     train_data = multiply_data_ngrams(train_data, borders=borders)
    # elif config['multiply_data_train'] == '1ton':
    #     train_data = multiply_data(train_data, borders=borders)
    # elif config['multiply_data_train'] == 'duplicate':
    #     train_data = multiply_data_base(train_data)
    # elif config['multiply_data_train'] == 'all':
    #     train_data = multiply_data_all(train_data, borders=borders)
    # else:
    #     print("Unknown 'multiply data train' value: {}".format(config['multiply_data_train']))
    # logger.info("Extended train representations: {}".format(len(train_data['target'])))
    # logger.info("Simple test representations: {}".format(len(test_data['target'])))
    # if 'multiply_data_test' not in config:
    #     pass
    # elif config['multiply_data_test'] == 'ngrams':
    #     test_data = multiply_data_ngrams(test_data, borders=borders)
    # elif config['multiply_data_test'] == '1ton':
    #     test_data = multiply_data(test_data, borders=borders)
    # else:
    #     print("Unknown 'multiply data test' value: {}".format(config['multiply_data_test']))
    # logger.info("Extended test representations: {}".format(len(test_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'
    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)
    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential')
    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        features_num = len(feature_names)
        true_features_num = len(all_values[0])
        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)
        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags},
                               {'name': 'train', 'features': train_features, 'tags': train_tags}]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        if config['features']['persist_dir']:
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)

    # BEGIN LEARNING
    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np

    experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags},
                           {'name': 'train', 'features': train_features, 'tags': train_tags}]
    feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
    print("FEATURE NAMES: ", feature_names)
    persist_dir = tmp_dir
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    if 'persist_format' not in config:
        config['persist_format'] = 'crf_suite'
    for dataset_obj in experiment_datasets:
        persist_features(dataset_obj['name'] + time_stamp, dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=config['persist_format'])
    feature_num = len(train_features[0][0])
    train_file = os.path.join(tmp_dir, 'train' + time_stamp + '.crf')
    test_file = os.path.join(tmp_dir, 'test' + time_stamp + '.crf')
    tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1}
    if config['persist_format'] == 'crf++':
        # generate a template for the CRF++ feature extractor
        generate_crf_template(feature_num, 'template', tmp_dir)
        # train a CRF++ model
        call(['crf_learn', '-a', 'MIRA', os.path.join(tmp_dir, 'template'), train_file, os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp)])
        # tag a test set
        call(['crf_test', '-m', os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp), '-o', test_file + '.tagged', test_file])
    elif config['persist_format'] == 'crf_suite':
        crfsuite_algorithm = config['crfsuite_algorithm'] if 'crfsuite_algorithm' in config else 'arow'
        call(['crfsuite', 'learn', '-a', crfsuite_algorithm, '-m', os.path.join(tmp_dir, 'crfsuite_model_file' + time_stamp), train_file])
        test_out = open(test_file + '.tagged', 'w')
        call(['crfsuite', 'tag', '-tr', '-m', os.path.join(tmp_dir, 'crfsuite_model_file' + time_stamp), test_file], stdout=test_out)
        test_out.close()
    else:
        print("Unknown persist format: {}".format(config['persist_format']))

    # parse CRFSuite output: each non-empty line holds the reference and the predicted tag
    flattened_ref, flattened_hyp = [], []
    tag_map = {'OK': 1, 'BAD': 0}
    for line in open(test_file + '.tagged'):
        if line == "\n":
            continue
        chunks = line.strip('\n').split('\t')
        if len(chunks) != 2:
            continue
        try:
            flattened_ref.append(tag_map[chunks[-2]])
            flattened_hyp.append(tag_map[chunks[-1]])
        except KeyError:
            continue
    print("Ref, hyp: ", len(flattened_ref), len(flattened_hyp))
    logger.info('Structured prediction f1: ')
    print(f1_score(flattened_ref, flattened_hyp, average=None))
    print(f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None))
    logger.info("Sequence correlation: ")
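
# Usage sketch (assumption: experiment configs are stored as YAML and the file name below is
# hypothetical). Loads a config dict and runs the entry point defined above.
if __name__ == '__main__':
    import yaml
    with open('experiment_config.yaml') as cfg_file:
        config = yaml.load(cfg_file)
    main(config)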