def run_prediction((train_data, train_tags, test_data, test_tags, idx)):
    logger.info('training sequential model...')

    # binarize the features
    all_values = flatten(train_data)
    binarizers = fit_binarizers(all_values)
    test_data = call_for_each_element(test_data, binarize, [binarizers], data_type='sequential')
    train_data = call_for_each_element(train_data, binarize, [binarizers], data_type='sequential')

    # cast to numpy arrays of sequences (inner sequences may have different lengths)
    x_train = np.array([np.array(xi) for xi in train_data])
    y_train = np.array([np.array(xi) for xi in train_tags])
    x_test = np.array([np.array(xi) for xi in test_data])
    y_test = np.array([np.array(xi) for xi in test_tags])

    sequence_learner = PystructSequenceLearner()
    sequence_learner.fit(x_train, y_train)
    structured_hyp = sequence_learner.predict(x_test)

    logger.info('scoring sequential model...')
    flattened_hyp = flatten(structured_hyp)
    flattened_ref = flatten(y_test)

    logger.info('Structured prediction f1: ')
    cur_res = f1_score(flattened_ref, flattened_hyp, average=None)
    logger.info('[ {}, {} ], {}'.format(
        cur_res[0], cur_res[1],
        f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None)))
    return cur_res, idx
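# Usage sketch (hypothetical): the tuple-unpacking signature (Python 2 only)
# suggests run_prediction is meant to be mapped over cross-validation folds,
# e.g. with multiprocessing. 'folds' is an assumed name; each element would be
# a (train_data, train_tags, test_data, test_tags) tuple.
#
# from multiprocessing import Pool
#
# pool = Pool(processes=4)
# results = pool.map(run_prediction,
#                    [(tr_d, tr_t, te_d, te_t, i)
#                     for i, (tr_d, tr_t, te_d, te_t) in enumerate(folds)])
# for cur_res, idx in sorted(results, key=lambda r: r[1]):
#     logger.info('fold {}: BAD f1 = {}, OK f1 = {}'.format(idx, cur_res[0], cur_res[1]))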
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generator = build_object(config['datasets']['training'][0])
    train_data = train_data_generator.generate()
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]]))

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    borders = config['borders'] if 'borders' in config else False
    if 'multiply_data_train' not in config:
        pass
    elif config['multiply_data_train'] == 'ngrams':
        logger.info("Multiply data: {} and {}".format(config['multiply_data_train'], config.get('multiply_data_test')))
        train_data = multiply_data_ngrams(train_data, borders=borders)
        logger.info("Sequences: {}, tag sequences: {}".format(len(train_data['target']), len(train_data['tags'])))
    elif config['multiply_data_train'] == '1ton':
        logger.info("Multiply data: {} and {}".format(config['multiply_data_train'], config.get('multiply_data_test')))
        train_data = multiply_data(train_data, borders=borders)
    elif config['multiply_data_train'] == 'duplicate':
        train_data = multiply_data_base(train_data)
    elif config['multiply_data_train'] == 'all':
        train_data = multiply_data_all(train_data, borders=borders)
    else:
        print("Unknown 'multiply data train' value: {}".format(config['multiply_data_train']))
    logger.info("Train data example: {}".format(train_data['target'][:10]))
    logger.info("Train tags example: {}".format(train_data['tags'][:10]))
    logger.info("Extended train representations: {}".format(len(train_data['target'])))
    logger.info("Simple test representations: {}".format(len(test_data['target'])))
    if 'multiply_data_test' not in config:
        pass
    elif config['multiply_data_test'] == 'ngrams':
        test_data = multiply_data_ngrams(test_data, borders=borders)
    elif config['multiply_data_test'] == '1ton':
        test_data = multiply_data(test_data, borders=borders)
    else:
        print("Unknown 'multiply data test' value: {}".format(config['multiply_data_test']))
    logger.info("Extended test representations: {}".format(len(test_data['target'])))

    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['data_type'] if 'data_type' in config else 'sequential'
    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential')

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('train features sample: {}'.format(train_features[:5]))
    logger.info('train tags sample: {}'.format(train_tags[:5]))
    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        features_num = len(feature_names)
        true_features_num = len(all_values[0])

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)
        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [
            {'name': 'test', 'features': test_features, 'tags': test_tags},
            {'name': 'train', 'features': train_features, 'tags': train_tags},
        ]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        if 'persist_dir' in config:
            persist_dir = config['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir,
                             feature_names=feature_names, tags=dataset_obj['tags'],
                             file_format=persist_format)
        # this variant only persists the features, so it stops here
        sys.exit()

    # BEGIN LEARNING
    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np

    tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1}
    # when the test data was multiplied into n-gram sequences, only the last
    # word of every sequence carries a real prediction; hoisted here so that
    # both branches below can use it
    long_test = 'multiply_data_test' in config and config['multiply_data_test'] in ('ngrams', '1ton')

    if data_type == 'sequential':
        logger.info('training sequential model...')
        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)
        # end pystruct

        # only the last word in every sequence should be counted
        flattened_hyp = []
        flattened_ref = []
        if long_test:
            for idx, seq in enumerate(structured_hyp):
                flattened_hyp.append(seq[-1])
                flattened_ref.append(y_test[idx][-1])
        else:
            flattened_hyp = flatten(structured_hyp)
            flattened_ref = flatten(y_test)

        logger.info('scoring sequential model...')
        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
        print("Ref, hyp: {} {}".format(len(flattened_ref), len(flattened_hyp)))
        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))
        print(f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None))
        logger.info("Sequence correlation: ")
        print(sequence_correlation_weighted(y_test, structured_hyp, verbose=True)[1])
    else:
        # data_type is 'token' or 'plain'
        train_tags = [tag_map[tag] for tag in train_tags]
        test_tags = [tag_map[tag] for tag in test_tags]

        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)

        # regroup the flat predictions into the original test sequences
        cnt = 0
        test_predictions_seq = []
        test_tags_seq_num = []
        for idx, seq in enumerate(test_tags_seq):
            test_predictions_seq.append([])
            test_tags_seq_num.append([])
            for w in seq:
                test_predictions_seq[-1].append(tag_map[test_predictions[cnt]])
                test_tags_seq_num[-1].append(tag_map[w])
                cnt += 1
        if long_test:
            cnt = -1
            new_predictions = []
            new_true = []
            for seq in test_tags_seq:
                cnt += len(seq)
                new_predictions.append(tag_map[test_predictions[cnt]])
                new_true.append(tag_map[seq[-1]])
            test_predictions = new_predictions
            test_tags = new_true
        # sklearn metrics expect (y_true, y_pred)
        print(f1_score(test_tags, test_predictions, average=None))
        print(f1_score(test_tags, test_predictions, average='weighted', pos_label=None))
        print("Precision: {}, recall: {}".format(
            precision_score(test_tags, test_predictions, average=None),
            recall_score(test_tags, test_predictions, average=None)))
        logger.info("Sequence correlation: ")
        print(sequence_correlation_weighted(test_tags_seq_num, test_predictions_seq, verbose=True)[1])
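# Worked sketch of the 'long_test' scoring above (toy, hypothetical data):
# when the test set is multiplied into n-gram sequences, only the prediction
# for the final token of each sequence is real, so scoring keeps seq[-1].
#
# structured_hyp = [[1, 1, 0], [1, 0]]   # model output for two multiplied sequences
# y_test         = [[1, 1, 1], [1, 0]]   # reference tags
# hyp = [seq[-1] for seq in structured_hyp]   # -> [0, 0]
# ref = [seq[-1] for seq in y_test]           # -> [1, 0]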
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training: data from all training generators is concatenated per key
    train_data_generators = build_objects(config['datasets']['training'])
    train_data = {}
    for gen in train_data_generators:
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]]))

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    borders = config['borders'] if 'borders' in config else False
    if 'multiply_data_train' not in config:
        pass
    elif config['multiply_data_train'] == 'ngrams':
        train_data = multiply_data_ngrams(train_data, borders=borders)
    elif config['multiply_data_train'] == '1ton':
        train_data = multiply_data(train_data, borders=borders)
    elif config['multiply_data_train'] == 'duplicate':
        train_data = multiply_data_base(train_data)
    elif config['multiply_data_train'] == 'all':
        train_data = multiply_data_all(train_data, borders=borders)
    else:
        print("Unknown 'multiply data train' value: {}".format(config['multiply_data_train']))
    logger.info("Extended train representations: {}".format(len(train_data['target'])))
    logger.info("Simple test representations: {}".format(len(test_data['target'])))
    if 'multiply_data_test' not in config:
        pass
    elif config['multiply_data_test'] == 'ngrams':
        test_data = multiply_data_ngrams(test_data, borders=borders)
    elif config['multiply_data_test'] == '1ton':
        test_data = multiply_data(test_data, borders=borders)
    else:
        print("Unknown 'multiply data test' value: {}".format(config['multiply_data_test']))
    logger.info("Extended test representations: {}".format(len(test_data['target'])))

    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'
    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential')

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        features_num = len(feature_names)
        true_features_num = len(all_values[0])

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)
        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [
            {'name': 'test', 'features': test_features, 'tags': test_tags},
            {'name': 'train', 'features': train_features, 'tags': train_tags},
        ]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        if 'persist_dir' in config['features']:
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir,
                             feature_names=feature_names, tags=dataset_obj['tags'],
                             file_format=persist_format)

    # BEGIN LEARNING
    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np

    tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1}
    # when the test data was multiplied into n-gram sequences, only the last
    # word of every sequence carries a real prediction; hoisted here so that
    # both branches below can use it
    long_test = 'multiply_data_test' in config and config['multiply_data_test'] in ('ngrams', '1ton')

    if data_type == 'sequential':
        logger.info('training sequential model...')
        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)
        # end pystruct

        # only the last word in every sequence should be counted
        flattened_hyp = []
        flattened_ref = []
        if long_test:
            for idx, seq in enumerate(structured_hyp):
                flattened_hyp.append(seq[-1])
                flattened_ref.append(y_test[idx][-1])
        else:
            flattened_hyp = flatten(structured_hyp)
            flattened_ref = flatten(y_test)

        logger.info('scoring sequential model...')
        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
        print("Ref, hyp: {} {}".format(len(flattened_ref), len(flattened_hyp)))
        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))
        print(f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None))
        logger.info("Sequence correlation: ")
        print(sequence_correlation_weighted(y_test, structured_hyp, verbose=True)[1])
    else:
        # data_type is 'token' or 'plain'
        train_tags = [tag_map[tag] for tag in train_tags]
        test_tags = [tag_map[tag] for tag in test_tags]

        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)

        # regroup the flat predictions into the original test sequences
        cnt = 0
        test_predictions_seq = []
        test_tags_seq_num = []
        for idx, seq in enumerate(test_tags_seq):
            test_predictions_seq.append([])
            test_tags_seq_num.append([])
            for w in seq:
                test_predictions_seq[-1].append(tag_map[test_predictions[cnt]])
                test_tags_seq_num[-1].append(tag_map[w])
                cnt += 1
        if long_test:
            cnt = -1
            new_predictions = []
            new_true = []
            for seq in test_tags_seq:
                cnt += len(seq)
                new_predictions.append(tag_map[test_predictions[cnt]])
                new_true.append(tag_map[seq[-1]])
            test_predictions = new_predictions
            test_tags = new_true
        # sklearn metrics expect (y_true, y_pred)
        print(f1_score(test_tags, test_predictions, average=None))
        print(f1_score(test_tags, test_predictions, average='weighted', pos_label=None))
        print("Precision: {}, recall: {}".format(
            precision_score(test_tags, test_predictions, average=None),
            recall_score(test_tags, test_predictions, average=None)))
        logger.info("Sequence correlation: ")
        print(sequence_correlation_weighted(test_tags_seq_num, test_predictions_seq, verbose=True)[1])
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training: data from all training generators is concatenated per key
    train_data_generators = build_objects(config['datasets']['training'])
    train_data = {}
    for gen in train_data_generators:
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'
    test_contexts = create_contexts(test_data, data_type=data_type)
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    # make sure the test_context and train_context keys are in sync
    # TODO: this is important when we are learning token-level classifiers
    # experiment_utils.sync_keys(train_contexts, test_contexts)
    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    # assert set(test_contexts.keys()) == set(train_contexts.keys())
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # {'token': <token>, 'index': <idx>, 'source': [<source toks>], 'target': [<target toks>], 'tag': <tag>}
    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)
        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [
            {'name': 'test', 'features': test_features, 'tags': test_tags},
            {'name': 'train', 'features': train_features, 'tags': train_tags},
        ]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        if 'persist_dir' in config['features']:
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir,
                             feature_names=feature_names, tags=dataset_obj['tags'],
                             file_format=persist_format)

    # TODO: we should only learn and evaluate the model if this is what the user wants
    # TODO: we should be able to dump the features for each of the user's datasets to a file specified by the user

    # BEGIN LEARNING
    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score
    import numpy as np

    if data_type == 'sequential':
        logger.info('training sequential model...')
        # TODO: move the tag and array conversion code to the utils of this module
        # TODO: check if X and y are in the format we expect
        # TODO: don't hardcode the dictionary
        tag_map = {u'OK': 1, u'BAD': 0}
        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        # make sure that everything is numpy
        # cast the dataset to numpy arrays (ndarrays)
        # note that these are _NOT_ matrices, because the inner sequences have different lengths
        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # SEQLEARN
        # from seqlearn.perceptron import StructuredPerceptron
        #
        # # seqlearn needs a flat list of instances
        # x_train = np.array([i for seq in x_train for i in seq])
        # y_train = np.array([i for seq in y_train for i in seq])
        # x_test = np.array([i for seq in x_test for i in seq])
        # y_test = np.array([i for seq in y_test for i in seq])
        #
        # # seqlearn requires the lengths of each sequence
        # lengths_train = [len(seq) for seq in train_features]
        # lengths_test = [len(seq) for seq in test_features]
        #
        # clf = StructuredPerceptron(verbose=True, max_iter=400)
        # clf.fit(x_train, y_train, lengths_train)
        #
        # structured_predictions = clf.predict(x_test, lengths_test)
        # logger.info('f1 from seqlearn: {}'.format(f1_score(y_test, structured_predictions, average=None)))
        # END SEQLEARN

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)

        logger.info('scoring sequential model...')
        # TODO: implement this in the config
        # classifier_type = import_class(config['learning']['classifier']['module'])
        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
        flattened_hyp = flatten(structured_hyp)
        # end pystruct

        test_predictions = flattened_hyp
        flattened_ref = flatten(y_test)
        test_tags = flattened_ref
        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))
    else:
        # data_type is 'token' or 'plain'
        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)
        # TODO: this section only works for 'plain'
        print(f1_score(test_tags, test_predictions, average=None))

    # EVALUATION
    logger.info('evaluating your results')
    # TODO: remove the hard coding of the tags here
    bad_count = sum(1 for t in test_tags if t == u'BAD' or t == 0)
    good_count = sum(1 for t in test_tags if t == u'OK' or t == 1)
    total = len(test_tags)
    assert total == bad_count + good_count, 'tag counts should be correct'
    # float cast avoids Python 2 integer division
    percent_good = float(good_count) / total
    logger.info('percent good in test set: {}'.format(percent_good))
    logger.info('percent bad in test set: {}'.format(1 - percent_good))

    random_class_results = []
    random_weighted_results = []
    for i in range(20):
        # the class probabilities must be passed as the 'p' keyword of np.random.choice
        random_tags = list(np.random.choice([1, 0], total, p=[percent_good, 1 - percent_good]))
        random_class_f1 = f1_score(test_tags, random_tags, average=None)
        random_class_results.append(random_class_f1)
        logger.info('two class f1 random score ({}): {}'.format(i, random_class_f1))
        random_average_f1 = weighted_fmeasure(test_tags, random_tags)
        random_weighted_results.append(random_average_f1)

    avg_random_class = np.average(random_class_results, axis=0)
    avg_weighted = np.average(random_weighted_results)
    logger.info('two class f1 random average score: {}'.format(avg_random_class))
    logger.info('weighted f1 random average score: {}'.format(avg_weighted))

    actual_class_f1 = f1_score(test_tags, test_predictions, average=None)
    actual_average_f1 = weighted_fmeasure(test_tags, test_predictions)
    logger.info('two class f1 ACTUAL SCORE: {}'.format(actual_class_f1))
    logger.info('weighted f1 ACTUAL SCORE: {}'.format(actual_average_f1))

    if data_type == 'token':
        f1_map = {}
        for token, predicted in test_predictions.iteritems():
            logger.info("Evaluating results for token = " + token)
            actual = test_tags[token]  # per-token reference tags
            f1_map[token] = weighted_fmeasure(actual, predicted)
        logger.info('Printing the map of f1 scores by token: ')
        print(f1_map)
    elif data_type == 'plain':
        f1 = weighted_fmeasure(test_tags, test_predictions)
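# Sketch of the random baseline above with toy numbers (hypothetical values):
# a tagger that draws OK with p = percent_good, independently of the truth,
# has expected per-class precision equal to the class prior, so the ACTUAL
# weighted f1 must beat this baseline to show the model learned anything.
#
# import numpy as np
# test_tags = [1] * 80 + [0] * 20                               # percent_good = 0.8
# random_tags = list(np.random.choice([1, 0], 100, p=[0.8, 0.2]))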