def _censor_sparse_vectors_gen(vecs, idxs):
    for vec in vecs:
        new_vec = hashabledict()
        for idx in vec:
            if idx not in idxs:
                new_vec[idx] = vec[idx]
        yield new_vec
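# Hedged usage sketch (not part of the original pipeline); the feature
# indices and values below are made up for illustration. Censoring simply
# drops the given feature indices from each sparse vector.
def _demo_censor_sparse_vectors():
    vecs = [{1: 0.5, 7: 1.0, 42: 0.25}, {7: 2.0}]
    censored = list(_censor_sparse_vectors_gen(vecs, set((7, 42))))
    assert censored == [{1: 0.5}, {}]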
def _learning_curve_test_data_set(classifiers, train, test, worker_pool,
        verbose=False, no_simstring_cache=False, use_test_set=False,
        folds=10, min_perc=5, max_perc=100, step_perc=5, it_factor=1):

    # XXX: Not necessary any more!
    if verbose:
        print >> stderr, 'Calculating train set size...',
    train_size = 0
    for d in train:
        for s in d:
            for a in s:
                train_size += 1
    if verbose:
        print >> stderr, 'Done!'

    # XXX:
    if not no_simstring_cache:
        simstring_caching(classifiers, (train, test), verbose=verbose)

    # Collect the seen types to iterate over later
    seen_types = set()
    results_by_classifier = {}

    for classifier_id, classifier_class in classifiers.iteritems():
        if verbose:
            print >> stderr, 'Classifier:', classifier_id, '...',
        from classifier.liblinear import hashabledict

        classifier = classifier_class()
        if verbose:
            print >> stderr, 'featurising train:', '...',
        train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
        train_set = [e for e in izip(train_lbls, train_vecs)]
        assert len(train_lbls) == train_size, '{} != {}'.format(
                len(train_lbls), train_size)
        assert len(train_vecs) == train_size, '{} != {}'.format(
                len(train_vecs), train_size)
        assert len(train_set) == train_size, '{} != {}'.format(
                len(train_set), train_size)
        del train_lbls
        del train_vecs
        if verbose:
            print >> stderr, 'Done!',
            print >> stderr, 'featurising test', '...',
        test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
        test_vecs = [hashabledict(d) for d in test_vecs]
        if verbose:
            print >> stderr, 'Done!',

        # Fix the seed so that we get comparable folds
        seed(0xd5347d33)

        args = ((classifier, fold) for fold in _train_fold_gen(
                train_set, min_perc, max_perc, step_perc, it_factor))

        if worker_pool is None:
            res_it = (_train_fold(*arg) for arg in args)
        else:
            res_it = worker_pool.imap(__train_fold, args)

        classifier_results = defaultdict(list)

        print >> stderr, 'Training and evaluating models: ...',
        i = 0
        for sample_size, fold_classifier in res_it:
            score, new_score = _score_classifier(fold_classifier, test_lbls,
                    test_vecs)
            classifier_results[sample_size].append((score, new_score))
            i += 1
            if i % 10 == 0:
                print >> stderr, i, '...',
        print >> stderr, 'Done!'

        # Process the results
        for sample_size in sorted(classifier_results):
            results = classifier_results[sample_size]
            scores = [score for score, _ in results]
            new_scores = [new_score for _, new_score in results]

            macro_scores = [ms for ms, _, _, _, _ in scores]
            micro_scores = [ms for _, ms, _, _, _ in scores]
            tps = [tp for _, _, tp, _, _ in scores]
            fns = [fn for _, _, _, fn, _ in scores]
            res_dics = [d for _, _, _, _, d in scores]

            # New metrics
            ranks = [mean(rs) for rs, _, _ in new_scores]
            ambiguities = [mean(ambs) for _, ambs, _ in new_scores]
            recalls = [r for _, _, r in new_scores]

            # These are means of means
            ranks_mean = mean(ranks)
            ranks_stddev = stddev(ranks)
            ambiguities_mean = mean(ambiguities)
            ambiguities_stddev = stddev(ambiguities)
            recalls_mean = mean(recalls)
            recalls_stddev = stddev(recalls)

            classifier_result = (
                    mean(macro_scores), stddev(macro_scores),
                    mean(micro_scores), stddev(micro_scores),
                    mean(tps), stddev(tps),
                    mean(fns), stddev(fns),
                    res_dics,
                    # New metrics
                    ranks_mean, ranks_stddev,
                    ambiguities_mean, ambiguities_stddev,
                    recalls_mean, recalls_stddev,
                    )

            classifier_results[sample_size] = classifier_result

            if verbose:
                res_str = ('Results {size}: '
                        'MACRO: {0:.3f} MACRO_STDDEV: {1:.3f} '
                        'MICRO: {2:.3f} MICRO_STDDEV: {3:.3f} '
                        'TP: {4:.3f} TP_STDDEV: {5:.3f} '
                        'FN: {6:.3f} FN_STDDEV: {7:.3f} '
                        'MEAN_RANK: {mean_rank:.3f} MEAN_RANK_STDDEV: {mean_rank_stddev:.3f} '
                        'AVG_AMB: {avg_amb:.3f} AVG_AMB_STDDEV: {avg_amb_stddev:.3f} '
                        'RECALL: {recall:.3f} RECALL_STDDEV: {recall_stddev:.3f}'
                        ).format(*classifier_result, size=sample_size,
                                mean_rank=ranks_mean,
                                mean_rank_stddev=ranks_stddev,
                                avg_amb=ambiguities_mean,
                                avg_amb_stddev=ambiguities_stddev,
                                recall=recalls_mean,
                                recall_stddev=recalls_stddev)
                print res_str

        results_by_classifier[classifier_id] = classifier_results
    return results_by_classifier
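# Hedged sketch (an assumed consumer, not original code): the mapping
# returned above nests classifier id -> sample size -> the classifier_result
# tuple built in _learning_curve_test_data_set, so learning-curve points can
# be dumped for plotting like this.
def _demo_dump_learning_curve(results_by_classifier):
    for classifier_id, by_size in results_by_classifier.iteritems():
        for sample_size in sorted(by_size):
            macro_mean, macro_stddev = by_size[sample_size][:2]
            print '{}\t{}\t{:.3f}\t{:.3f}'.format(classifier_id,
                    sample_size, macro_mean, macro_stddev)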
def _lexical_descent(classifiers, datasets, outdir, verbose=False,
        worker_pool=None, no_simstring_cache=False, use_test_set=False):

    # Check that we can in fact do a lexical descent for the classifier
    for classifier_name in classifiers:
        assert ('SIMSTRING' in classifier_name
                or 'TSURUOKA' in classifier_name
                or 'GAZETTER' in classifier_name)

    for classifier_name, classifier_class in classifiers.iteritems():
        print 'Classifier:', classifier_name
        classifier = classifier_class()

        for dataset_name, dataset_getter in datasets.iteritems():
            print 'Dataset:', dataset_name
            if verbose:
                print >> stderr, 'Reading data...',
            train_set, dev_set, test_set = dataset_getter()
            if use_test_set:
                train, test = list(chain(train_set, dev_set)), list(test_set)
            else:
                train, test = list(train_set), list(dev_set)
            del train_set, dev_set, test_set
            if verbose:
                print >> stderr, 'Done!'

            if not no_simstring_cache:
                simstring_caching((classifier_name, ), (train, test, ),
                        verbose=verbose)

            train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
            test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
            train_vecs = [hashabledict(d) for d in train_vecs]
            test_vecs = [hashabledict(d) for d in test_vecs]
            train_uncensored_vecs = deepcopy(train_vecs)

            # Generate the folds for all iterations
            folds = [f for f in _k_folds(5,
                    set(izip(train_lbls, train_vecs)))] #XXX: Constant

            # XXX: This is an ugly hack and bound to break:
            # Locate which vector indices are used by SimString features,
            # and by which feature
            from classifier.simstring.features import SIMSTRING_FEATURES
            sf_ids = [f().get_id() for f in SIMSTRING_FEATURES]

            vec_idxs_by_feat_id = defaultdict(set)
            for sf_id in sf_ids:
                for f_id in classifier.vec_index_by_feature_id:
                    # NOTE: Not 100% safe check, could match by accident
                    if sf_id in f_id:
                        vec_idxs_by_feat_id[sf_id].add(
                                classifier.vec_index_by_feature_id[f_id])

            # Which ones never fired?
            i = 0
            for i, sf_id in enumerate((id for id in sf_ids
                    if id not in vec_idxs_by_feat_id), start=1):
                print sf_id, 'never fired'
            else:
                print '{} SimString feature(s) never fired'.format(i)

            res_dic = defaultdict(lambda : defaultdict(lambda : '-'))

            # Iteratively find the best candidate to knock out
            to_evaluate = set((f_id for f_id in vec_idxs_by_feat_id))
            removed = set()
            iteration = 1
            last_macro_score = None
            while to_evaluate:
                print 'Iteration:', iteration

                print 'Censoring vectors...',
                # Censor everything we have removed so far
                idxs_to_censor = set(i for i in chain(
                        *(vec_idxs_by_feat_id[f_id] for f_id in removed)))
                train_vecs = [d for d in _censor_sparse_vectors_gen(
                        train_vecs, idxs_to_censor)]
                train_data = set(izip(train_lbls, train_vecs))

                train_folds = []
                for fold in folds:
                    f_lbls = (l for l, _ in fold)
                    f_vecs = (d for d in _censor_sparse_vectors_gen(
                            (v for _, v in fold), idxs_to_censor))
                    train_folds.append(set(izip(f_lbls, f_vecs)))
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                classifier._liblinear_train(train_lbls, train_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                        test_vecs, idxs_to_censor)]
                curr_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]
                print 'Current state on test is: {}'.format(curr_macro_score)
                if last_macro_score is not None:
                    print 'Last state was: {} (diff: {})'.format(
                            last_macro_score,
                            curr_macro_score - last_macro_score)
                last_macro_score = curr_macro_score

                # Prepare to go parallel
                f_args = ((f_id, classifier, train_data, train_folds,
                        to_censor)
                        for f_id, to_censor in vec_idxs_by_feat_id.iteritems()
                        if f_id in to_evaluate)
                # Also cram our non-censored baseline in there
                f_args = chain(((None, classifier, train_data, train_folds,
                        set()), ), f_args)

                score_by_knockout = {}
                print 'Evaluating knockouts ({} in total)'.format(
                        len(to_evaluate) + 1)
                # TODO: A bit redundant, prettify!
                if worker_pool is not None:
                    i = 1
                    for f_id, mean_score in worker_pool.imap_unordered(
                            __knockout_pass, f_args):
                        score_by_knockout[f_id] = mean_score
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean_score)
                        i += 1
                else:
                    for i, args in enumerate(f_args, start=1):
                        f_id, mean_score = _knockout_pass(*args)
                        score_by_knockout[f_id] = mean_score
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean_score)

                # Set the result dictionary
                for f_id, mean_score in score_by_knockout.iteritems():
                    res_dic[str(iteration)][f_id] = mean_score

                # And write the results incrementally for each round
                with open(join_path(outdir, 'descent_{}_{}.md'.format(
                        classifier_name, dataset_name)), 'w') as md_file:
                    from md import dict_to_table
                    md_file.write(dict_to_table(res_dic, total=False,
                            perc=False))
                    md_file.write('\n')

                # Find the best scoring one...
                scores = [(s, f_id)
                        for f_id, s in score_by_knockout.iteritems()]
                scores.sort(reverse=True)
                best_score, best_f_id = scores[0]

                print 'Round winner: {} with {}'.format(best_f_id, best_score)
                if best_f_id is None:
                    # We are done, no removal gave a better score
                    break

                removed.add(best_f_id)
                to_evaluate.remove(best_f_id)
                iteration += 1

            if removed:
                # TODO: Could do more metrics here?
                print 'Training and evaluating a model of our previous state...',
                classifier._liblinear_train(train_lbls, train_uncensored_vecs)
                before_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_vecs))[0]
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                idxs_removed = set(i for i in chain(
                        *(vec_idxs_by_feat_id[f_id] for f_id in removed)))
                train_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                        train_vecs, idxs_removed)]
                classifier._liblinear_train(train_lbls, train_censored_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                        test_vecs, idxs_removed)]
                after_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                res_str = 'Before: {} After: {}'.format(before_macro_score,
                        after_macro_score)
                print res_str
                print 'Happy?'
            else:
                res_str = ('Unable to remove any lexical resource '
                        'to make improvements...')
                print res_str

            # Ugly but saves the final result safely
            with open(join_path(outdir, 'descent_{}_{}.txt'.format(
                    classifier_name, dataset_name)), 'w') as res_file:
                res_file.write(res_str)
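# Hedged, self-contained sketch of the round-winner selection above: the
# knockout with the highest mean cross-fold score wins, and a win for the
# no-knockout baseline (keyed by None) terminates the descent. The scores
# below are made up for illustration.
def _demo_round_winner():
    score_by_knockout = {None: 0.71, 'GAZETTER': 0.69, 'SIMSTRING': 0.73}
    scores = [(s, f_id) for f_id, s in score_by_knockout.iteritems()]
    scores.sort(reverse=True)
    best_score, best_f_id = scores[0]
    assert (best_score, best_f_id) == (0.73, 'SIMSTRING')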