# External dependencies used below.  Project-internal modules and helpers
# (pipe, prepare_earmark_data, util.configuration, diagnostics,
# string_functions, GeoCoder, SponsorCoder, EarmarkDetector, CONN_STRING,
# save_model, do_grid_search, error_analysis_for_labeling, label_all) are
# assumed to be importable from the surrounding package.
import argparse
import logging
import multiprocessing as mp
import os
import pickle
from time import time

import matplotlib.pyplot as plt
import numpy as np
import psycopg2
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib  # plain `import joblib` on newer scikit-learn
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.metrics import auc, classification_report, roc_auc_score, roc_curve


def do_feature_set_analysis(train_instances, test_instances, folds, clf,
                            param_grid, dense, outfile):
    groups = set(train_instances[0].feature_groups.keys()).intersection(
        test_instances[0].feature_groups.keys())
    print groups

    # Baseline: train and evaluate on all shared feature groups.
    X_train, y_train, feature_space = pipe.instances_to_matrix(
        train_instances, dense=dense, groups=groups)
    X_test, y_test = test_instances_to_matrix(feature_space, test_instances,
                                              dense=dense)
    model = get_optimal_model(X_train, y_train, folds, clf, param_grid,
                              'roc_auc')
    y_pred = model.predict(X_test)
    scores = get_scores(model, X_test)
    print("Test ROC: %f" % roc_auc_score(y_test, scores))
    print(classification_report(y_test, y_pred))

    fpr, tpr, thresholds = roc_curve(y_test, scores)
    np.set_printoptions(threshold=np.inf)
    for i in range(1, len(fpr), 100):
        print "Threshold: %0.4f FPR: %0.4f TPR: %0.4f" % (thresholds[i],
                                                          fpr[i], tpr[i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ALL (area = %0.4f)' % roc_auc)

    # One ROC curve per individual feature group.
    for g in groups:
        print g
        X_train, y_train, feature_space = pipe.instances_to_matrix(
            train_instances, groups=[g], dense=dense)
        X_test, y_test = test_instances_to_matrix(feature_space,
                                                  test_instances, dense=dense)
        model = get_optimal_model(X_train, y_train, folds, clf, param_grid,
                                  'roc_auc')
        y_pred = model.predict(X_test)
        scores = get_scores(model, X_test)
        print("Test ROC: %f" % roc_auc_score(y_test, scores))
        print(classification_report(y_test, y_pred))
        fpr, tpr, thresholds = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='%s (area = %0.4f)' % (g.split("_")[0], roc_auc))
        print "\n" * 4

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Feature Set Analysis')
    plt.legend(loc="lower right", prop={'size': 12})
    plt.savefig(outfile)
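# --- Hedged sketch (not part of the original listing) ------------------------
# get_optimal_model and get_scores are called throughout the diagnostics above
# but are not defined here.  The minimal versions below assume get_optimal_model
# is a thin wrapper around a cross-validated grid search that refits the best
# estimator, and that get_scores returns a continuous score usable by roc_curve;
# the real helpers may differ.
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search on older releases


def get_optimal_model(X, y, folds, clf, param_grid, scoring):
    # Exhaustive search over param_grid, scored by `scoring` across `folds`
    # CV folds; the best estimator is refit on all of (X, y).
    grid = GridSearchCV(clf, param_grid, scoring=scoring, cv=folds, refit=True)
    grid.fit(X, y)
    print "Best params: %s (CV %s = %0.4f)" % (grid.best_params_, scoring,
                                               grid.best_score_)
    return grid.best_estimator_


def get_scores(model, X):
    # LinearSVC exposes decision_function; RandomForestClassifier exposes
    # predict_proba, whose positive-class column serves as a ranking score.
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return model.predict_proba(X)[:, 1]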
def do_feature_selection(train_instances, test_instances, folds, clf,
                         param_grid, dense, outfile):
    groups = set(train_instances[0].feature_groups.keys()).intersection(
        test_instances[0].feature_groups.keys())
    X_train, y_train, feature_space = pipe.instances_to_matrix(
        train_instances, dense=dense, groups=groups)
    X_test, y_test = test_instances_to_matrix(feature_space, test_instances,
                                              dense=dense)

    # Rank features by chi2 p-value and print the 200 strongest.
    (chi2values, pval) = chi2(X_train, y_train)
    feature_indices = [i[0] for i in sorted(enumerate(pval), key=lambda x: x[1])]
    index_to_name = {v: k for k, v in feature_space.items()}
    feature_names = [index_to_name[i] for i in feature_indices]
    print feature_indices[0:200]
    print feature_names[0:200]

    # One ROC curve per chi2 percentile of features kept.
    for percentile in range(1, 10, 2):
        t0 = time()
        ch2 = SelectPercentile(chi2, percentile=percentile)
        X_train_trans = ch2.fit_transform(X_train, y_train)
        print("done in %fs" % (time() - t0))
        model = get_optimal_model(X_train_trans, y_train, folds, clf,
                                  param_grid, 'roc_auc')
        X_test_trans = ch2.transform(X_test)
        scores = get_scores(model, X_test_trans)
        fpr, tpr, thresholds = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='%d (area = %0.4f)' % (percentile, roc_auc))
        print "\n" * 4

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Feature Selection')
    plt.legend(loc="lower right")
    plt.savefig(outfile)
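# --- Hedged sketch (not part of the original listing) ------------------------
# test_instances_to_matrix is used above to project held-out instances onto a
# feature space fixed on the training data, silently dropping features unseen
# during training.  The sketch assumes each instance stores its features as a
# {name: value} dict per feature group and its label in instance.label; the
# real pipe/instance API may differ.
from scipy.sparse import lil_matrix


def test_instances_to_matrix(feature_space, instances, dense=False):
    n, d = len(instances), len(feature_space)
    X = np.zeros((n, d)) if dense else lil_matrix((n, d))
    y = np.zeros(n)
    for i, instance in enumerate(instances):
        y[i] = instance.label
        for group in instance.feature_groups.values():
            for name, value in group.items():
                col = feature_space.get(name)
                if col is not None:  # drop features not in the training space
                    X[i, col] = value
    if not dense:
        X = X.tocsr()
    return X, y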
def label_row(self, row, column_indices, table_offset, congress, chamber,
              document_type, number, sponsor_indices):
    instance = self.get_instance_from_row(row, column_indices)
    X, y, space = pipe.instances_to_matrix([instance],
                                           feature_space=self.feature_space,
                                           dense=False)
    scores = self.model.decision_function(X)
    fields = ['congress', 'chamber', 'document_type', 'number', 'row',
              'row_offset', 'row_length', 'score', 'state', 'sponsors']
    cmd = ("insert into candidate_earmarks (" + ", ".join(fields) +
           ") values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) returning id")
    attributes = instance.attributes
    state = self.geo_coder.get_state(attributes['entity_text'])
    cur = self.conn.cursor()

    # Resolve sponsor names from the pipe-delimited row cells, keeping only
    # tokens known to the sponsor coder for this congress.
    sponsors = []
    if sponsor_indices:
        print sponsor_indices
        for index in sponsor_indices:
            try:
                sponsor_cell = attributes['entity_text'].split("|")[index]
                sponsors_in_cell = string_functions.tokenize(
                    string_functions.normalize_no_lower(sponsor_cell))
                for sic in sponsors_in_cell:
                    if sic in self.sponsor_coder.sponsors[congress]:
                        sponsors.append(sic)
            except Exception:
                print "Index: %d" % index
                print len(attributes['entity_text'].split("|"))
                print attributes['entity_text']
                logging.exception("Failed to extract sponsors from cell")

    sponsors_string = "|".join(sponsors)[:1024]
    cur.execute(cmd, (congress, chamber, document_type, number,
                      attributes['entity_text'], row.offset + table_offset,
                      row.length, scores[0], state, sponsors_string))
    curr_id = cur.fetchone()[0]
    for sponsor in sponsors:
        cur.execute(
            'insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)',
            (curr_id, sponsor))
    self.conn.commit()
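# --- Hedged sketch (not part of the original listing) ------------------------
# label_row assumes two Postgres tables exist.  The column names come from the
# INSERT statements above; the types, and the serial primary key implied by
# "returning id", are guesses and may differ from the real schema.
CREATE_CANDIDATE_TABLES_SQL = """
create table if not exists candidate_earmarks (
    id             serial primary key,
    congress       text,
    chamber        text,
    document_type  text,
    number         text,
    row            text,
    row_offset     integer,
    row_length     integer,
    score          double precision,
    state          text,
    sponsors       varchar(1024)
);
create table if not exists sponsors (
    id                    serial primary key,
    candidate_earmark_id  integer references candidate_earmarks(id),
    sponsor               text
);
"""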
def main():
    parser = argparse.ArgumentParser(description='Match entities to OMB')
    parser.add_argument('--model', required=False,
                        help='path to pickled matching model')
    parser.add_argument('--data', required=False,
                        help='path to pickled instances')
    args = parser.parse_args()

    # Relative paths here: a leading '/' would make os.path.join discard the
    # configured base directory.
    bills2008 = os.path.join(util.configuration.get_path_to_bills(),
                             "110/bills/hr/hr2764/text-versions/")
    bills2009 = os.path.join(util.configuration.get_path_to_bills(),
                             "111/bills/hr/hr1105/text-versions/")
    years = ["111", "110", "109", "108", "107", "106", "105", "104"]
    reports_base = util.configuration.get_path_to_reports()
    folders = ([os.path.join(reports_base, year) for year in years] +
               [bills2008, bills2009])

    conn = psycopg2.connect(CONN_STRING)

    if args.model:
        feature_space = pickle.load(open(args.model + ".feature_space", "rb"))
        model = joblib.load(args.model)
        logging.info("Loaded Model")
    elif args.data:
        keep_group = ['unigram_feature_generator',
                      'simple_entity_text_feature_generator',
                      'geo_feature_generator', 'sponsor_feature_generator']
        instances = prepare_earmark_data.load_instances(args.data)
        ignore_groups = [fg for fg in instances[0].feature_groups.keys()
                         if fg not in keep_group]
        X, y, feature_space = pipe.instances_to_matrix(
            instances, ignore_groups=ignore_groups, dense=False)
        clf = svm.LinearSVC(C=0.01)
        param_grid = {'C': [0.01, 0.1]}
        model = diagnostics.get_optimal_model(X, y, 5, clf, param_grid,
                                              'roc_auc')
    else:
        exit()

    geo_coder = GeoCoder()
    sponsor_coder = SponsorCoder()
    earmark_detector = EarmarkDetector(geo_coder, sponsor_coder, conn, model,
                                       feature_space)
    p = mp.Pool(mp.cpu_count())
    p.map(label_all, [(folder, earmark_detector) for folder in folders])
    #for folder in folders:
    #    label_all((folder, earmark_detector))
    conn.close()
def main():
    parser = argparse.ArgumentParser(description='build classifier')
    parser.add_argument('--train', required=True,
                        help='file to pickled training instances')
    parser.add_argument('--test', required=False,
                        help='file to pickled test instances')
    parser.add_argument('--folds', required=False, type=int, default=5,
                        help='number of folds for cv')
    parser.add_argument('--alg', required=True,
                        help="'rf' for RandomForest, 'svm' for LinearSVC")
    subparsers = parser.add_subparsers(dest="subparser_name",
                                       help='sub-command help')
    parser_grid = subparsers.add_parser('grid', help='tune hyper-parameters')
    parser_grid.add_argument('--scoring', required=True)
    parser_save = subparsers.add_parser('save', help='train and save a model')
    parser_save.add_argument('--scoring', required=True)
    parser_save.add_argument('--outfile', required=True)
    parser_error = subparsers.add_parser('error', help='do error analysis')
    parser_features = subparsers.add_parser('features',
                                            help='do feature analysis')
    parser_features.add_argument('--outfile', required=True)
    parser_relabel = subparsers.add_parser('relabel',
                                           help='relabel training instances')
    args = parser.parse_args()

    print "Doing %s" % args.subparser_name
    print "Train: %s" % args.train
    print "Test: %s" % args.test

    if args.alg == 'svm':
        clf = svm.LinearSVC(C=0.01)
        #param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1, 4, 10, 100]}
        param_grid = {'C': [0.01, 0.1]}
        dense = False
    else:
        clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                     random_state=0, max_features='log2',
                                     n_jobs=-1)
        param_grid = {'n_estimators': [10, 30, 50, 100, 300, 500],
                      'max_features': ['log2', 'sqrt']}
        dense = True

    if args.subparser_name == "save":
        groups = ['unigram_feature_generator',
                  'simple_entity_text_feature_generator',
                  'geo_feature_generator', 'sponsor_feature_generator']
        instances = prepare_earmark_data.load_instances(args.train)
        print instances[0].feature_groups.keys()
        print instances[-1].feature_groups.keys()
        X, y, feature_space = pipe.instances_to_matrix(instances, dense=dense,
                                                       groups=groups)
        save_model(X, y, feature_space, args.folds, clf, param_grid,
                   args.scoring, args.outfile)
    elif args.subparser_name == "error":
        # this does error analysis on training data only!
        instances = prepare_earmark_data.load_instances(args.train)
        X, y, feature_space = pipe.instances_to_matrix(instances, dense=dense)
        error_analysis_for_labeling(instances, X, y, args.folds, clf,
                                    param_grid, args.train)
    elif args.subparser_name == "grid":
        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)
            groups = set(train_instances[0].feature_groups.keys()).intersection(
                test_instances[0].feature_groups.keys())
            # Build matrices for both sets, then restrict both to the feature
            # keys they share.
            X_train, y_train, train_feature_space = pipe.instances_to_matrix(
                train_instances, dense=dense, groups=groups)
            X_test, y_test, test_feature_space = pipe.instances_to_matrix(
                test_instances, dense=dense, groups=groups)
            keys_train = set(train_feature_space.keys())
            keys_test = set(test_feature_space.keys())
            intersection = list(keys_train & keys_test)
            feature_space = {intersection[i]: i
                             for i in range(len(intersection))}
            X_train, y_train = test_instances_to_matrix(feature_space,
                                                        train_instances,
                                                        dense=dense)
            X_test, y_test = test_instances_to_matrix(feature_space,
                                                      test_instances,
                                                      dense=dense)
        else:
            instances = prepare_earmark_data.load_instances(args.train)
            X_train, y_train, feature_space = pipe.instances_to_matrix(
                instances, dense=dense)
            X_test = None
            y_test = None
        do_grid_search(X_train, y_train, args.folds, clf, param_grid,
                       args.scoring, X_test, y_test)
    elif args.subparser_name == "features":
        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)
        else:
            # this is just for exposition, would really want to cv over this
            instances = prepare_earmark_data.load_instances(args.train)
            X, y, feature_space = pipe.instances_to_matrix(instances,
                                                           dense=dense)
            train, test = split_data_stratified(X, y, test_size=0.33)
            train_instances = [instances[i] for i in train]
            test_instances = [instances[i] for i in test]
        #do_feature_selection(train_instances, test_instances, args.folds,
        #                     clf, param_grid, dense, args.outfile)
        do_feature_set_analysis(train_instances, test_instances, args.folds,
                                clf, param_grid, dense, args.outfile)
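# --- Hedged sketch (not part of the original listing) ------------------------
# split_data_stratified is used above to obtain train/test *index* lists for a
# stratified holdout split.  A minimal version, assuming the newer
# sklearn.model_selection API (sklearn.cross_validation on older releases):
from sklearn.model_selection import StratifiedShuffleSplit


def split_data_stratified(X, y, test_size=0.33, random_state=0):
    # Single stratified shuffle split; returns index arrays into the original
    # instance list, preserving the class balance of y in both partitions.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                      random_state=random_state)
    train_idx, test_idx = next(splitter.split(X, y))
    return train_idx, test_idx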