def main():
    """Label earmarks across report/bill folders with a matching model.

    Either loads a pickled model (--model) or trains a LinearSVC from
    pickled instances (--data), builds an EarmarkDetector, and runs
    label_all over every folder in a multiprocessing pool.
    """
    parser = argparse.ArgumentParser(description='Match entities to OMB')
    parser.add_argument('--model', required=False,
                        help='path to pickeld matching model')
    parser.add_argument('--data', required=False,
                        help='path to pickeld instances')
    args = parser.parse_args()

    # BUG FIX: os.path.join discards all components preceding an absolute
    # path, so the old leading "/" on the second argument silently dropped
    # the configured bills base directory and produced "/110/..." etc.
    bills2008 = os.path.join(util.configuration.get_path_to_bills(),
                             "110/bills/hr/hr2764/text-versions/")
    bills2009 = os.path.join(util.configuration.get_path_to_bills(),
                             "111/bills/hr/hr1105/text-versions/")
    years = ["111", "110", "109", "108", "107", "106", "105", "104"]
    reports_base = util.configuration.get_path_to_reports()
    folders = [os.path.join(reports_base, year) for year in years] \
        + [bills2008, bills2009]

    conn = psycopg2.connect(CONN_STRING)
    if args.model:
        # Close the feature-space file deterministically instead of leaking
        # the handle returned by the bare open() call.
        with open(args.model + ".feature_space", "rb") as fs_file:
            feature_space = pickle.load(fs_file)
        model = joblib.load(args.model)
        logging.info("Loaded Model")
    elif args.data:
        # Only these feature groups participate in matching; everything
        # else present on the instances is ignored below.
        keep_group = ['unigram_feature_generator',
                      'simple_entity_text_feature_generator',
                      'geo_feature_generator',
                      'sponsor_feature_generator']
        instances = prepare_earmark_data.load_instances(args.data)
        ignore_groups = [fg for fg in instances[0].feature_groups.keys()
                         if fg not in keep_group]
        X, y, feature_space = pipe.instances_to_matrix(
            instances, ignore_groups=ignore_groups, dense=False)
        clf = svm.LinearSVC(C=0.01)
        param_grid = {'C': [0.01, 0.1]}
        model = diagnostics.get_optimal_model(X, y, 5, clf, param_grid,
                                              'roc_auc')
    else:
        # Neither --model nor --data given: nothing to do.
        exit()

    geo_coder = GeoCoder()
    sponsor_coder = SponsorCoder()
    earmark_detector = EarmarkDetector(geo_coder, sponsor_coder, conn,
                                       model, feature_space)
    # NOTE(review): each pool task receives the detector (which holds the DB
    # connection) — assumes it pickles cleanly; confirm against label_all.
    p = mp.Pool(mp.cpu_count())
    p.map(label_all, [(folder, earmark_detector) for folder in folders])
    #for folder in folders:
    #    label_all((folder, earmark_detector))
    conn.close()
def main():
    """Build and evaluate an earmark classifier.

    Sub-commands:
      save     -- fit a model with CV and persist it to --outfile
      error    -- error analysis on the training data only
      grid     -- hyper-parameter grid search (optionally with held-out test)
      features -- feature-set analysis
      relabel  -- registered but handled elsewhere
    """
    parser = argparse.ArgumentParser(description='build classifier')
    parser.add_argument('--train', required=True,
                        help='file to pickled training instances')
    parser.add_argument('--test', required=False,
                        help='file to pickled test instances')
    parser.add_argument('--folds', required=False, type=int, default=5,
                        help='number of folds for cv')
    parser.add_argument('--alg', required=True,
                        help="'rf' for RandomForest, 'svm' for LinearSVC")
    subparsers = parser.add_subparsers(dest="subparser_name",
                                      help='sub-command help')
    parser_grid = subparsers.add_parser('grid', help='tune hyper-parameters')
    parser_grid.add_argument('--scoring', required=True)
    # BUG FIX: the 'save' and 'relabel' help strings were copy-pasted from
    # other sub-commands; describe what each actually does.
    parser_save = subparsers.add_parser('save', help='fit and save a model')
    parser_save.add_argument('--scoring', required=True)
    parser_save.add_argument('--outfile', required=True)
    subparsers.add_parser('error', help='do error analysis')
    parser_features = subparsers.add_parser('features',
                                            help='do feature analysis')
    parser_features.add_argument('--outfile', required=True)
    subparsers.add_parser('relabel', help='relabel instances')
    args = parser.parse_args()

    # Parenthesized single-argument prints behave identically under
    # Python 2's print statement.
    print("Doing %s" % args.subparser_name)
    print("Train: %s" % args.train)
    print("Test: %s" % args.test)

    if args.alg == 'svm':
        clf = svm.LinearSVC(C=0.01)
        #param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1, 4, 10, 100]}
        param_grid = {'C': [0.01, 0.1]}
        dense = False  # LinearSVC accepts sparse matrices
    else:
        clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                     random_state=0, max_features='log2',
                                     n_jobs=-1)
        param_grid = {'n_estimators': [10, 30, 50, 100, 300, 500],
                      'max_features': ['log2', 'sqrt']}
        dense = True  # RandomForest requires a dense matrix

    if args.subparser_name == "save":
        groups = ['unigram_feature_generator',
                  'simple_entity_text_feature_generator',
                  'geo_feature_generator',
                  'sponsor_feature_generator']
        instances = prepare_earmark_data.load_instances(args.train)
        print(instances[0].feature_groups.keys())
        print(instances[-1].feature_groups.keys())
        X, y, feature_space = pipe.instances_to_matrix(
            instances, dense=dense, groups=groups)
        save_model(X, y, feature_space, args.folds, clf, param_grid,
                   args.scoring, args.outfile)
    elif args.subparser_name == "error":
        # this does error analysis on training data only!
        instances = prepare_earmark_data.load_instances(args.train)
        X, y, feature_space = pipe.instances_to_matrix(instances, dense=dense)
        error_analysis_for_labeling(instances, X, y, args.folds, clf,
                                    param_grid, args.train)
    elif args.subparser_name == "grid":
        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)
            groups = set(
                train_instances[0].feature_groups.keys()).intersection(
                    test_instances[0].feature_groups.keys())
            X_train, y_train, train_feature_space = pipe.instances_to_matrix(
                train_instances, dense=dense, groups=groups)
            # BUG FIX: this previously vectorized train_instances again, so
            # the "test" matrix was actually the training data.
            X_test, y_test, test_feature_space = pipe.instances_to_matrix(
                test_instances, dense=dense, groups=groups)
            keys_train = set(train_feature_space.keys())
            keys_test = set(test_feature_space.keys())
            # BUG FIX: intersect train with *test* keys (was train & train,
            # which is just the train keys).
            intersection = list(keys_train & keys_test)
            # Rebuild a shared feature space over the common features so the
            # train and test matrices have aligned columns.
            feature_space = {feat: i
                             for i, feat in enumerate(intersection)}
            X_train, y_train = test_instances_to_matrix(
                feature_space, train_instances, dense=dense)
            X_test, y_test = test_instances_to_matrix(
                feature_space, test_instances, dense=dense)
        else:
            instances = prepare_earmark_data.load_instances(args.train)
            X_train, y_train, feature_space = pipe.instances_to_matrix(
                instances, dense=dense)
            X_test = None
            y_test = None
        do_grid_search(X_train, y_train, args.folds, clf, param_grid,
                       args.scoring, X_test, y_test)
    elif args.subparser_name == "features":
        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)
        else:
            # this is just for exposition, would really want to cv over this
            instances = prepare_earmark_data.load_instances(args.train)
            X, y, feature_space = pipe.instances_to_matrix(instances,
                                                           dense=dense)
            train, test = split_data_stratified(X, y, test_size=0.33)
            train_instances = [instances[i] for i in train]
            test_instances = [instances[i] for i in test]
        #do_feature_selection(train_instances, test_instances, args.folds,
        #                     clf, param_grid, dense, args.outfile)
        do_feature_set_analysis(train_instances, test_instances, args.folds,
                                clf, param_grid, dense, args.outfile)
def main():
    """Build and evaluate an earmark classifier (CLI entry point).

    Sub-commands: 'save' fits and persists a model, 'error' runs error
    analysis on the training data, 'grid' performs a hyper-parameter
    search (optionally evaluating on held-out --test data), 'features'
    runs feature-set analysis; 'relabel' is registered but handled
    elsewhere.
    """
    parser = argparse.ArgumentParser(description='build classifier')
    parser.add_argument('--train', required=True,
                        help='file to pickled training instances')
    parser.add_argument('--test', required=False,
                        help='file to pickled test instances')
    parser.add_argument('--folds', required=False, type=int, default=5,
                        help='number of folds for cv')
    parser.add_argument('--alg', required=True,
                        help="'rf' for RandomForest, 'svm' for LinearSVC")
    subparsers = parser.add_subparsers(dest="subparser_name",
                                      help='sub-command help')
    parser_grid = subparsers.add_parser('grid', help='tune hyper-parameters')
    parser_grid.add_argument('--scoring', required=True)
    # BUG FIX: 'save' and 'relabel' carried help strings copy-pasted from
    # other sub-commands; give each an accurate description.
    parser_save = subparsers.add_parser('save', help='fit and save a model')
    parser_save.add_argument('--scoring', required=True)
    parser_save.add_argument('--outfile', required=True)
    subparsers.add_parser('error', help='do error analysis')
    parser_features = subparsers.add_parser('features',
                                            help='do feature analysis')
    parser_features.add_argument('--outfile', required=True)
    subparsers.add_parser('relabel', help='relabel instances')
    args = parser.parse_args()

    # Single-argument parenthesized print is equivalent under Python 2.
    print("Doing %s" % args.subparser_name)
    print("Train: %s" % args.train)
    print("Test: %s" % args.test)

    if args.alg == 'svm':
        clf = svm.LinearSVC(C=0.01)
        #param_grid = {'C': [ 0.001, 0.01, 0.1, 0.5, 1, 4, 10, 100]}
        param_grid = {'C': [0.01, 0.1]}
        dense = False  # LinearSVC handles sparse input
    else:
        clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                     random_state=0, max_features='log2',
                                     n_jobs=-1)
        param_grid = {
            'n_estimators': [10, 30, 50, 100, 300, 500],
            'max_features': ['log2', 'sqrt']
        }
        dense = True  # RandomForest needs dense matrices

    if args.subparser_name == "save":
        groups = [
            'unigram_feature_generator',
            'simple_entity_text_feature_generator',
            'geo_feature_generator',
            'sponsor_feature_generator'
        ]
        instances = prepare_earmark_data.load_instances(args.train)
        print(instances[0].feature_groups.keys())
        print(instances[-1].feature_groups.keys())
        X, y, feature_space = pipe.instances_to_matrix(
            instances, dense=dense, groups=groups)
        save_model(X, y, feature_space, args.folds, clf, param_grid,
                   args.scoring, args.outfile)
    elif args.subparser_name == "error":
        #this does error analysis on training data only!
        instances = prepare_earmark_data.load_instances(args.train)
        X, y, feature_space = pipe.instances_to_matrix(instances, dense=dense)
        error_analysis_for_labeling(instances, X, y, args.folds, clf,
                                    param_grid, args.train)
    elif args.subparser_name == "grid":
        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)
            groups = set(
                train_instances[0].feature_groups.keys()).intersection(
                    test_instances[0].feature_groups.keys())
            X_train, y_train, train_feature_space = pipe.instances_to_matrix(
                train_instances, dense=dense, groups=groups)
            # BUG FIX: was pipe.instances_to_matrix(train_instances, ...) —
            # the "test" matrix was silently built from the training data.
            X_test, y_test, test_feature_space = pipe.instances_to_matrix(
                test_instances, dense=dense, groups=groups)
            keys_train = set(train_feature_space.keys())
            keys_test = set(test_feature_space.keys())
            # BUG FIX: was keys_train & keys_train; intersect with the test
            # keys so only features present in both splits survive.
            intersection = list(keys_train & keys_test)
            # Shared feature space over the common features keeps the train
            # and test matrix columns aligned.
            feature_space = {
                feat: idx for idx, feat in enumerate(intersection)
            }
            X_train, y_train = test_instances_to_matrix(
                feature_space, train_instances, dense=dense)
            X_test, y_test = test_instances_to_matrix(
                feature_space, test_instances, dense=dense)
        else:
            instances = prepare_earmark_data.load_instances(args.train)
            X_train, y_train, feature_space = pipe.instances_to_matrix(
                instances, dense=dense)
            X_test = None
            y_test = None
        do_grid_search(X_train, y_train, args.folds, clf, param_grid,
                       args.scoring, X_test, y_test)
    elif args.subparser_name == "features":
        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)
        else:
            # this is just for exposition, would really want to cv over this
            instances = prepare_earmark_data.load_instances(args.train)
            X, y, feature_space = pipe.instances_to_matrix(instances,
                                                           dense=dense)
            train, test = split_data_stratified(X, y, test_size=0.33)
            train_instances = [instances[i] for i in train]
            test_instances = [instances[i] for i in test]
        #do_feature_selection(train_instances, test_instances, args.folds,
        #                     clf, param_grid, dense, args.outfile)
        do_feature_set_analysis(train_instances, test_instances, args.folds,
                                clf, param_grid, dense, args.outfile)
def main():
    """Match entities to OMB earmarks over report/bill folders.

    Loads a pickled model (--model) or trains a LinearSVC from pickled
    instances (--data), then labels every folder via a process pool.
    """
    parser = argparse.ArgumentParser(description='Match entities to OMB')
    parser.add_argument('--model', required=False,
                        help='path to pickeld matching model')
    parser.add_argument('--data', required=False,
                        help='path to pickeld instances')
    args = parser.parse_args()

    # BUG FIX: a leading "/" on the second component makes os.path.join
    # throw away the base path, so these used to resolve to "/110/..." and
    # "/111/..." instead of paths under the configured bills directory.
    bills2008 = os.path.join(util.configuration.get_path_to_bills(),
                             "110/bills/hr/hr2764/text-versions/")
    bills2009 = os.path.join(util.configuration.get_path_to_bills(),
                             "111/bills/hr/hr1105/text-versions/")
    years = ["111", "110", "109", "108", "107", "106", "105", "104"]
    reports_base = util.configuration.get_path_to_reports()
    folders = [os.path.join(reports_base, year)
               for year in years] + [bills2008, bills2009]

    conn = psycopg2.connect(CONN_STRING)
    if args.model:
        # Use a context manager so the feature-space file handle is closed
        # instead of leaked.
        with open(args.model + ".feature_space", "rb") as fs_file:
            feature_space = pickle.load(fs_file)
        model = joblib.load(args.model)
        logging.info("Loaded Model")
    elif args.data:
        # Feature groups used for matching; all others are ignored.
        keep_group = [
            'unigram_feature_generator',
            'simple_entity_text_feature_generator',
            'geo_feature_generator',
            'sponsor_feature_generator'
        ]
        instances = prepare_earmark_data.load_instances(args.data)
        ignore_groups = [
            fg for fg in instances[0].feature_groups.keys()
            if fg not in keep_group
        ]
        X, y, feature_space = pipe.instances_to_matrix(
            instances, ignore_groups=ignore_groups, dense=False)
        clf = svm.LinearSVC(C=0.01)
        param_grid = {'C': [0.01, 0.1]}
        model = diagnostics.get_optimal_model(X, y, 5, clf, param_grid,
                                              'roc_auc')
    else:
        # No model and no training data supplied: nothing to do.
        exit()

    geo_coder = GeoCoder()
    sponsor_coder = SponsorCoder()
    earmark_detector = EarmarkDetector(geo_coder, sponsor_coder, conn,
                                       model, feature_space)
    # NOTE(review): the detector (holding the DB connection) is shipped to
    # worker processes — assumes it pickles cleanly; confirm with label_all.
    p = mp.Pool(mp.cpu_count())
    p.map(label_all, [(folder, earmark_detector) for folder in folders])
    #for folder in folders:
    #    label_all((folder, earmark_detector))
    conn.close()