# Imports reconstructed from usage below. The project-local names
# (util.configuration, pipe, prepare_earmark_data, diagnostics, GeoCoder,
# SponsorCoder, EarmarkDetector, label_all, CONN_STRING, save_model,
# error_analysis_for_labeling, do_grid_search, do_feature_set_analysis,
# split_data_stratified, test_instances_to_matrix) are assumed to be
# importable from the surrounding package.
import argparse
import logging
import multiprocessing as mp
import os
import pickle

import psycopg2
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib  # `import joblib` on newer scikit-learn


def main():
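    """Build an earmark classifier from pickled instances.

    Subcommands: 'grid' tunes hyper-parameters, 'save' trains and persists
    a model, 'error' runs error analysis on the training data, 'features'
    runs feature-set analysis; 'relabel' is registered but has no handler
    below.
    """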
    parser = argparse.ArgumentParser(description='build classifier')

    parser.add_argument('--train',
                        required=True,
                        help='file to pickled training instances')
    parser.add_argument('--test',
                        required=False,
                        help='file to pickled test instances')
    parser.add_argument('--folds',
                        required=False,
                        type=int,
                        default=5,
                        help='number of folds for cv')
    parser.add_argument('--alg',
                        required=True,
                        help="'rf' for RandomForest, 'svm' for LinearSVC")
    subparsers = parser.add_subparsers(dest="subparser_name",
                                       help='sub-command help')

    parser_grid = subparsers.add_parser('grid', help='tune hyper-parameters')
    parser_grid.add_argument('--scoring', required=True)

    parser_save = subparsers.add_parser('save', help='train and save a model')
    parser_save.add_argument('--scoring', required=True)
    parser_save.add_argument('--outfile', required=True)

    parser_error = subparsers.add_parser('error', help='do error analysis')
    parser_features = subparsers.add_parser('features',
                                            help='do feature analysis')
    parser_features.add_argument('--outfile', required=True)

    parser_relabel = subparsers.add_parser('relabel', help='relabel instances')

    args = parser.parse_args()

    print "Doing %s" % args.subparser_name
    print "Train: %s" % args.train
    print "Test: %s" % args.test

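    # LinearSVC accepts sparse feature matrices, while RandomForest needs a
    # dense array, hence the per-algorithm `dense` flag used when vectorizing.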
    if args.alg == 'svm':
        clf = svm.LinearSVC(C=0.01)
        #param_grid = {'C': [ 0.001, 0.01, 0.1, 0.5, 1, 4, 10, 100]}
        param_grid = {'C': [0.01, 0.1]}

        dense = False

    else:
        clf = RandomForestClassifier(n_estimators=10,
                                     max_depth=None,
                                     random_state=0,
                                     max_features='log2',
                                     n_jobs=-1)
        param_grid = {
            'n_estimators': [10, 30, 50, 100, 300, 500],
            'max_features': ['log2', 'sqrt']
        }
        dense = True

    if args.subparser_name == "save":

        groups = [
            'unigram_feature_generator',
            'simple_entity_text_feature_generator', 'geo_feature_generator',
            'sponsor_feature_generator'
        ]

        instances = prepare_earmark_data.load_instances(args.train)
        print(instances[0].feature_groups.keys())
        print(instances[-1].feature_groups.keys())

        X, y, feature_space = pipe.instances_to_matrix(instances,
                                                       dense=dense,
                                                       groups=groups)
        save_model(X, y, feature_space, args.folds, clf, param_grid,
                   args.scoring, args.outfile)

    elif args.subparser_name == "error":

        # note: this does error analysis on the training data only
        instances = prepare_earmark_data.load_instances(args.train)
        X, y, feature_space = pipe.instances_to_matrix(instances, dense=dense)
        error_analysis_for_labeling(instances, X, y, args.folds, clf,
                                    param_grid, args.train)

    elif args.subparser_name == "grid":

        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)

            # restrict to the feature groups present in both splits
            groups = set(
                train_instances[0].feature_groups.keys()).intersection(
                    test_instances[0].feature_groups.keys())

            X_train, y_train, train_feature_space = pipe.instances_to_matrix(
                train_instances, dense=dense, groups=groups)
            X_test, y_test, test_feature_space = pipe.instances_to_matrix(
                test_instances, dense=dense, groups=groups)

            # keep only the features present in both splits and re-project
            # both sides onto that shared feature space (sorted for a stable
            # column order)
            keys_train = set(train_feature_space.keys())
            keys_test = set(test_feature_space.keys())
            intersection = sorted(keys_train & keys_test)
            feature_space = {
                feature: i
                for i, feature in enumerate(intersection)
            }
            X_train, y_train = test_instances_to_matrix(feature_space,
                                                        train_instances,
                                                        dense=dense)
            X_test, y_test = test_instances_to_matrix(feature_space,
                                                      test_instances,
                                                      dense=dense)

        else:
            instances = prepare_earmark_data.load_instances(args.train)
            X_train, y_train, feature_space = pipe.instances_to_matrix(
                instances, dense=dense)
            X_test = None
            y_test = None

        do_grid_search(X_train, y_train, args.folds, clf, param_grid,
                       args.scoring, X_test, y_test)

    elif args.subparser_name == "features":

        if args.test:
            train_instances = prepare_earmark_data.load_instances(args.train)
            test_instances = prepare_earmark_data.load_instances(args.test)

        else:
            # just for exposition; ideally we would cross-validate this split
            instances = prepare_earmark_data.load_instances(args.train)
            X, y, feature_space = pipe.instances_to_matrix(instances,
                                                           dense=dense)
            train, test = split_data_stratified(X, y, test_size=0.33)
            train_instances = [instances[i] for i in train]
            test_instances = [instances[i] for i in test]

        #do_feature_selection(train_instances, test_instances, args.folds, clf, param_grid, dense, args.outfile)

        do_feature_set_analysis(train_instances, test_instances, args.folds,
                                clf, param_grid, dense, args.outfile)
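

# Hypothetical example invocations for the classifier script above (the file
# name and pickle paths are placeholders, not from the original project):
#
#   python build_classifier.py --train train.pkl --alg svm grid --scoring roc_auc
#   python build_classifier.py --train train.pkl --alg rf save --scoring roc_auc --outfile model.pkl


# A minimal sketch of the `test_instances_to_matrix` helper used in the grid
# branch above (its definition is not part of this file). It assumes each
# instance exposes a flat `features` {name: value} dict and a numeric `label`
# attribute; the project's real Instance API may differ.
def _test_instances_to_matrix_sketch(feature_space, instances, dense=False):
    import numpy as np
    from scipy.sparse import lil_matrix

    X = lil_matrix((len(instances), len(feature_space)))
    y = np.zeros(len(instances))
    for row, inst in enumerate(instances):
        y[row] = inst.label
        for name, value in inst.features.items():
            col = feature_space.get(name)
            if col is not None:  # drop features outside the shared space
                X[row, col] = value
    return (X.toarray() if dense else X.tocsr()), y


# The second `main()` below appears to come from the companion
# entity-matching script in the same project.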
def main():
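    """Label candidate earmarks in appropriations reports and bill texts.

    Loads a pickled model (--model) or trains a LinearSVC from pickled
    instances (--data), then runs the EarmarkDetector over each folder in
    parallel.
    """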

    parser = argparse.ArgumentParser(description='Match entities to OMB')
    parser.add_argument('--model',
                        required=False,
                        help='path to pickled matching model')
    parser.add_argument('--data',
                        required=False,
                        help='path to pickled instances')

    args = parser.parse_args()

    # note: path components after the base must be relative; os.path.join
    # discards everything before an absolute component
    bills2008 = os.path.join(util.configuration.get_path_to_bills(),
                             "110/bills/hr/hr2764/text-versions/")
    bills2009 = os.path.join(util.configuration.get_path_to_bills(),
                             "111/bills/hr/hr1105/text-versions/")

    years = ["111", "110", "109", "108", "107", "106", "105", "104"]

    reports_base = util.configuration.get_path_to_reports()
    folders = [os.path.join(reports_base, year)
               for year in years] + [bills2008, bills2009]

    conn = psycopg2.connect(CONN_STRING)

    if args.model:
        with open(args.model + ".feature_space", "rb") as f:
            feature_space = pickle.load(f)
        model = joblib.load(args.model)
        logging.info("Loaded model from %s", args.model)

    elif args.data:
        keep_group = [
            'unigram_feature_generator',
            'simple_entity_text_feature_generator', 'geo_feature_generator',
            'sponsor_feature_generator'
        ]
        instances = prepare_earmark_data.load_instances(args.data)
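        # instances_to_matrix takes the groups to *ignore*, so invert the
        # keep-list against the groups present on the first instance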
        ignore_groups = [
            fg for fg in instances[0].feature_groups.keys()
            if fg not in keep_group
        ]
        X, y, feature_space = pipe.instances_to_matrix(
            instances, ignore_groups=ignore_groups, dense=False)
        clf = svm.LinearSVC(C=0.01)
        param_grid = {'C': [0.01, 0.1]}
        model = diagnostics.get_optimal_model(X, y, 5, clf, param_grid,
                                              'roc_auc')
    else:
        parser.error("either --model or --data is required")

    geo_coder = GeoCoder()
    sponsor_coder = SponsorCoder()

    earmark_detector = EarmarkDetector(geo_coder, sponsor_coder, conn, model,
                                       feature_space)

    p = mp.Pool(mp.cpu_count())
    p.map(label_all, [(folder, earmark_detector) for folder in folders])
    p.close()
    p.join()

    # serial alternative for debugging:
    # for folder in folders:
    #     label_all((folder, earmark_detector))

    conn.close()
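

# Hypothetical example invocations for the matcher above (the file name and
# pickle paths are placeholders):
#
#   python match_entities.py --model earmark_model.pkl
#   python match_entities.py --data earmark_instances.pkl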