Пример #1
0
def main():
    """Configure the project directories and run Brown-clustering preprocessing.

    Reads two positional command-line arguments, ``project`` and
    ``splits_file``, passes them to ``dirs.set_project`` and then calls
    ``preprocess_for_brown_clustering``.

    Exits with a usage message (via ``parser.error``) when fewer than two
    positional arguments are supplied, instead of failing with an IndexError.
    """
    usage = "%prog project splits_file"
    parser = OptionParser(usage=usage)

    (_, args) = parser.parse_args()

    # Both positionals are required; extra trailing arguments are ignored,
    # as before.
    if len(args) < 2:
        parser.error("expected arguments: project splits_file")

    project, splits_file = args[0], args[1]

    dirs.set_project(project, splits_file)

    preprocess_for_brown_clustering()
Пример #2
0
def main():
    """Run a fixed set of feature-set experiments over ten test folds.

    Positional command-line arguments: ``project``, ``label_file`` and
    ``splits_file``. Options ``-t`` (target column, default 0) and ``-w``
    (weight column, default -1) are converted to ints and forwarded to
    ``experiment2.run_experiment``.

    For every (name, feature list) pair defined below, and for each test fold
    in ``range(10)``, ``experiment2.run_experiment`` is called once with an
    'LR' model type and the 'f1' metric.

    Exits via ``sys.exit`` with a message when fewer than three positional
    arguments are supplied.
    """
    usage = "%prog project label splits_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-t', dest='target_col', default=0,
                      help='Target column; default=%default')
    parser.add_option('-w', dest='weight_col', default=-1,
                      help='weight column; default=%default')

    (options, args) = parser.parse_args()
    # Three positionals are read below, so fewer than three (not two) must be
    # rejected here; otherwise args[2] raises IndexError.
    if len(args) < 3:
        sys.exit("Please provide input arguments")

    project, label_file, splits_file = args[0], args[1], args[2]

    dirs.set_project(project, splits_file)

    target_col = int(options.target_col)
    weight_col = int(options.weight_col)

    model_type = 'LR'
    reuse = False
    verbose = 1

    # Feature specifications, each built on top of the previous one.
    unigrams = ['ngrams,n=1,transform=binarize']
    unigrams_and_bigrams = unigrams + ['ngrams,n=2,transform=binarize']
    ub_personas_old = unigrams_and_bigrams + ['pkl,subdir=personas,source=personasdpm,transform=binarize']
    ub_personas_new = unigrams_and_bigrams + ['pkl,subdir=personas,source=personas,transform=binarize']
    ub_personas_and_stories = ub_personas_new + ['pkl,subdir=personas,source=storytypesold,transform=normalizel2']

    # Full feature set; not part of the active experiments below, kept so it
    # can be added to `experiments` as ('all_features', all_feature_list).
    all_feature_list = [
        'ngrams,n=1,transform=binarize',
        'ngrams,n=2,transform=binarize,min_df=2',
        'list,subdir=brown,source=brown',
        'pkl,subdir=lda,source=lda,transform=binarize',
        'pkl,subdir=personas,source=personas,transform=binarize',
        'list,subdir=stanford,source=pos,transform=binarize',
        'list,subdir=stanford,source=ner,transform=binarize',
        'list,subdir=stanford,source=dependency_links,transform=binarize,min_df=2,lower=1',
        'list,subdir=stanford,source=jkgrams,transform=binarize,min_df=2,lower=1',
        'list,subdir=stanford,source=sentiments,transform=binarize',
        'list,subdir=semafor,source=frames,transform=binarize,lower=1',
        'list,subdir=amalgram,source=ss_tags,transform=binarize,lower=1',
    ]

    # Name and feature list are kept together so they cannot drift out of
    # sync the way two parallel lists can.
    experiments = [
        ('unigrams_and_bigrams', unigrams_and_bigrams),
        ('personas_dpm', ub_personas_old),
        ('personas_new', ub_personas_new),
        ('personas_and_stories', ub_personas_and_stories),
    ]

    n_eval_iters = 20
    dev_prop = 0.1
    n_test_folds = 10

    for i, (name, features) in enumerate(experiments):
        for t in range(n_test_folds):
            # Single formatted string: same output as the old print statement
            # under Python 2, and also valid syntax under Python 3.
            print('experiment %d ; test_fold %d' % (i, t))
            experiment2.run_experiment(name=name,
                                       label_file=label_file,
                                       target=target_col,
                                       test_fold=t,
                                       feature_list=features,
                                       model_type=model_type,
                                       n_eval_iters=n_eval_iters,
                                       eval_prop=dev_prop,
                                       reuse=reuse,
                                       verbose=verbose,
                                       weight_col=weight_col,
                                       best_alphas=None,
                                       additional_label_files=None,
                                       additional_label_weights=None,
                                       metric='f1',
                                       only_unanimous=True)