Example #1
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-o', '--output', type=str,
                        required=True, dest='outfolder',
                        help='path to folder where model should be written')
    parser.add_argument('-m', '--model', type=str,
                        required=True, dest='model',
                        help='path to learned model to use for predictions')

    args = parser.parse_args()
    model = args.model
    infolder = args.infolder
    outfolder = args.outfolder

    dataset = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(dataset.entries)))
    config = dataset.config
    tasks = config.tasks
    all_models = joblib.load(model)
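    # all_models maps task name -> fitted pipeline, as dumped by the training script in Example #5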
    if not all(task in tasks for task in all_models.keys()):
        print("The models you are using aren't all specified in the config file.")
        print('Did you change the config file after training?')
        print('Exiting... try training again.')
        exit(1)
    print('\n--------------- Thy time of Judgement ---------------')
    for task in tasks:
        test_data(dataset, all_models[task], task)
    # write output to file
    dataset.write_data(outfolder)
Example #2
File: cross.py Project: Sandy4321/pangram
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-n',
                        '--numfolds',
                        type=int,
                        dest='num_folds',
                        default=4,
                        help='Number of folds to use in cross validation')

    args = parser.parse_args()
    infolder = args.infolder
    num_folds = args.num_folds

    print('Loading dataset...')
    dataset = ProfilingDataset(infolder)
    print('Loaded %s users...\n' % len(dataset.entries))
    config = dataset.config
    tasks = config.tasks
    print('\n--------------- Thy time of Running ---------------')
    for task in tasks:
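        # build the model for this task from its recipe file, then cross-validate it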
        tictac = from_recipe(config.recipes[task])
        cross_val(dataset, task, tictac, num_folds)
    # print results at end
    print('\n--------------- Thy time of Judgement ---------------')
    for message in log:
        print(message)
Example #3
def main_():
    infolder = "../DATA/pan16-author-profiling-training-dataset-2016-04-25/pan16-author-profiling-training-dataset-english-2016-02-29/"
    outfolder = "models/"
    print('Loading dataset and grouping user texts...\n')
    dataset = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(dataset.entries)))
    # get config
    config = dataset.config
    tasks = config.tasks
    print('\n--------------- Thy time of Running ---------------')
    for task in tasks:
        print('Learning to judge %s..' % task)
        # load data (overwritten below: this script evaluates the age task only)
        X, y = dataset.get_data(task)
    X, y = dataset.get_data('age')
    #X, y = dataset.get_data('gender')
    print(len(X))
    #print(X[0])
    X = preprocess.preprocess(X)
    
    # despite the '3grams' name, this vectorizer extracts word bigrams (ngram_range=[2, 2])
    grams3 = TfidfVectorizer(analyzer='word', ngram_range=[2, 2], max_features=5000, stop_words='english')
    svm = SVC(kernel='rbf', C=10, gamma=1, class_weight='balanced', probability=False)
    pipe = Pipeline([('3grams',grams3), ('svm', svm)])

    # SOAC_Model2 and LSI_Model below are project-internal feature extractors
    # (presumably second-order attributes and latent semantic indexing)
    soac = features.SOAC_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
    svm = SVC(kernel='rbf', C=1, gamma=1, class_weight='balanced', probability=False)
    #combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
    #                         ('count_urls', countUrls), ('count_replies', countReplies), 
    #                          ('soa', soa), ('soac', soac)])
    #combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
    #                         ('count_urls', countUrls), ('count_replies', countReplies)])
    pipe1 = Pipeline([('soac',soac), ('svm', svm)])

    LSImodel = LSI_Model(num_topics=100)
    svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=False)
    #pipe2 = Pipeline([('counts',combined), ('svm', svm)])
    pipe2 = Pipeline([('LSI',LSImodel), ('svm', svm)])

    # Base Models
    base_models = [pipe, pipe1, pipe2]
    base_model_names = ['3grams', 'soac', 'lsi']

    # Meta Voting Models
    eclf = VotingClassifier(estimators=[("0", pipe), ('1', pipe1), ('2', pipe2)], voting='soft')
    eclfh = VotingClassifier(estimators=[("0", pipe), ('1', pipe1), ('2', pipe2)], voting='hard')
    voting_dic = {'votingf':eclf, 'votingh':eclfh}
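    # NOTE: voting='soft' averages predict_proba, which would require probability=True
    # on the SVCs above; both voting models are left out of meta_models_names below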
    combinator_names = ['majority', 'weights', 'accuracy', 'optimal']
    #meta_models_names = ['votingf', 'votingh', 'space3', 'meta'] + combinator_names
    meta_models_names = ['space3'] + combinator_names
    #meta_models_names = []
    ## all_models ##
    all_models_names = base_model_names + meta_models_names


    #eclf = VotingClassifier(estimators=[("0", pipe), ('1', pipe1)], voting='soft')
    #eclfh = VotingClassifier(estimators=[("0", pipe), ('1', pipe1)], voting='hard')
    #models = [pipe,pipe1,eclf, eclfh]
    #model_names = ['3grams', 'soac', 'voting', 'votingh']

    results = {'over':[]}
    for name in all_models_names:
        results[name] = {'pred': [], 'conf': [], 'rep': [], 'acc': []}

    num_folds = 4
    train_split = 0.3
    meta_split = 0.5
    cv_rounds = 1
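    # per round: a 70/30 train/held-out split, then the held-out 30% is halved
    # into a meta set (for fitting the combiners) and the final evaluation set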
    t0 = time.time()
    t1 = t0
    for j in range(cv_rounds):
        X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=train_split, stratify=y)
        # drop users with empty texts; index filtering keeps each text paired with its label
        keep = [i for i, x in enumerate(X_train) if len(x) > 0]
        X_train = [X_train[i] for i in keep]
        y_train = [y_train[i] for i in keep]
        keep = [i for i, x in enumerate(X_cv) if len(x) > 0]
        X_cv = [X_cv[i] for i in keep]
        y_cv = [y_cv[i] for i in keep]
        if meta_split > 0:
            X_meta, X_cv, y_meta, y_cv = train_test_split(X_cv, y_cv, test_size=meta_split, stratify=y_cv)
            print(len(X_train), len(X_cv), len(X_meta), len(X_cv) + len(X_train) + len(X_meta), len(X))
        else:
            print(len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X))
        trained_base_models = []
        predictions = []
        base_predictions_cv = []
        base_predictions_meta = []
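        # base-model predictions on the meta split become training input for the combiners below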
        for i, model in enumerate(base_models):
            model.fit(X_train,y_train)
            trained_base_models.append(model)
            predict = model.predict(X_cv)
            predictions.append(predict)
            base_predictions_cv.append(predict)
            base_predictions_meta.append(model.predict(X_meta))
            results[base_model_names[i]]['pred'].append(predict)
            results[base_model_names[i]]['acc'].append(accuracy_score(y_cv, predict))
            results[base_model_names[i]]['conf'].append(confusion_matrix(y_cv, predict, labels=list(set(y))))
            results[base_model_names[i]]['rep'].append(classification_report(y_cv, predict, labels=list(set(y))))
        trained_all_models = copy.deepcopy(trained_base_models)
        for name in meta_models_names:
            if name == 'votingf' or name == 'votingh':
                model = voting_dic[name]
                model.fit(X_train, y_train)
                predict = model.predict(X_cv)
            if name == 'space':
                models_for_space = {}
                cv_scores = []
                for i, base_trained_model in enumerate(trained_base_models):
                    models_for_space[base_model_names[i]] = base_trained_model
                    cv_scores.append(base_trained_model.score(X_meta, y_meta))
                model = combinations.SubSpaceEnsemble4_2(models_for_space, cv_scores, k=6, weights=[0.65,0.35,0.32,6], N_rand=10, rand_split=0.6)
                model.fit(X_meta, y_meta)
                predict = model.predict(X_cv)
            if name == 'space3':
                models_for_space = {}
                for i, base_trained_model in enumerate(trained_base_models):
                    models_for_space[base_model_names[i]] = base_trained_model
                model = SubSpaceEnsemble3(models_for_space, k=5, weights= [2,1,3,0.6])
                model.fit(X_train, y_train)
                predict = model.predict(X_cv)
            if name == 'meta':
                model_dic = {}
                for i, base_trained_model in enumerate(trained_base_models):
                    model_dic[base_model_names[i]] = base_trained_model
                model = Metaclassifier(models=model_dic, C=1.0, weights='balanced')
                model.fit(X_meta, y_meta)
                predict = model.predict(X_cv)
            if name in combinator_names:
                # uniform weights over the base models' meta-split predictions
                model = combinations.Combinator(scheme=name, weights=[1 / float(len(base_predictions_meta)) for i in range(len(base_predictions_meta))])
                model.fit(base_predictions_meta, y_meta)
                predict = model.predict(base_predictions_cv)
            trained_all_models.append(model)
            predictions.append(predict)
            results[name]['pred'].append(predict)
            results[name]['acc'].append(accuracy_score(y_cv, predict))
            results[name]['conf'].append(confusion_matrix(y_cv, predict, labels=list(set(y))))
            results[name]['rep'].append(classification_report(y_cv, predict, labels=list(set(y))))
        print('Round %d took: %0.3f seconds' % (j, time.time() - t1))
        t1 = time.time()
    print('Total time: %0.3f seconds' % (time.time() - t0))

    for name in all_models_names:
        print('%%%%%%%%%%%%%%%%  ' + name + '  %%%%%%%%%%%%%%%%%%%%%%%')
        print('#################################')
        mean_acc = 0
        mean_prec = 0
        mean_rec = 0
        mean_f1 = 0
        conf = numpy.zeros([5, 5])  # the age task has five classes
        for i in range(cv_rounds):
            mean_acc += results[name]['acc'][i]
            # scrape precision/recall/F1 from the report's avg/total row
            # (fragile: depends on sklearn's text-report column spacing)
            mean_prec += float(results[name]['rep'][i].split('     ')[-4][2:])
            mean_rec += float(results[name]['rep'][i].split('     ')[-3][2:])
            mean_f1 += float(results[name]['rep'][i].split('     ')[-2][2:])
            conf += results[name]['conf'][i]
        mean_acc = mean_acc/float(cv_rounds)
        mean_prec = mean_prec/float(cv_rounds)
        mean_rec = mean_rec/float(cv_rounds)
        mean_f1 = mean_f1/float(cv_rounds)
        conf = conf/float(cv_rounds)
        print('Accuracy : {}'.format(mean_acc))
        print('Precision : {}'.format(mean_prec))
        print('Recall : {}'.format(mean_rec))
        print('F1 : {}'.format(mean_f1))
        print('Confusion matrix :\n {}'.format(conf))
        print('#################################')
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
Example #4
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-f', '--feature', type=str,
                        dest='feature', default='gender',
                        help='feature to plot learning curves for')
    parser.add_argument('-r', '--recipe', type=str,
                        dest='recipe',
                        help='path to the recipe to use, if not specified '
                             'default recipe is used')

    args = parser.parse_args()
    infolder = args.infolder
    task = args.feature
    recipe = args.recipe

    print('Loading dataset...')
    data = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(data.entries)))
    config = data.config
    tasks = config.tasks
    if task in tasks:
        print('Creating learning curves for %s task..' % task)
        if not recipe:
            recipe = config.recipes[task]
        print('Loading recipe from file %s..' % recipe)
        clf = from_recipe(recipe)
        X, y = data.get_data(task)
        # 5-fold cross validation for the mean train and test score curves
        cv = cross_validation.KFold(len(X), n_folds=5, random_state=0)
Example #5

if __name__ == '__main__':
    parser = ArgumentParser(description='Train pan model on pan dataset')
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-o', '--output', type=str,
                        required=True, dest='outfolder',
                        help='path to folder where model should be written')

    args = parser.parse_args()
    infolder = args.infolder
    outfolder = args.outfolder

    dataset = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(dataset.entries)))
    # get config
    config = dataset.config
    tasks = config.tasks
    print('\n--------------- Thy time of Running ---------------')
    all_models = {}
    for task in tasks:
        print('Learning to judge %s..' % task)
        # load data
        X, y = dataset.get_data(task)
        tictac = from_recipe(config.recipes[task])
        all_models[task] = tictac.fit(X, y)
    modelfile = os.path.join(outfolder, '%s.bin' % dataset.lang)
    print('Writing model to {}'.format(modelfile))
    joblib.dump(all_models, modelfile, compress=3)
Example #6
                        default='gender',
                        help='feature to plot learning curves for')
    parser.add_argument('-r',
                        '--recipe',
                        type=str,
                        dest='recipe',
                        help='path to the recipe to use, if not specified '
                        'default recipe is used')

    args = parser.parse_args()
    infolder = args.infolder
    task = args.feature
    recipe = args.recipe

    print('Loading dataset...')
    data = ProfilingDataset(infolder)
    print('Loaded {} users...\n'.format(len(data.entries)))
    config = data.config
    tasks = config.tasks
    if task in tasks:
        print('Creating learning curves for %s task..' % task)
        if not recipe:
            recipe = config.recipes[task]
        print('Loading recipe from file %s..' % recipe)
        clf = from_recipe(recipe)
        X, y = data.get_data(task)
        # 5-fold cross validation for the mean train and test score curves
        cv = cross_validation.KFold(len(X), n_folds=5, random_state=0)
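
Both learning-curve snippets break off right after the folds are built. Below is a rough, self-contained sketch of where they are headed; it is an assumption, not the project's code: it uses scikit-learn's modern model_selection API instead of the deprecated cross_validation module, and a toy corpus and pipeline in place of the recipe objects.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# toy corpus: two easily separable classes stand in for the PAN data
X = ['red apple banana fruit'] * 30 + ['blue car engine wheel'] * 30
y = [0] * 30 + [1] * 30
clf = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'))

cv = KFold(n_splits=5, shuffle=True, random_state=0)
train_sizes, train_scores, test_scores = learning_curve(
    clf, X, y, cv=cv, train_sizes=np.linspace(0.2, 1.0, 5),
    shuffle=True, random_state=0)

# average the per-fold scores before plotting the two curves
plt.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='train')
plt.plot(train_sizes, test_scores.mean(axis=1), 'o-', label='validation')
plt.xlabel('training examples')
plt.ylabel('accuracy')
plt.legend()
plt.show()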