Example #1

import logging
import cPickle  # Python 2 pickle; on Python 3, use `import pickle` instead

# Assumed to be project-local and importable from the surrounding package:
# preprocessing (with parse()), print_scores, and Feeder.
def re_train(model, model_loc, train, dev, corpus_loc, num, num_runs, classes):
    """
    runs the back feed mechanism.
    :param model: classificator
    :param model_loc: If the model was not loaded, a raw one is trained and
    saved to this location
    :param train: File with annotated training tweets
    :param dev: File with annotated development tweets
    :param corpus_loc: Directory of the Sentiment140 corpus
    :param num: Amount of added corpus data per iteration
    :param num_runs: Number of iterations
    :param classes: Variable holding bit flags corresponding to class labels --
    definition is in preprocessing variable 'ltd'
    """
    # load the training data; it is needed either way
    train_labels, train_tweets, train_pos = preprocessing.parse(train, classes)
    x_train = [e[0] + "\t" + e[1] for e in zip(train_tweets, train_pos)]
    y_train = train_labels
    # x_train, y_train = uniformify(x_train, y_train)

    # if model is raw, train it
    if not hasattr(model, "grid_scores_"):
        logging.info("No trained model given, building training features for " "binary class codes: %s" % bin(classes))
        model.fit(x_train, y_train)

        logging.info("writing new model to disk at %s.." % model_loc)
        with open(model_loc, "wb") as sink:  # binary mode is required for pickling
            cPickle.dump(model, sink)
        logging.info("done.")

    # load development data for evaluation
    dev_labels, dev_tweets, dev_pos = preprocessing.parse(dev, classes)
    x_dev = [e[0] + "\t" + e[1] for e in zip(dev_tweets, dev_pos)]
    y_dev = dev_labels
    # x_dev, y_dev = uniformify(x_dev, y_dev)

    # initial eval
    logging.info("Initial evaluation..")
    print_scores(model, x_dev, y_dev, classes)

    # print the label distribution to check how the pos/neg ratio influences
    # the scoring. It looks fairly balanced now that len(t_gold_1) equals
    # len(t_gold_2)
    # print('num t_gold_1 '+str(len([l for l in y_train if l == 1])))
    # print('num t_gold_2 '+str(len([l for l in y_train if l == 2])))
    # print('t_1 '+str(len([l for l in model.predict(x_train) if l == 1])))
    # print('t_2 '+str(len([l for l in model.predict(x_train) if l == 2])))
    # print('num d_gold_1 '+str(len([l for l in y_dev if l == 1])))
    # print('num d_gold_2 '+str(len([l for l in y_dev if l == 2])))
    # print('d_1 '+str(len([l for l in model.predict(x_dev) if l == 1])))
    # print('d_2 '+str(len([l for l in model.predict(x_dev) if l == 2])))

    # feedback loop
    logging.info("Initializing backfeed instance..")
    feed = Feeder(corpus_loc)
    logging.info("done. Now starting backfeed loop")
    for count in range(1, num_runs + 1):
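        # add_best_n is expected to pull the num most confidently classified
        # corpus tweets into the training data and refit the model (inferred
        # from the call site and the "Retrain run" log message below)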
        feed.add_best_n(model, num, x_train, y_train)
        logging.info("Retrain run %i" % count)
        print_scores(model, x_dev, y_dev, classes)
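A minimal sketch of how re_train might be driven. The pipeline, parameter grid, and file paths below are illustrative assumptions, not part of the original example; re_train only needs a scikit-learn-style estimator whose fitted form exposes grid_scores_ (the attribute of a fitted GridSearchCV in scikit-learn < 0.18; newer releases expose cv_results_ instead).

import logging
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

logging.basicConfig(level=logging.INFO)

# hypothetical feature/classifier pipeline and parameter grid
pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("svm", LinearSVC())])
model = GridSearchCV(pipeline, {"svm__C": [0.1, 1.0, 10.0]})

# hypothetical bit flags; the real values are defined in preprocessing's 'ltd'
POS_NEG = 0b011

re_train(model, "svm.model", "train.tsv", "dev.tsv", "sentiment140/",
         num=500, num_runs=10, classes=POS_NEG)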
Example #2

import logging
import pickle

# Assumed to be project-local: Feeder, get_score, and the POS/NEG/NEU label
# constants.
def run(model, x_train, y_train, x_test, y_test, mode, retrain=30, amount=300, token=''):
    # initial step
    model.fit(x_train, y_train)
    logging.info('initial evaluation')
    get_score(model, x_test, y_test)

    # external sources set-up
    with open('cl.model', 'rb') as f:
        cl = pickle.load(f)
    with open('af.model', 'rb') as f:
        af = pickle.load(f)
    with open('km.model', 'rb') as f:
        km = pickle.load(f)
    classes = model.best_estimator_.named_steps['svm'].classes_
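    # each external source maps its raw score onto POS/NEG/NEU through the
    # per-class ranges below and carries a weight relative to the classifier's
    # classes (behavior inferred from the add_filter_ranges/add_weight names)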
    cl.add_filter_ranges(**{str(POS): (1.5, float('inf')),
                            str(NEG): (float('-inf'), -1.5),
                            str(NEU): (-1.5, 1.5)})
    cl.add_weight(5, classes)
    af.add_filter_ranges(**{str(POS): (0.4, float('inf')),
                            str(NEG): (float('-inf'), -0.4),
                            str(NEU): (-0.4, 0.4)})
    af.add_weight(2.2, classes)
    km.add_filter_ranges(**{str(POS): (2.5, float('inf')),
                            str(NEG): (float('-inf'), -2.5),
                            str(NEU): (-2.5, 2.5)})
    km.add_weight(40, classes)

    # source inclusion
    feed = Feeder()
    if token == 'km':
        feed.add_mutator(km)
    if token == 'af':
        feed.add_mutator(af)
    if token == 'cl':
        feed.add_mutator(cl)

    # retrain loop, feedback, and evaluation
    for i in range(retrain):
        logging.debug('retrain iteration %i' % i)
        feed.add_best_n(model, amount, x_train, y_train, False, mode)
        get_score(model, x_test, y_test)
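A similar sketch for run. The estimator construction and data loading are assumptions (the original shows only the function body): the inner pipeline step must be named 'svm', because run reads model.best_estimator_.named_steps['svm'].classes_, and the files cl.model, af.model, and km.model must already exist on disk. load_split and the mode value are hypothetical placeholders.

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# the step name 'svm' is required by run()
pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("svm", LinearSVC())])
model = GridSearchCV(pipeline, {"svm__C": [0.1, 1.0, 10.0]})

# data loading is project-specific; load_split is a hypothetical helper
x_train, y_train = load_split("train.tsv")
x_test, y_test = load_split("dev.tsv")

# 'mode' is forwarded to Feeder.add_best_n; its valid values are
# project-specific, so the one used here is a placeholder
run(model, x_train, y_train, x_test, y_test, mode='score',
    retrain=30, amount=300, token='af')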