# Example #1
# 0
def make_guess_tweet(model, user_ids, tweets_data_dir, output_file):
    '''Predict a label for each user directly from their raw tweet files.

    Unlike make_guess, which reads preprocessed text files, this builds the
    feature context from each user's retrieved tweets via
    contextFromTweetFile.

    Writes one tab-separated line per scored user to output_file:
        user_id <TAB> predicted_label <TAB> confidence
    Users with no tweet file, or whose context comes back as None, are
    silently skipped.
    '''
    assert os.path.exists(tweets_data_dir)
    assert not os.path.exists(output_file)

    tick = Tick()

    # 'with' guarantees the output file is flushed and closed even if
    # context extraction or model evaluation raises mid-loop.
    with open(output_file, 'w') as fout:
        for user_id in user_ids:
            fname = os.path.join(tweets_data_dir, user_id)

            if not os.path.exists(fname):
                continue

            context = contextFromTweetFile(fname)
            if context is None:
                continue

            # eval_all yields (label, confidence) pairs ranked by
            # confidence; [0] is the model's best guess.
            p_label, conf = model.eval_all(context)[0]

            fout.write(user_id + '\t' + p_label + '\t' + str(conf) + '\n')

            tick.tick()
def trainModelFromFile(filename):
    '''Train a maxent model from a preprocessed training file.

    Each input line has the form:
        <label> <feature>:<value> <feature>:<value> ...
    Every event is added with uniform weight 1.0.

    Returns the trained cmaxent.MaxentModel.
    '''
    assert os.path.exists(filename)

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    # 'with' closes the training file promptly instead of relying on GC.
    with open(filename) as fin:
        for line in fin:
            tokens = line.rstrip("\n").split(" ")
            label = tokens[0]
            # Parse each "feature:value" token into a (str, float) pair.
            context = [(str(f), float(v))
                       for f, v in (pair.split(":") for pair in tokens[1:])]
            weight = 1.0
            m.add_event(context, label, weight)

            tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE)

    return m
# Example #3
# 0
def trainModelFromFile(filename, model_file = None):
    assert(os.path.exists(filename))

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    for line in open(filename):
        tokens = line.rstrip('\n').split(' ')
        label = tokens[0]
        context = []
        for pair in tokens[1:]:
            f, v = pair.split(':')
            context.append((str(f), float(v)))
        weight = 1.0
        m.add_event(context, label, weight)

        tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    if model_file:
        m.save(model_file)
        print "Model saved to %s" % model_file

    return m
def getModel(train_file, tweets_dir):
    '''Train a maxent model from annotated users and their tweet files.

    train_file has one "user_id<TAB>label" line per user; each user's
    feature context is built from their tweet file in tweets_dir via
    contextFromTweetFile. Users whose context comes back as None are
    skipped. Every event is added with uniform weight 1.0.

    Returns the trained cmaxent.MaxentModel.
    '''
    assert os.path.exists(train_file)
    assert os.path.exists(tweets_dir)

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    # 'with' closes the annotation file promptly instead of relying on GC.
    with open(train_file) as fin:
        for line in fin:
            user_id, label = line.rstrip('\n').split('\t')
            tweet_file = os.path.join(tweets_dir, user_id)
            context = contextFromTweetFile(tweet_file)

            if context is None:
                continue

            weight = 1.0
            m.add_event(context, label, weight)

            tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
def get_model(users_label, model_file=None):
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    for (user_id, label) in users_label:
        context = readFollowingContext(user_id)
        weight = 1.0
        m.add_event(context, label, weight)

        tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE)

    if model_file:
        m.save(model_file)
        print "Model saved to %s" % model_file

    return m
# Example #6
# 0
def make_guess(model, user_ids, text_data_dir, output_file):
    '''Predict a label for each user from their preprocessed text file.

    Writes one tab-separated line per scored user to output_file:
        user_id <TAB> predicted_label <TAB> confidence
    Users with no text file in text_data_dir are silently skipped.
    '''
    assert os.path.exists(text_data_dir)
    assert not os.path.exists(output_file)

    tick = Tick()

    # 'with' guarantees the output file is flushed and closed even if
    # scoring raises mid-loop.
    with open(output_file, 'w') as fout:
        for user_id in user_ids:
            fname = os.path.join(text_data_dir, user_id)

            if not os.path.exists(fname):
                continue

            # Close each per-user text file immediately instead of
            # leaking one handle per iteration.
            with open(fname) as fin:
                text = fin.read()
            context = contextFromText(text)

            # eval_all yields (label, confidence) pairs ranked by
            # confidence; [0] is the model's best guess.
            p_label, conf = model.eval_all(context)[0]

            fout.write(user_id + '\t' + p_label + '\t' + str(conf) + '\n')

            tick.tick()
tweets_dir = os.path.join(DATA, 'twitter/tweets_data_3200')
#model_file = os.path.join(DATA, 'twitter/models/Dec17_text_heuristic.model')
model_file = os.path.join(DATA, 'twitter/models/Dec17_text_heuristic_1e4.model')
test_file = os.path.join(DATA, 'twitter/annotated/Dec16_g6Label_split2.csv')


# load the pre-trained model, the features (or context) in the model are parsed
# words, not integer mapped values
model = cmaxent.MaxentModel()
model.load(model_file)
print 'model loaded from %s' % model_file


# Collect the prediction and real class label
tick = Tick()
prediction_real_pairs = []
for line in open(test_file):
    user_id, r_label = line.rstrip('\n').split('\t')
    tweet_file = os.path.join(tweets_dir, user_id)
    context = contextFromTweetFile(tweet_file)

    if context is None:
        continue

    p_label = model.predict(context)
    prediction_real_pairs.append((int(p_label), int(r_label)))
    tick.tick()

print 'prediction finished on the file %s' % test_file