Пример #1
0
def make_guess_tweet(model, user_ids, tweets_data_dir, output_file):
    '''Predict a label for each user from their retrieved tweet file.

    This variant works on the raw per-user tweet files found in
    ``tweets_data_dir`` rather than on preprocessed text files.

    model           -- object exposing ``eval_all(context)``; the first item
                       of its result is unpacked as ``(label, confidence)``
    user_ids        -- iterable of user id strings; each id is also the name
                       of that user's file inside ``tweets_data_dir``
    tweets_data_dir -- existing directory holding one tweet file per user
    output_file     -- path to create; it must not exist yet

    One line ``user_id<TAB>label<TAB>confidence`` is written per user that
    has a tweet file and a non-None context; all other users are skipped.

    Raises AssertionError when ``tweets_data_dir`` is missing or
    ``output_file`` already exists.
    '''
    assert(os.path.exists(tweets_data_dir))
    assert(not os.path.exists(output_file))

    tick = Tick()

    # The context manager guarantees the output is flushed and closed even if
    # reading a tweet file or evaluating the model raises part-way through.
    with open(output_file, 'w') as fout:
        for user_id in user_ids:
            fname = os.path.join(tweets_data_dir, user_id)

            # No tweets were retrieved for this user.
            if not os.path.exists(fname):
                continue

            context = contextFromTweetFile(fname)
            if context is None:
                continue

            # Only the first (label, confidence) pair is reported.
            p_label, conf = model.eval_all(context)[0]

            fout.write(user_id + '\t' + p_label + '\t' + str(conf) + '\n')

            tick.tick()
Пример #2
0
def getModel(train_file, tweets_dir):
    '''Build and train a maxent model from labelled users and their tweets.

    train_file -- existing file with one ``user_id<TAB>label`` pair per line
    tweets_dir -- existing directory holding one tweet file per user id

    Every training line whose tweet file yields a non-None context is added
    as an event with weight 1.0; lines without a context are skipped. The
    events are then finalised with ``PRUNE_COUNT`` and the model is trained
    with the 'lbfgs' method using ``LBFGS_ITERATION``, ``PRIOR_WEIGHT`` and
    ``TOLERANCE``.

    Returns the trained ``cmaxent.MaxentModel``.
    Raises AssertionError when either path does not exist.
    '''
    assert(os.path.exists(train_file))
    assert(os.path.exists(tweets_dir))

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    # Every training event carries the same weight.
    weight = 1.0

    # Open the training file in a context manager so the handle is released
    # deterministically, including when a malformed line raises.
    with open(train_file) as train_fh:
        for line in train_fh:
            user_id, label = line.rstrip('\n').split('\t')
            tweet_file = os.path.join(tweets_dir, user_id)
            context = contextFromTweetFile(tweet_file)

            if context is None:
                continue

            m.add_event(context, label, weight)

            tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
Пример #3
0

# Load the pre-trained model. The features (contexts) stored in the model are
# parsed words, not integer-mapped values.
model = cmaxent.MaxentModel()
model.load(model_file)
# Single-argument parenthesised print gives the same output on Python 2 and 3.
print('model loaded from %s' % model_file)


# Collect (predicted, real) class-label pairs, one per test user that has a
# usable context.
tick = Tick()
prediction_real_pairs = []
# The context manager closes the test file even if a line fails to parse.
with open(test_file) as test_fh:
    for line in test_fh:
        user_id, r_label = line.rstrip('\n').split('\t')
        tweet_file = os.path.join(tweets_dir, user_id)
        context = contextFromTweetFile(tweet_file)

        # Users without a context are left out of the evaluation.
        if context is None:
            continue

        p_label = model.predict(context)
        prediction_real_pairs.append((int(p_label), int(r_label)))
        tick.tick()

print('prediction finished on the file %s' % test_file)


# Report the accuracy
def confMatrix(results):
    '''Allocate a 6x6 matrix of float zeros.

    NOTE(review): the visible body stops after the allocation -- ``results``
    is never read and nothing is returned. The snippet looks truncated;
    confirm against the complete source before relying on this function.
    '''
    cmat = np.zeros([6,6], dtype=float)
def readTextContext(user_id):
    '''Return the context built from one user's tweet file.

    The file is looked up as ``<DATA>/twitter/tweets_data_3200/<user_id>``
    and handed to ``contextFromTweetFile``, whose result is returned as is.
    '''
    user_tweet_path = os.path.join(DATA, 'twitter/tweets_data_3200', user_id)
    return contextFromTweetFile(user_tweet_path)