# Exemplo n.º 1 (Example no. 1)
import random
import sys

import numpy as np
from sklearn.ensemble import RandomForestClassifier

import utils

# Performs classification using RandomForest classifier.

if __name__ == '__main__':
    service_port = int(sys.argv[1])
    np.random.seed(1337)
    utils.init_ngrams()
    tweets = utils.process_tweets(utils.TRAIN_PROCESSED_FILE,
                                  service_port,
                                  test_file=False)
    if utils.TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in utils.extract_features(
            train_tweets,
            test_file=False,
# Exemplo n.º 2 (Example no. 2)
            feature_vector.append(vocab.get(words[-1]))
    return feature_vector


if __name__ == '__main__':
    train = len(sys.argv) == 2
    service_port = int(sys.argv[1])
    np.random.seed(1337)
    vocab_size = 90000
    batch_size = 8 * 500
    max_length = 40
    filters = 600
    kernel_size = 3
    vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)
    glove_vectors = get_glove_vectors(vocab)
    processed_tweets = utils.process_tweets(TRAIN_PROCESSED_FILE, service_port, test_file=False, get_feature_vector=get_feature_vector)
    tweets = [processed_tweet[-1] for processed_tweet in processed_tweets]
    labels = np.array([processed_tweet[1] for processed_tweet in processed_tweets])
    # Create an embedding matrix
    embedding_matrix = np.random.randn(vocab_size + 1, dim) * 0.01
    # Seed it with GloVe vectors
    for word, i in vocab.items():
        glove_vector = glove_vectors.get(word)
        if glove_vector is not None:
            embedding_matrix[i] = glove_vector
    tweets = pad_sequences(tweets, maxlen=max_length, padding='post')
    shuffled_indices = np.random.permutation(tweets.shape[0])
    tweets = tweets[shuffled_indices]
    labels = labels[shuffled_indices]
    if train:
        model = Sequential()