Example #1
from gensim.models.word2vec import Word2Vec
from spacy.en import English
from regression import BaseBowRegressor
from language import tokenize_document

# better tokenizer
nlp = English()

NUM_PARTITIONS = 70
WINDOW_SIZE = 4
VECTOR_SIZE = 100
MODEL_FILE = "w2v_%d_parts_%d_vector_%d_window" % (NUM_PARTITIONS, VECTOR_SIZE,
                                                   WINDOW_SIZE)

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data(
    range(1, NUM_PARTITIONS))

# tokenize_document expects an (index, text) pair, hence the enumerate
sentences = [tokenize_document(docpair) for docpair in enumerate(reviews_texts)]

# build the word2vec model and save it
w2v = Word2Vec(sentences=sentences,
               size=VECTOR_SIZE,
               alpha=0.025,
               window=WINDOW_SIZE,
               min_count=2,
               sample=1e-5,
               workers=4,
               negative=10)
# L2-normalize the vectors in place; saves memory, but the model
# cannot be trained any further afterwards
w2v.init_sims(replace=True)
w2v.save(MODEL_FILE)
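
Once saved, the model can be reloaded and queried; a minimal sketch (the query token is an arbitrary illustration, assuming it occurs in the review vocabulary):

from gensim.models.word2vec import Word2Vec

w2v = Word2Vec.load(MODEL_FILE)
# nearest neighbours in the embedding space
print w2v.most_similar("pizza", topn=5)
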
Example #2
import numpy as np
from spacy.en import English
from regression import BaseBowRegressor
from functools import partial
from nltk import word_tokenize


# better tokenizer
nlp = English()

NUM_PARTITIONS = 30

FILTER_ENGLISH = False  # set to True for real runs; per-token language detection is very slow

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data(range(1, NUM_PARTITIONS))

def tokenize_document(docpair, use_nltk=True):
    """Tokenize the text of an (index, text) pair, lower-cased and ASCII-folded.

    detect_language is assumed to be defined elsewhere in the project.
    """
    print 'working on doc {}'.format(docpair[0])
    if not use_nltk:
        # spaCy tokenization
        if FILTER_ENGLISH:
            return [x.lower_.encode('ascii', errors='ignore') for x in nlp(docpair[1]) if detect_language(x) == 'english']
        return [x.lower_.encode('ascii', errors='ignore') for x in nlp(docpair[1])]
    else:
        # NLTK tokenization
        if FILTER_ENGLISH:
            return [x.encode('ascii', errors='ignore').lower() for x in word_tokenize(docpair[1]) if detect_language(x) == 'english']
        return [x.encode('ascii', errors='ignore').lower() for x in word_tokenize(docpair[1])]


def parallel_run(f, parms):
    '''
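
The example is cut off mid-docstring. A minimal sketch of what parallel_run presumably does, assuming a multiprocessing.Pool (the pool size is an arbitrary choice) and that f is a picklable top-level function:

from multiprocessing import Pool

def parallel_run(f, parms):
    '''Apply f to every element of parms in parallel; returns the list of results.'''
    pool = Pool(processes=4)
    try:
        return pool.map(f, parms)
    finally:
        pool.close()
        pool.join()

# together with functools.partial (imported above), e.g.:
# sentences = parallel_run(partial(tokenize_document, use_nltk=True),
#                          list(enumerate(reviews_texts)))
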
Example #3
# the snippet starts mid-function; the signature and prelude below are
# reconstructed (the name pad_to_fixed_length is hypothetical), and numpy
# as np plus the WORDVECTOR_LENGTH constant are assumed to be in scope
def pad_to_fixed_length(sequences, maxlen=None):
    lengths = [len(s) for s in sequences]
    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    x = np.zeros((nb_samples, maxlen, WORDVECTOR_LENGTH)).astype('float32')
    for idx, s in enumerate(sequences):
        x[idx, :lengths[idx]] = s[:maxlen]
    return x
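
A toy call of the padding helper (names as reconstructed above):

docs = [np.random.rand(5, WORDVECTOR_LENGTH).astype('float32'),
        np.random.rand(3, WORDVECTOR_LENGTH).astype('float32')]
X = pad_to_fixed_length(docs)  # -> float32 array of shape (2, 5, WORDVECTOR_LENGTH)
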


WORD2VEC_MODEL = "w2v_70_parts_30_vector_4_window"
PARTITIONS_TRAINING = range(1, 15)
PARTITIONS_TESTING = range(20, 22)

model = Word2Vec.load(WORD2VEC_MODEL)

reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(PARTITIONS_TRAINING)

count = sum(1 for votes in funny_votes_train if votes > 0)

print "Total non-zero votes: %d of %d" % (count, len(funny_votes_train))


print "Tokenizing"
NUM_ELEMENTS_TRAIN = None
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [language.tokenize_document(txt) for txt in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])]

X_train = tokens_to_word_vectors(reviews_tokens_train, model)
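
tokens_to_word_vectors is not shown in this example; a minimal sketch of what it presumably does, assuming out-of-vocabulary tokens are simply dropped:

def tokens_to_word_vectors(docs_tokens, model):
    # one (num_tokens, vector_size) array per document
    return [np.array([model[tok] for tok in tokens if tok in model])
            for tokens in docs_tokens]
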
Example #4
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from regression import BaseBowRegressor

import nltk

reviews_texts, _, _, _, _ = BaseBowRegressor.get_reviews_data(range(1, 70))
sentences = []
print "Tokenizing sentences..."
for i, review in enumerate(reviews_texts):
    tokens = nltk.word_tokenize(review)
    tokens = [token.lower() for token in tokens]
    sentences.append(LabeledSentence(words=tokens,
                                     labels=["REVIEW_" + str(i)]))

print "Doc2Vec"
model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=4)
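
The example continues with a second script; in between, the trained model was presumably saved under the name the second script loads below:

model.save("docvecs_70")
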
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from regression import BaseBowRegressor
from evaluation import rmslog_error
from sklearn.ensemble import GradientBoostingRegressor

import nltk
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import LinearSVC, SVR

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data(range(1, 30))
model = Doc2Vec.load("docvecs_70")

N = 50000
M = 100  # must match the Doc2Vec vector size used above
X = np.zeros((N, M))
y = funny_votes[:N]
for i in range(N):
    label = 'REVIEW_' + str(i)
    if label not in model:
        print str(i) + " not in model?"
        # X[i] is already zero-initialized, so leave it as-is
    else:
        X[i, :] = model[label]

N_test = 10000
X_test = np.zeros((N_test, M))
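
The snippet ends here. Presumably X_test is filled the same way as X and one of the regressors imported above is fit on the document vectors; a hedged sketch (the argument order of rmslog_error is assumed):

for i in range(N_test):
    label = 'REVIEW_' + str(N + i)
    if label in model:
        X_test[i, :] = model[label]
y_test = funny_votes[N:N + N_test]

gbr = GradientBoostingRegressor()
gbr.fit(X, y)
print "RMSLE: %f" % rmslog_error(gbr.predict(X_test), y_test)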