""" from gensim.models.word2vec import Word2Vec from spacy.en import English from regression import BaseBowRegressor from language import tokenize_document # better tokenizer nlp = English() NUM_PARTITIONS = 70 WINDOW_SIZE = 4 VECTOR_SIZE = 100 MODEL_FILE = "w2v_%d_parts_%d_vector_%d_window" % (NUM_PARTITIONS, VECTOR_SIZE, WINDOW_SIZE) reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = BaseBowRegressor.get_reviews_data( range(1, NUM_PARTITIONS)) sentences = [tokenize_document(txt) for txt in enumerate(reviews_texts)] # build the word2vec model and save it w2v = Word2Vec(sentences=sentences, size=VECTOR_SIZE, alpha=0.025, window=WINDOW_SIZE, min_count=2, sample=1e-5, workers=4, negative=10) w2v.init_sims(replace=True) w2v.save(MODEL_FILE)
import numpy as np
from spacy.en import English
from regression import BaseBowRegressor
from functools import partial
from nltk import word_tokenize

# better tokenizer
nlp = English()

NUM_PARTITIONS = 30
FILTER_ENGLISH = False  # set to True for real runs; language filtering is extremely slow

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = \
    BaseBowRegressor.get_reviews_data(range(1, NUM_PARTITIONS))

def tokenize_document(docpair, use_nltk=True):
    """Tokenize one (index, text) pair into lowercased, ASCII-only tokens."""
    print 'working on doc {}'.format(docpair[0])
    # detect_language is assumed to be defined elsewhere in the project
    if not use_nltk:
        # spaCy tokenization
        if FILTER_ENGLISH:
            return [x.lower_.encode('ascii', errors='ignore') for x in nlp(docpair[1])
                    if detect_language(x) == 'english']
        return [x.lower_.encode('ascii', errors='ignore') for x in nlp(docpair[1])]
    else:
        # NLTK tokenization
        if FILTER_ENGLISH:
            return [x.encode('ascii', errors='ignore').lower() for x in word_tokenize(docpair[1])
                    if detect_language(x) == 'english']
        return [x.encode('ascii', errors='ignore').lower() for x in word_tokenize(docpair[1])]

def parallel_run(f, parms):
    '''
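    Run f once per element of parms and return the list of results.
    '''
    # The original body is truncated at the docstring above. A minimal sketch,
    # assuming (given the functools.partial import) that a multiprocessing map
    # was intended; this is not the author's original implementation.
    from multiprocessing import Pool
    pool = Pool(processes=4)
    try:
        return pool.map(f, parms)
    finally:
        pool.close()
        pool.join()

# Example use with the module-level tokenizer (picklable, so Pool.map works):
# tokens = parallel_run(tokenize_document, list(enumerate(reviews_texts)))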
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from regression import BaseBowRegressor
import nltk

reviews_texts, _, _, _, _ = BaseBowRegressor.get_reviews_data(range(1, 70))

sentences = []
print "Tokenizing sentences..."
for i, review in enumerate(reviews_texts):
    tokens = nltk.word_tokenize(review)
    tokens = [token.lower() for token in tokens]
    sentences.append(LabeledSentence(words=tokens, labels=["REVIEW_" + str(i)]))

print "Doc2Vec"
model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=4)
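# The scoring script below loads a Doc2Vec model named "docvecs_70", so the model
# trained here is presumably saved under that name; a minimal sketch:
model.save("docvecs_70")

# In this gensim version the review labels live in the model's vocabulary, so a
# trained review vector can be inspected directly:
print model["REVIEW_0"]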
import numpy as np
from gensim.models.word2vec import Word2Vec

import language
from regression import BaseBowRegressor

# Word-vector dimensionality; 30 is inferred from the model filename below
# ("..._30_vector_...").
WORDVECTOR_LENGTH = 30

def pad_sequences(sequences, maxlen=None):
    # Header and first lines reconstructed (the name pad_sequences is assumed):
    # the original file is truncated above this point. Pads (or truncates) each
    # sequence of word vectors so that all documents share the same length.
    lengths = [len(s) for s in sequences]
    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)
    x = np.zeros((nb_samples, maxlen, WORDVECTOR_LENGTH)).astype('float32')
    for idx, s in enumerate(sequences):
        x[idx, :lengths[idx]] = s[:maxlen]
    return x

WORD2VEC_MODEL = "w2v_70_parts_30_vector_4_window"
PARTITIONS_TRAINING = range(1, 15)
PARTITIONS_TESTING = range(20, 22)

model = Word2Vec.load(WORD2VEC_MODEL)
reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(PARTITIONS_TRAINING)

count = 0
for votes in funny_votes_train:
    if votes > 0:
        count += 1
print "Total non-zero votes: %d of %d" % (count, len(funny_votes_train))

print "Tokenizing"
NUM_ELEMENTS_TRAIN = None  # None means use every review
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [language.tokenize_document(docpair)
                        for docpair in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])]
X_train = tokens_to_word_vectors(reviews_tokens_train, model)  # see the sketch below
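# tokens_to_word_vectors is called above but not defined anywhere in these files;
# it would have to be defined (or imported) before use. A minimal sketch of what
# it plausibly does, assuming it maps each token to its word2vec vector and drops
# out-of-vocabulary tokens (name and behaviour are assumptions):
def tokens_to_word_vectors(documents_tokens, w2v_model):
    docs_as_vectors = []
    for tokens in documents_tokens:
        vecs = [w2v_model[tok] for tok in tokens if tok in w2v_model]
        if not vecs:
            # no in-vocabulary tokens: fall back to a single zero vector
            vecs = [np.zeros(WORDVECTOR_LENGTH, dtype='float32')]
        docs_as_vectors.append(np.array(vecs, dtype='float32'))
    return docs_as_vectors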
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from regression import BaseBowRegressor
from evaluation import rmslog_error
from sklearn.ensemble import GradientBoostingRegressor
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import LinearSVC, SVR

reviews_texts, useful_votes, funny_votes, cool_votes, review_stars = \
    BaseBowRegressor.get_reviews_data(range(1, 30))

model = Doc2Vec.load("docvecs_70")

N = 50000  # number of training examples
M = 100    # doc2vec vector dimensionality

X = np.zeros((N, M))
y = funny_votes[:N]
for i in range(N):
    if 'REVIEW_' + str(i) not in model:
        print str(i) + " not in model?"
        X[i, :] = np.zeros(M)
    else:
        X[i, :] = model['REVIEW_' + str(i)]

N_test = 10000
X_test = np.zeros((N_test, M))
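# The file is truncated here. Given the imports above (GradientBoostingRegressor,
# rmslog_error), the continuation was presumably along these lines; the test-set
# offset and the rmslog_error signature are assumptions:
y_test = funny_votes[N:N + N_test]
for i in range(N_test):
    label = 'REVIEW_' + str(N + i)
    X_test[i, :] = model[label] if label in model else np.zeros(M)

regressor = GradientBoostingRegressor()
regressor.fit(X, y)
predictions = regressor.predict(X_test)
print "RMSLE: %f" % rmslog_error(y_test, predictions)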