# ===== Example 1 (label: 0) =====
    sentences = []
    for row in data.train_df['review'].append(data.unlabeled_df['review']):
        sentences_df = pandas.DataFrame(nltk.sent_tokenize(row.decode('utf-8').strip()), columns=['sentence'])
        sentences_df = process.raw_to_words(sentences_df, 'sentence')
        sentences += sentences_df['sentence'].tolist()

    model = gensim.models.Word2Vec(sentences, size=word_vec_dim, window=10, min_count=1, workers=1, seed=process.seed)
    return model


# Load a pre-trained word2vec model instead of training one from scratch.
# NOTE(review): Word2Vec.load_word2vec_format / .vocab / .init_sims is the
# pre-1.0 gensim API; newer gensim moved these onto KeyedVectors — confirm
# the pinned gensim version before upgrading.
# word2vec = build_word2vec()
word2vec = gensim.models.Word2Vec.load_word2vec_format('./process/300features_10contexts.bin', binary=True)
# Normalize vectors in place and discard the raw weights to save memory
# (replace=True means the model can no longer continue training).
word2vec.init_sims(replace=True)

# Clean each review split, restricting to the word2vec vocabulary
# (presumably raw_to_texts drops out-of-vocabulary words — verify in the
# process module). Raw DataFrames are deleted immediately after use to
# keep peak memory down.
del data.unlabeled_df
train_df = process.raw_to_texts(data.train_df, 'review', dictionary=word2vec.vocab)
del data.train_df
test_df = process.raw_to_texts(data.test_df, 'review', dictionary=word2vec.vocab)
del data.test_df

# Build an integer word index from the TRAINING texts only; test texts are
# mapped through the same index below.
sequence_tokenizer = keras.preprocessing.text.Tokenizer()
sequence_tokenizer.fit_on_texts(line.encode('utf-8') for line in train_df['review'].values)

# Number of distinct words the tokenizer saw (word_index values start at 1).
max_features = len(sequence_tokenizer.word_index)

# Turn each review into a fixed-length integer sequence (maxlen=2500;
# presumably padded/truncated inside texts_to_sequences — confirm).
train = process.texts_to_sequences(train_df, 'review', sequence_tokenizer, maxlen=2500)
del train_df
test = process.texts_to_sequences(test_df, 'review', sequence_tokenizer, maxlen=2500)
del test_df

# Embedding matrix with one row per tokenizer index; row 0 stays zero
# (tokenizer indices start at 1, so index 0 acts as padding). word_vec_dim
# is defined elsewhere in the file — presumably 300 to match the
# pretrained "300features" vectors; TODO confirm.
weights = numpy.zeros((max_features + 1, word_vec_dim))
# ===== Example 2 (label: 0) =====
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import generators
from __future__ import nested_scopes
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import with_statement

import pandas
import sklearn.feature_extraction

import data
import process

# Clean each review split once, dropping each raw DataFrame immediately
# afterwards to keep the peak memory footprint low.
train_df = process.raw_to_texts(data.train_df, 'review', remove_stopwords=True)
del data.train_df
test_df = process.raw_to_texts(data.test_df, 'review', remove_stopwords=True)
del data.test_df
unlabeled_df = process.raw_to_texts(data.unlabeled_df, 'review', remove_stopwords=True)
del data.unlabeled_df

# TF-IDF over word uni-/bi-/tri-grams, capped at the 10,000 most frequent
# terms; sublinear_tf replaces raw tf with 1 + log(tf).
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english', ngram_range=(1, 3),
                                                             max_features=10000, sublinear_tf=True)
# Fit the vocabulary on labelled + unlabelled reviews for better coverage.
# FIX: Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pandas.concat produces the same concatenated Series.
vectorizer.fit(pandas.concat([train_df['review'], unlabeled_df['review']]))
del unlabeled_df

# Densify the sparse TF-IDF matrices for downstream code that expects
# plain numpy arrays; free the text frames once transformed.
train = vectorizer.transform(train_df['review']).toarray()
del train_df
test = vectorizer.transform(test_df['review']).toarray()
del test_df