import os

import numpy as np
import pandas as pd
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 in
# favour of sklearn.model_selection. The import is kept unchanged because code
# outside this section may still reference the `cross_validation` name.
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from helpers.kaggle_word2vec_utility import KaggleWord2VecUtility

# Load the labeled training set and the test set from the `data` directory
# that sits next to this script. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
train = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
    header=0, delimiter="\t", quoting=3)

# Target labels for the training reviews.
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
# Each review is turned into a word list by the helper and re-joined into one
# space-separated string, which is the input format TfidfVectorizer expects.
# NOTE(review): the second argument (False) presumably disables stop-word
# removal inside the helper -- confirm against KaggleWord2VecUtility.
traindata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in train["review"]
]
testdata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in test["review"]
]

print('vectorizing... ')
# Word-level TF-IDF over unigrams and bigrams; terms seen in fewer than three
# documents are dropped. The boolean options are passed as real booleans:
# recent scikit-learn releases validate parameter types and reject ints here.
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=True, smooth_idf=True,
                      sublinear_tf=True, stop_words='english')

# The vectorizer is fitted on train and test text together, then the combined
# matrix is split back at `lentrain` so both parts share one vocabulary.
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
tfv.fit(X_all)
X_all = tfv.transform(X_all)

X = X_all[:lentrain]
X_test = X_all[lentrain:]
# Load the labeled training set and the test set from the `data` directory
# that sits next to this script. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are read as ordinary characters.
train = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(
    os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
    header=0, delimiter="\t", quoting=3)

# Target labels for the training reviews.
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
# Each review is turned into a word list by the helper and re-joined into one
# space-separated string per review.
# NOTE(review): the second argument (False) presumably disables stop-word
# removal inside the helper -- confirm against KaggleWord2VecUtility.
traindata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in train["review"]
]
testdata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in test["review"]
]

print('vectorizing... ')
# Word-level TF-IDF over unigrams and bigrams; terms seen in fewer than three
# documents are dropped. The boolean options are passed as real booleans:
# recent scikit-learn releases validate parameter types and reject ints here.
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=True, smooth_idf=True,
                      sublinear_tf=True, stop_words='english')
def convert_review_to_words(review_str):
    """Return *review_str* as one space-separated string of words.

    The review is passed to ``KaggleWord2VecUtility.review_to_word_list``
    (called with its default options) and the items it returns are joined
    with single spaces.
    """
    words = KaggleWord2VecUtility.review_to_word_list(review_str)
    return ' '.join(words)
def convert_review_to_words(review_str):
    """Convert a raw review into a single string of space-separated words.

    Delegates the splitting to ``KaggleWord2VecUtility.review_to_word_list``
    using that helper's default arguments, then glues the resulting items
    back together with a single space between each.
    """
    separator = ' '
    word_list = KaggleWord2VecUtility.review_to_word_list(review_str)
    return separator.join(word_list)