import os

import numpy as np
import pandas as pd
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20
# (superseded by sklearn.model_selection); kept because code outside this
# view may still reference it.
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from helpers.kaggle_word2vec_utility import KaggleWord2VecUtility

# Load the labeled training set and the test set from the `data` directory
# that sits next to this script. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the reviews are treated as ordinary text.
train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'),
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'),
                   header=0, delimiter="\t", quoting=3)
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
# Iterate over the review values directly rather than looking each one up by
# integer label, so the cleaning step does not depend on the frame having a
# default 0..n-1 index. Each review becomes one space-joined string of the
# tokens returned by the helper.
# NOTE(review): the positional `False` is forwarded unchanged; presumably it
# disables stop-word removal in the helper -- confirm against its signature.
traindata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in train["review"]
]
testdata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in test["review"]
]

print('vectorizing... ')
# The three TF-IDF switches are real booleans: recent scikit-learn releases
# validate parameter types and reject the integer 1 for boolean options.
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', ngram_range=(1, 2),
                      use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')
# The vectorizer is fitted on the training and test reviews together; the
# training rows come first, so `lentrain` is the split point used below.
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
# fit_transform learns the vocabulary/IDF and builds the matrix in one call
# instead of a separate fit() followed by transform() on the same corpus.
X_all = tfv.fit_transform(X_all)

# Slice the combined matrix back into its training and test parts.
X = X_all[:lentrain]
X_test = X_all[lentrain:]
# NOTE(review): this section repeats the load / clean / vectorizer-construction
# steps that already appear earlier in this file, and it rebinds `tfv` to a
# new, not-yet-fitted TfidfVectorizer. Confirm whether the duplication is
# intentional before relying on either copy.

# Load the labeled training set and the test set from the `data` directory
# next to this script. quoting=3 is csv.QUOTE_NONE, so quote characters in
# the reviews are treated as ordinary text.
train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data',
                                 'labeledTrainData.tsv'),
                    header=0,
                    delimiter="\t",
                    quoting=3)
test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data',
                                'testData.tsv'),
                   header=0,
                   delimiter="\t",
                   quoting=3)
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
# Walk the review values directly instead of indexing by integer label, so
# the result does not depend on the frame carrying a default 0..n-1 index.
# NOTE(review): the positional `False` is forwarded unchanged; presumably it
# disables stop-word removal in the helper -- confirm against its signature.
traindata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in train["review"]
]
testdata = [
    " ".join(KaggleWord2VecUtility.review_to_word_list(review, False))
    for review in test["review"]
]

print('vectorizing... ')
# Boolean options are passed as True rather than 1: recent scikit-learn
# releases validate parameter types and reject integers for these switches.
tfv = TfidfVectorizer(min_df=3,
                      max_features=None,
                      strip_accents='unicode',
                      analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 2),
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True,
                      stop_words='english')
 def convert_review_to_words(review_str):
     """Return *review_str* as one space-separated string of its tokens.

     The review is tokenized by ``KaggleWord2VecUtility.review_to_word_list``
     (called with its default options) and the resulting words are joined
     with single spaces.

     This was previously defined twice in a row with identical bodies; the
     redundant first definition, which the second one immediately replaced,
     has been folded into this single definition.
     """
     words = KaggleWord2VecUtility.review_to_word_list(review_str)
     return ' '.join(words)