from pysrc import processData
from gensim.models import doc2vec
import pandas as pd
import numpy as np
import nltk.data
import logging

# Load the labeled and unlabeled IMDB review sets.  quoting=3 (QUOTE_NONE)
# keeps embedded quote characters in the review text verbatim.
_read_opts = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv("/path/labeledTrainData.tsv", **_read_opts)
# NOTE: the held-out test set read is intentionally omitted at this stage.
unlabeled_train = pd.read_csv("/path/unlabeledTrainData.tsv", **_read_opts)

# Punkt sentence splitter handed to review_to_sentences for sentence boundaries.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# One entry per review; each entry is that review's list of sentences.
labeled = [processData.review_to_sentences(r, tokenizer) for r in train["review"]]
unlabeled = [processData.review_to_sentences(r, tokenizer) for r in unlabeled_train["review"]]

# print(type(labeled[0]))
# print(labeled[0])
# input("Press enter to continue...")

def labelizeReviews(reviewSet, labelType):
    """
    Attach a unique doc2vec tag to each review.

    :param reviewSet: iterable of tokenized reviews (each review a list of words)
    :param labelType: tag prefix (e.g. "TRAIN"); tags come out as
        "TRAIN_0", "TRAIN_1", ... so every document gets a distinct id
    :return: list of doc2vec.TaggedDocument objects, one per review,
        in the same order as reviewSet
    """
    # BUG FIX: in the original file this function was truncated — the for
    # loop had no body and top-level code followed, which made the whole
    # module a SyntaxError.  Reconstructed the standard tagging loop.
    labelized = []
    for index, review in enumerate(reviewSet):
        labelized.append(
            doc2vec.TaggedDocument(words=review,
                                   tags=["%s_%s" % (labelType, index)]))
    return labelized
# train.shape: get the dimensions of the data set
# NOTE(review): these reads repeat the loads done earlier in the file with
# the same paths; kept for compatibility with the script's flow.
train = pd.read_csv("/path/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("/path/unlabeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)

# use nltk to split the review to sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Word2Vec expects a flat list of sentences, each sentence a list of words.
# review_to_sentences() returns a LIST of sentences per review, so the
# per-review results must be concatenated with += (extend).
# BUG FIX: the original used bag_sentences.append(...), which nested one
# list-of-sentences per review and produced the wrong shape for training —
# exactly the mistake the original comment above this loop warned about.
bag_sentences = []
print("Parsing sentences from labeled training set")
for review in train["review"]:
    bag_sentences += processData.review_to_sentences(review, tokenizer, False, True, False)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    bag_sentences += processData.review_to_sentences(review, tokenizer, False, True, False)

# Show gensim's INFO-level progress messages during training.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s')

# Word2Vec hyper-parameters.
num_features = 500    # dimensionality of the learned word vectors
min_word_count = 40   # words seen fewer than this many times across all
                      # documents are ignored
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size