# NOTE(review): this chunk was collapsed onto a single physical line in the
# original file (a syntax error as written); reformatted into conventional
# Python. Behavior is unchanged except where explicitly marked below.
from pysrc import processData
from gensim.models import doc2vec
import pandas as pd
import numpy as np
import nltk.data
import logging

# quoting=3 is csv.QUOTE_NONE — the reviews contain embedded double quotes,
# so quote characters must not be treated as field delimiters.
train = pd.read_csv("/path/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
# test = pd.read_csv("/Users/shirleyyoung/Documents/Kaggle/Bag_of_Words_Meets_Bags_of_Popcorn/testData.tsv",
#                    header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("/path/unlabeledTrainData.tsv",
                              header=0, delimiter="\t", quoting=3)

# Pre-trained Punkt model used to split each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# One entry per review; each entry is whatever review_to_sentences returns
# (presumably a list of tokenized sentences — defined in pysrc.processData,
# not visible here).
labeled = [processData.review_to_sentences(review, tokenizer)
           for review in train["review"]]
unlabeled = [processData.review_to_sentences(review, tokenizer)
             for review in unlabeled_train["review"]]

# print(type(labeled[0]))
# print(labeled[0])
# input("Press enter to continue...")


def labelizeReviews(reviewSet, labelType):
    """
    Add a unique label to each review so doc2vec can identify it.

    :param reviewSet: iterable of (tokenized) reviews
    :param labelType: prefix used to build each review's label,
        e.g. "TRAIN" -> labels "TRAIN_0", "TRAIN_1", ...
    :return: list of labeled documents, one per input review
    """
    labelized = []
    for index, review in enumerate(reviewSet):
        # NOTE(review): the original source was truncated immediately after
        # this `for` line; the loop body below is reconstructed from the
        # standard doc2vec labeling idiom — TODO confirm against the full
        # file (older gensim used doc2vec.LabeledSentence instead of
        # TaggedDocument).
        label = '%s_%s' % (labelType, index)
        labelized.append(doc2vec.TaggedDocument(review, [label]))
    return labelized
# NOTE(review): this chunk was collapsed onto a single physical line in the
# original file (a syntax error as written); reformatted into conventional
# Python. Behavior is unchanged except where explicitly marked below.

# train.shape would give the dimensions of the data set.
train = pd.read_csv("/path/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("/path/unlabeledTrainData.tsv",
                              header=0, delimiter="\t", quoting=3)

# Use nltk's Punkt model to split each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

bag_sentences = []
# review_to_sentences() returns a list of sentences per review, so we must
# flatten with extend (the "+=" the original comment asked for) to keep
# bag_sentences a single flat list of sentences.
# FIX(review): the original code used bag_sentences.append(...), which
# produced a list of lists of sentences — not the flat list of sentences
# that Word2Vec expects — contradicting its own comment.
print("Parsing sentences from labeled training set")
for review in train["review"]:
    bag_sentences.extend(
        processData.review_to_sentences(review, tokenizer, False, True, False))

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    bag_sentences.extend(
        processData.review_to_sentences(review, tokenizer, False, True, False))

# Surface gensim's training progress messages at INFO level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for the parameters in Word2Vec.
num_features = 500   # word vector dimensionality
# Minimum word count: any word that does not occur at least this many times
# across all documents is ignored.
min_word_count = 40
num_workers = 4      # number of threads to run in parallel
context = 10         # context window size