import time

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

# Project-local helper modules (not included in this file).
import kMeans
import processData
import randomForestClassifier


def main_avg_vectors():
    """
    Predict review sentiment with a random forest trained on averaged
    Word2Vec feature vectors.
    """
    train = pd.read_csv("/path/labeledTrainData.tsv",
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("/path/testData.tsv",
                       header=0, delimiter="\t", quoting=3)

    modelName = "/path/Word2VectforNLPTraining"
    model = Word2Vec.load(modelName)

    print("Processing training data...")
    cleaned_training_data = processData.clean_data(train)
    trainingDataFV = getAvgFeatureVecs(cleaned_training_data, model)
    print("Processing test data...")
    cleaned_test_data = processData.clean_data(test)
    testDataFV = getAvgFeatureVecs(cleaned_test_data, model)

    n_estimators = 100
    result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV, train["sentiment"], testDataFV)
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Word2Vec_AvgVecPredict.csv", index=False, quoting=3)

def main_clustering():
    """
    Predict review sentiment with a random forest trained on
    bag-of-centroids features: the Word2Vec vocabulary vectors are
    clustered with k-means, and each review is represented by its
    per-cluster word counts.
    """
    modelName = "Word2VectforNLPTraining"
    model = Word2Vec.load(modelName)

    # init_sims(replace=True) from the original code was deprecated in
    # gensim 4.x; model.wv.get_normed_vectors() can be used instead if
    # normalised vectors are needed.

    # The raw vectors moved from model.syn0 to model.wv.vectors in newer
    # gensim releases (gensim >= 4.0 is assumed here).
    word_vectors = model.wv.vectors
    # Aim for an average of five words per cluster.
    num_clusters = word_vectors.shape[0] // 5
    print("Clustering...")
    startTime = time.time()
    cluster_index = kMeans.kmeans(num_clusters, word_vectors)
    endTime = time.time()

    print("Time taken for clustering: {} seconds".format(endTime - startTime))


    # Create a word/index dictionary, mapping each vocabulary word to a
    # cluster number; zip() pairs the i-th word with the i-th cluster index.
    # (index2word moved to model.wv.index_to_key in gensim 4.x.)
    index_word_map = dict(zip(model.wv.index_to_key, cluster_index))

    def create_bag_of_centroids(reviewData):
        """
        Count how many of the review's words fall into each cluster.
        Returns a numpy array of length num_clusters; each count serves
        as one feature for classification.
        :param reviewData: list of words from one cleaned review
        :return: numpy array of per-cluster word counts
        """
        featureVector = np.zeros(num_clusters, dtype=np.float64)
        for word in reviewData:
            if word in index_word_map:
                index = index_word_map[word]
                featureVector[index] += 1
        return featureVector
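
    # Example: with num_clusters = 3 and index_word_map mapping "good" -> 0
    # and "movie" -> 2, the review ["good", "movie", "good"] produces the
    # feature vector [2., 0., 1.].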

    train = pd.read_csv("/path/labeledTrainData.tsv",
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("/path/testData.tsv",
                       header=0, delimiter="\t", quoting=3)

    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=np.float64)
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=np.float64)

    print("Processing training data...")
    cleaned_training_data = processData.clean_data(train)
    for i, review in enumerate(cleaned_training_data):
        trainingDataFV[i] = create_bag_of_centroids(review)

    print("Processing test data...")
    cleaned_test_data = processData.clean_data(test)
    for i, review in enumerate(cleaned_test_data):
        testDataFV[i] = create_bag_of_centroids(review)

    n_estimators = 100
    result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV, train["sentiment"], testDataFV)
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)