# Assumed module-level imports for this script; cfun (clustering helpers) and
# preProc (data cleaning) are local modules elsewhere in this repository.
import pickle
import time

import numpy as np
import pandas as pd
from gensim.models import Word2Vec


def main():
    modelName = "../../classifier/Word2VectforNLPTraining"
    # Word2Vec.load() expects a file path, not an open file handle
    model = Word2Vec.load(modelName)
    # model.init_sims(replace=True)
    wordVectors = model.syn0  # word-vector matrix (model.wv.vectors in newer gensim)
    # print(wordVectors[0])

    # Use roughly one cluster per five vocabulary words
    num_clusters = int(wordVectors.shape[0] / 5)
    # print("number of clusters: {}".format(num_clusters))
    # input("Press enter to continue:")

    print("Clustering...")
    startTime = time.time()
    clusterIndex = cfun.kmeans(num_clusters, wordVectors)
    endTime = time.time()
    print("Time taken for clustering: {} seconds".format(endTime - startTime))

    # Save the cluster assignments; the with-block ensures the file is closed
    with open("../../classifier/doc2vec/clusterIndex.pickle", "wb") as clusterf:
        pickle.dump(clusterIndex, clusterf)

    # Create a word/index dictionary mapping each vocabulary word to a cluster number.
    # zip() makes an iterator that aggregates elements from each of the iterables.
    index_word_map = dict(zip(model.index2word, clusterIndex))

    train = pd.read_csv("../../data/labeledTrainData.tsv",
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("../../data/testData.tsv",
                       header=0, delimiter="\t", quoting=3)

    # One feature vector per review, one dimension per cluster
    # (np.float is deprecated in recent NumPy; plain float is equivalent)
    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=float)
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=float)

    # Strictly speaking we don't need to clean the data: junk terms were never seen
    # while the model was trained, so they are absent from its vocabulary and would
    # be ignored anyway. Cleaning still speeds up feature-vector creation and
    # classification.
    print("Processing training data...")
    cleanedTrainingData = preProc.clean_data(train)
    for counter, review in enumerate(cleanedTrainingData):
        trainingDataFV[counter] = cfun.create_bag_of_centroids(
            review, num_clusters, index_word_map)

    print("Processing test data...")
    cleaned_test_data = preProc.clean_data(test)
    for counter, review in enumerate(cleaned_test_data):
        testDataFV[counter] = cfun.create_bag_of_centroids(
            review, num_clusters, index_word_map)

    n_estimators = 100
    result = cfun.rfClassifer(n_estimators, trainingDataFV,
                              train["sentiment"], testDataFV)

    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Word2Vec_Clustering.csv", index=False, quoting=3)
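# The feature construction above is a "bag of centroids": each review becomes a
# vector of cluster-occurrence counts. Below is a minimal sketch of what
# cfun.create_bag_of_centroids presumably does, assuming `review` is a list of
# word tokens; the repository's actual implementation may differ in detail.
import numpy as np


def create_bag_of_centroids(review, num_clusters, index_word_map):
    # One slot per cluster; each in-vocabulary word votes for its cluster
    bag_of_centroids = np.zeros(num_clusters, dtype=float)
    for word in review:
        if word in index_word_map:  # words outside the model's vocabulary are skipped
            bag_of_centroids[index_word_map[word]] += 1
    return bag_of_centroids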
# Assumed module-level imports for this script; cfun, preProc, and myhash (the
# custom hash function used when the model was trained) are local to this repository.
import logging
import pickle
import time

import numpy as np
import pandas as pd
from gensim.models import doc2vec


def main():
    # Set up logging configuration
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Load the trained model. load() restores the model's state, including its
    # hashfxn, so myhash only needs to be importable; the throwaway
    # doc2vec.Doc2Vec(hashfxn=myhash) the original constructed first was dead code.
    model = doc2vec.Doc2Vec.load("../../classifier/Doc2VecTaggedDocs")
    word_vectors = model.syn0  # word-vector matrix (model.wv.vectors in newer gensim)

    # Use roughly one cluster per five vocabulary words
    num_clusters = int(word_vectors.shape[0] / 5)
    # print("number of clusters: {}".format(num_clusters))

    print("Clustering...")
    startTime = time.time()
    cluster_index = cfun.kmeans(num_clusters, word_vectors)
    endTime = time.time()
    print("Time taken for clustering: {} minutes".format((endTime - startTime) / 60))

    # Save the cluster assignments; the with-block ensures the file is closed
    with open("../../classifier/clusterIndex.pickle", "wb") as clusterf:
        pickle.dump(cluster_index, clusterf)

    # Create a word/index dictionary mapping each vocabulary word to a cluster number.
    # zip() makes an iterator that aggregates elements from each of the iterables.
    index_word_map = dict(zip(model.index2word, cluster_index))

    train = pd.read_csv("../../data/labeledTrainData.tsv",
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("../../data/testData.tsv",
                       header=0, delimiter="\t", quoting=3)

    # Feature vectors for the training and test data: one row per review,
    # one dimension per cluster (np.float is deprecated; plain float is equivalent)
    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=float)
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=float)

    # Populate the feature vectors after cleaning the data
    print("Processing training data...")
    cleaned_training_data = preProc.clean_data(train)
    for counter, review in enumerate(cleaned_training_data):
        trainingDataFV[counter] = cfun.create_bag_of_centroids(
            review, num_clusters, index_word_map)

    print("Processing test data...")
    cleaned_test_data = preProc.clean_data(test)
    for counter, review in enumerate(cleaned_test_data):
        testDataFV[counter] = cfun.create_bag_of_centroids(
            review, num_clusters, index_word_map)

    n_estimators = 100
    result = cfun.rfClassifer(n_estimators, trainingDataFV,
                              train["sentiment"], testDataFV)

    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)
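# Both scripts delegate clustering to cfun.kmeans. A hedged sketch, assuming the
# helper wraps scikit-learn's KMeans and returns one cluster index per word
# vector; the repository's actual implementation may differ.
from sklearn.cluster import KMeans


def kmeans(num_clusters, word_vectors):
    # fit_predict() fits the model and returns each row's cluster assignment
    return KMeans(n_clusters=num_clusters).fit_predict(word_vectors)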
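# Both scripts finish with cfun.rfClassifer. A minimal sketch under the assumption
# that it fits scikit-learn's RandomForestClassifier on the training feature
# vectors and returns predicted labels for the test vectors; internals such as
# extra hyperparameters or cross-validation are not shown in the original.
from sklearn.ensemble import RandomForestClassifier


def rfClassifer(n_estimators, trainingDataFV, labels, testDataFV):
    forest = RandomForestClassifier(n_estimators=n_estimators)
    forest.fit(trainingDataFV, labels)  # train on the labeled reviews
    return forest.predict(testDataFV)   # predict sentiment for the test reviews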