import numpy as np

import common_utils
import naive_bayes

# train data
path_train = './spam_train.csv'
sms_words, class_labels = common_utils.read_sms(path_train)
vocabulary_list = common_utils.create_vocabulary_list(sms_words)
train_marked_words = common_utils.set_of_words_list_to_vector(vocabulary_list, sms_words)
train_marked_words = np.array(train_marked_words)
p_words_spamicity, p_words_healthy, p_spam = naive_bayes.training(train_marked_words, class_labels)

# classify test data
path = './spam_data.csv'
sms_words, class_labels = common_utils.read_sms(path)
with open(path, "r") as f:
    sms_list = f.readlines()

result_list = []
for i in range(len(sms_words)):
    sms_type = naive_bayes.classify(vocabulary_list, p_words_spamicity, p_words_healthy, p_spam, sms_words[i])
    label = "ham" if sms_type == 0 else "spam"
    result_list.append(label + "\t" + sms_list[i].split('\t')[1])
common_utils.write_file("result", result_list)

# quality control
path_full_spam = './spam_full.csv'
quality_control = naive_bayes.quality_control(path_full_spam, 1000)
common_utils.write_file("quality_control", quality_control)
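# The naive_bayes module is not shown here; the following is only a minimal
# sketch of what training() and classify() could look like internally, assuming
# a Bernoulli-style bag-of-words model with Laplace smoothing and log
# probabilities (to avoid underflow). The real classify() above additionally
# converts the token list into a 0/1 vector over vocabulary_list first; that
# step is assumed to have happened before classify_sketch() is called.

def training_sketch(train_matrix, class_labels):
    """train_matrix: (n_sms, n_words) 0/1 array; class_labels: 1 = spam, 0 = ham."""
    n_sms, n_words = train_matrix.shape
    p_spam = sum(class_labels) / float(n_sms)
    spam_counts = np.ones(n_words)   # Laplace smoothing: start every count at 1
    ham_counts = np.ones(n_words)
    spam_total, ham_total = 2.0, 2.0
    for i in range(n_sms):
        if class_labels[i] == 1:
            spam_counts += train_matrix[i]
            spam_total += np.sum(train_matrix[i])
        else:
            ham_counts += train_matrix[i]
            ham_total += np.sum(train_matrix[i])
    # log P(word | spam), log P(word | ham), P(spam)
    return np.log(spam_counts / spam_total), np.log(ham_counts / ham_total), p_spam

def classify_sketch(word_vector, p_words_spamicity, p_words_healthy, p_spam):
    """Return 1 (spam) or 0 (ham) for one 0/1 word vector."""
    p1 = np.sum(word_vector * p_words_spamicity) + np.log(p_spam)
    p0 = np.sum(word_vector * p_words_healthy) + np.log(1.0 - p_spam)
    return 1 if p1 > p0 else 0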
import gensim
from nltk.tokenize import word_tokenize

# tokenize each sentence into lowercased words, one token list per sentence
# (sentences: iterable of raw sentence strings prepared earlier in the script)
data = []
for sentence in sentences:
    temp = []
    for j in word_tokenize(sentence):
        temp.append(j.lower())
    data.append(temp)

# create CBOW model (gensim 3.x API: 'size' and Word2Vec.similarity;
# gensim 4+ renames these to 'vector_size' and model.wv.similarity)
cbow_model = gensim.models.Word2Vec(data, min_count=1, size=100, window=5)
# create skip-gram model
skip_gram_model = gensim.models.Word2Vec(data, min_count=1, size=100, window=5, sg=1)

# keep only WordSim-353 pairs whose words occur in the corpus vocabulary
consistent_wordsim = common_utils.read_consistent_wordsim(path_wordsim, vocabulary)
data_cbow_sim = []
data_skip_gram_sim = []
for item in consistent_wordsim:
    cbow_sim = cbow_model.similarity(item[0], item[1])
    skip_gram_sim = skip_gram_model.similarity(item[0], item[1])
    data_cbow_sim.append('{}\t{}\t{}'.format(item[0], item[1], cbow_sim))
    data_skip_gram_sim.append('{}\t{}\t{}'.format(item[0], item[1], skip_gram_sim))

common_utils.write_file('data_cbow_sim', data_cbow_sim)
common_utils.write_file('data_skip_gram_sim', data_skip_gram_sim)
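# common_utils.read_consistent_wordsim is not shown; a plausible implementation
# (an assumption, not the project's actual code) reads the WordSim-353 gold
# standard file and keeps only pairs where both words appear in the vocabulary,
# so every returned pair can actually be scored by the models above.
def read_consistent_wordsim_sketch(path_wordsim, vocabulary):
    pairs = []
    with open(path_wordsim, "r") as f:
        for line in f:
            parts = line.strip().split('\t')   # word1 <tab> word2 <tab> human score
            if len(parts) < 3:
                continue
            w1, w2, score = parts[0].lower(), parts[1].lower(), float(parts[2])
            if w1 in vocabulary and w2 in vocabulary:
                pairs.append((w1, w2, score))
    return pairs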
common_utils.print_document_topic_distribution(document_topic_prob, documents, number_of_topics, 3, "./document-topic.txt")

# compare PLSA word vectors (per-word topic distributions) on the WordSim-353 pairs
path_wordsim = './wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'
data_cos = []
data_scalar = []
plsa_matrix = pd.DataFrame(data=topic_word_prob, columns=vocabulary)
consistent_wordsim = common_utils.read_consistent_wordsim(path_wordsim, vocabulary)
for item in consistent_wordsim:
    # each column of plsa_matrix is the topic-probability vector of one word
    vector1 = plsa_matrix.get(item[0]).values
    vector2 = plsa_matrix.get(item[1]).values
    vec1 = np.array([vector1])
    vec2 = np.array([vector2])
    vec1 = vec1[~np.isnan(vec1)]
    vec2 = vec2[~np.isnan(vec2)]
    number_cos = common_utils.cosine_similarity(vec1, vec2)
    number_scalar = common_utils.scalar(vec1, vec2)
    data_cos.append('{}\t{}\t{}'.format(item[0], item[1], number_cos))
    data_scalar.append('{}\t{}\t{}'.format(item[0], item[1], number_scalar))

# write to file
common_utils.write_file('data_plsa_cos', data_cos)
common_utils.write_file('data_plsa_scalar', data_scalar)
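# Likely shape of the two similarity helpers used above (an assumption, since
# common_utils is not shown): cosine similarity normalises by vector length,
# while scalar is the raw dot product, which is why the two output files can
# rank the same word pairs differently.
def cosine_similarity_sketch(vec1, vec2):
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return float(np.dot(vec1, vec2) / denom) if denom else 0.0

def scalar_sketch(vec1, vec2):
    return float(np.dot(vec1, vec2))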
common_words_games_dict = counter_games_dict.most_common(30)
common_words_news_dict = counter_news_dict.most_common(30)

for word in common_words_google_blog_dict:
    google_blog_list.append(word[0])
for word in common_words_games_dict:
    games_list.append(word[0])
for word in common_words_news_dict:
    news_list.append(word[0])

# delete duplicates and sort
stop_words = tokenizator.unique_words(stop_words)
stop_words.sort()
google_blog_list = tokenizator.unique_words(google_blog_list)
google_blog_list.sort()
games_list = tokenizator.unique_words(games_list)
games_list.sort()
news_list = tokenizator.unique_words(news_list)
news_list.sort()

# write files
common_utils.write_file("stop_words.txt", stop_words)
common_utils.write_file("google_blog_dict.txt", google_blog_list)
common_utils.write_file("games_dict.txt", games_list)
common_utils.write_file("news_dict.txt", news_list)
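# common_utils.write_file is used throughout this section; a minimal version
# (an assumption, the real helper may differ) simply writes one list entry per line.
def write_file_sketch(file_name, lines):
    with open(file_name, "w") as f:
        for line in lines:
            f.write(str(line).rstrip('\n') + '\n')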