Example No. 1
import numpy as np

import common_utils
import naive_bayes

# train a naive Bayes spam classifier on the labelled training data
path_train = './spam_train.csv'
sms_words, class_labels = common_utils.read_sms(path_train)
vocabulary_list = common_utils.create_vocabulary_list(sms_words)
train_marked_words = common_utils.set_of_words_list_to_vector(vocabulary_list, sms_words)
train_marked_words = np.array(train_marked_words)
p_words_spamicity, p_words_healthy, p_spam = naive_bayes.training(train_marked_words, class_labels)

# classify the test data
path = './spam_data.csv'
sms_words, class_labels = common_utils.read_sms(path)
with open(path, "r") as f:
    sms_list = f.readlines()

result_list = []

for i in range(len(sms_words)):
    sms_type = naive_bayes.classify(vocabulary_list, p_words_spamicity,
                                    p_words_healthy, p_spam, sms_words[i])
    # label 0 means ham, anything else spam; keep the original message text
    label = "ham" if sms_type == 0 else "spam"
    result_list.append(label + "\t" + sms_list[i].split('\t')[1])

common_utils.write_file("result", result_list)

# quality control
path_full_spam = './spam_full.csv'
quality_control = naive_bayes.quality_control(path_full_spam, 1000)

common_utils.write_file("quality_control", quality_control)
Example No. 2
import gensim
from nltk.tokenize import word_tokenize

import common_utils

# `sentences`, `path_wordsim` and `vocabulary` are assumed to be prepared
# earlier in the original script; this snippet starts mid-loop
data = []
for sentence in sentences:
    temp = []
    # tokenize the sentence into lower-cased words
    for j in word_tokenize(sentence):
        temp.append(j.lower())
    data.append(temp)

# create CBOW model (sg=0 is the Word2Vec default)
# note: in gensim >= 4.0 the dimensionality argument is `vector_size` (formerly `size`)
cbow_model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

# create skip-gram model
skip_gram_model = gensim.models.Word2Vec(data,
                                         min_count=1,
                                         vector_size=100,
                                         window=5,
                                         sg=1)

consistent_wordsim = common_utils.read_consistent_wordsim(
    path_wordsim, vocabulary)

data_cbow_sim = []
data_skip_gram_sim = []

for item in consistent_wordsim:
    # in gensim >= 4.0, similarity lives on the KeyedVectors object (model.wv)
    cbow_sim = cbow_model.wv.similarity(item[0], item[1])
    skip_gram_sim = skip_gram_model.wv.similarity(item[0], item[1])

    data_cbow_sim.append('{}\t{}\t{}'.format(item[0], item[1], cbow_sim))
    data_skip_gram_sim.append('{}\t{}\t{}'.format(item[0], item[1],
                                                  skip_gram_sim))

common_utils.write_file('data_cbow_sim', data_cbow_sim)
common_utils.write_file('data_skip_gram_sim', data_skip_gram_sim)
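
# For reference, a minimal sketch of what common_utils.read_consistent_wordsim
# presumably does, judging from its call sites in Examples No. 2 and No. 3:
# parse the tab-separated WordSim-353 gold-standard file and keep only the
# pairs whose words both occur in the corpus vocabulary. This is an assumed
# implementation, not the actual common_utils module.
def read_consistent_wordsim_sketch(path, vocabulary):
    pairs = []
    with open(path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            word1, word2, score = parts[0], parts[1], parts[2]
            if word1 in vocabulary and word2 in vocabulary:
                pairs.append((word1, word2, float(score)))
    return pairs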
Example No. 3
import numpy as np
import pandas as pd

import common_utils

# PLSA outputs (document_topic_prob, topic_word_prob, documents,
# number_of_topics, vocabulary) are computed earlier in the original script
common_utils.print_document_topic_distribution(document_topic_prob, documents,
                                               number_of_topics, 3,
                                               "./document-topic.txt")

path_wordsim = './wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'
data_cos = []
data_scalar = []

plsa_matrix = pd.DataFrame(data=topic_word_prob, columns=vocabulary)

consistent_wordsim = common_utils.read_consistent_wordsim(
    path_wordsim, vocabulary)

for item in consistent_wordsim:
    vector1 = plsa_matrix.get(item[0]).values
    vector2 = plsa_matrix.get(item[1]).values
    vec1 = np.array([vector1])
    vec2 = np.array([vector2])
    vec1 = vec1[~np.isnan(vec1)]
    vec2 = vec2[~np.isnan(vec2)]
    number_cos = common_utils.cosine_similarity(vec1, vec2)
    number_scalar = common_utils.scalar(vec1, vec2)

    # PLSA model
    data_cos.append('{}\t{}\t{}'.format(item[0], item[1], number_cos))
    data_scalar.append('{}\t{}\t{}'.format(item[0], item[1], number_scalar))

# write to file
common_utils.write_file('data_plsa_cos', data_cos)
common_utils.write_file('data_plsa_scalar', data_scalar)
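
# For reference, minimal sketches of the two common_utils helpers called above,
# assuming `scalar` is a plain dot product and `cosine_similarity` the usual
# normalized dot product; the actual common_utils module may differ.
def scalar_sketch(vec1, vec2):
    # unnormalized similarity: just the dot product
    return np.dot(vec1, vec2)

def cosine_similarity_sketch(vec1, vec2):
    # cos(theta) = (v1 . v2) / (||v1|| * ||v2||)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))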
Example No. 4
import common_utils
import tokenizator

# counter_google_blog_dict, counter_games_dict, counter_news_dict and
# stop_words are built earlier in the original script
common_words_google_blog_dict = counter_google_blog_dict.most_common(30)
common_words_games_dict = counter_games_dict.most_common(30)
common_words_news_dict = counter_news_dict.most_common(30)

google_blog_list = []
games_list = []
news_list = []

# most_common returns (word, count) pairs; keep just the words
for word in common_words_google_blog_dict:
    google_blog_list.append(word[0])

for word in common_words_games_dict:
    games_list.append(word[0])

for word in common_words_news_dict:
    news_list.append(word[0])

# delete duplicates and sort
stop_words = tokenizator.unique_words(stop_words)
stop_words.sort()

google_blog_list = tokenizator.unique_words(google_blog_list)
google_blog_list.sort()

games_list = tokenizator.unique_words(games_list)
games_list.sort()

news_list = tokenizator.unique_words(news_list)
news_list.sort()

# write files
common_utils.write_file("stop_words.txt", stop_words)

common_utils.write_file("google_blog_dict.txt", google_blog_list)
common_utils.write_file("games_dict.txt", games_list)
common_utils.write_file("news_dict.txt", news_list)