Python tfidf примеры использования

Язык программирования: Python

Пространство имен/Пакет: preprocess

Метод/Функция: tfidf

Примеров на hotexamples.com: 3

Python tfidf - 3 примера найдено. Это лучшие примеры Python кода для preprocess.tfidf, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

Файл: Voting-General.py Проект: BYGX-wcr/DM-Final

def experiment(train_dataset, test_dataset, train_labels, test_labels=None):
    """Train the ensemble classifier on TF-IDF features and predict the test set.

    The training and test documents are concatenated and pushed through
    segmentation, noise removal and TF-IDF together, so both splits come out
    of a single ``prep.tfidf`` call. The vectorised result is then split back
    by position.

    Args:
        train_dataset: Training documents. Must support ``len()`` and ``+``
            with ``test_dataset`` (presumably a list -- confirm with callers).
        test_dataset: Test documents, same container type as
            ``train_dataset``.
        train_labels: Labels passed to ``train_EnsClf`` with the training
            vectors.
        test_labels: Optional labels for the test documents. When given, the
            score reported by ``score_EnsClf`` is printed.

    Returns:
        The value returned by ``predict_EnsClf`` for the test vectors.
    """
    n_train = len(train_dataset)

    total_dataset = train_dataset + test_dataset
    seg_dataset = prep.seg_words(total_dataset)
    seg_dataset = prep.eliminate_noise(seg_dataset, "，。、\t “”；")
    vec_dataset = prep.tfidf(seg_dataset)

    # The first n_train rows belong to the training split, the rest to test.
    vec_train_dataset = vec_dataset[:n_train]
    vec_test_dataset = vec_dataset[n_train:]

    EnsClf_model = train_EnsClf(vec_train_dataset, train_labels)
    res = predict_EnsClf(vec_test_dataset, EnsClf_model)

    # Identity check: `!= None` would compare element-wise on array-like
    # labels and make the `if` raise on an ambiguous truth value.
    if test_labels is not None:
        accuracy = score_EnsClf(vec_test_dataset, test_labels, EnsClf_model)
        print("accuracy: {0}".format(accuracy))

    return res

Пример #2

Показать файл

        with connection.cursor() as cursor:
                # Read a single record
                sql = "select r.`user_id`, r.`patent_id`, r.`ranking` from `patent_info` i join `patent_rank` r on i.`patent_id`=r.`patent_id` where i.`publication` like %s and i.`query` = %s"
                cursor.execute(sql, (hold_st, 1 ))     
                for result in cursor.fetchall():
                    rank_patent_id_h.append(result['patent_id'])
                    user_id_h.append(result['user_id'])
                    ranking_h.append(result['ranking'])
    finally:
        connection.close()
    return rank_patent_id_h, user_id_h, ranking_h
"""

# Prepare the input data for the recommender.
# Both text fields are wrapped in a Series indexed by all_patent_id and passed
# through preprocess.tfidf (defined elsewhere).
# NOTE(review): presumably tfidf returns a DataFrame of per-term weights with
# one row per patent -- confirm against preprocess.
abstract = Series(row_abstract, index = all_patent_id)
ab_vector = preprocess.tfidf(abstract)
# Add the patent id as an explicit 'item_id' column; it is the merge key below.
ab_vector['item_id'] = all_patent_id
#print(ab_vector)
title = Series(row_title, index = all_patent_id)
t_vector = preprocess.tfidf(title)
#t_vector.index.name = 'item_id'
t_vector['item_id'] = all_patent_id
#print(t_vector)

# Title feature matrix, with missing values replaced by 0.
t_factor = t_vector.fillna(0)
# Abstract feature matrix, with missing values replaced by 0.
a_factor = ab_vector.fillna(0)
# Build the combined feature matrix: outer-merge title and abstract features
# on 'item_id'; overlapping column names get the '_t' / '_a' suffixes.
factor = pd.merge(t_factor, a_factor, on = 'item_id', suffixes = ('_t', '_a'), 
                  how = 'outer')
# Input the raw rating data, three columns

Пример #3

Показать файл

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
"""
This is an unsupervised method to check if two sentences are similar.

Find tfidf for each sentence then calculate the cosine similarity for sentences that are in the same line. If the similarity
is bigger than the threshold, label the two sentences as similar; otherwise they are not.
"""

# Minimum cosine similarity at which the loop below labels a pair as similar (1).
threshold_similarity = 0.85

# Load the dataset; the 'is_duplicate' column is what the predicted label is
# compared against in the loop below.
df = pd.read_csv("proccessed.csv")
labels = df['is_duplicate']

# Calculate tfidf for each sentence.
# NOTE(review): tfidf() is defined elsewhere and called without arguments;
# presumably it returns a 2-D matrix with one row per sentence -- confirm.
questions = tfidf()

# The loop below compares row i with row total_questions + i, so the rows are
# treated as two halves of equal length.
size = questions.shape[0]
total_questions = int(size / 2)

# Label whether the two sentences of each pair are similar or not.
# correct_answers starts at zero; presumably it counts predictions that match
# `labels` (the rest of the loop is not visible here).
correct_answers = 0
for i in range(total_questions):
    similarity = cosine_similarity(questions[i],
                                   questions[total_questions + i])
    if similarity >= threshold_similarity:
        label = 1
    else:
        label = 0
    #check if the label given from the model is correct
    if label == labels[i]: