Python tfidf 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: preprocess

메소드/함수: tfidf

hotexamples.com에서의 예제들: 3

Python tfidf - 3개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 preprocess.tfidf에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

파일: Voting-General.py 프로젝트: BYGX-wcr/DM-Final

def experiment(train_dataset, test_dataset, train_labels, test_labels=None):
    """Train the ensemble classifier on TF-IDF features and predict the test set.

    The train and test documents are concatenated and pushed through the same
    preprocessing pipeline (word segmentation, noise removal, TF-IDF) in a
    single pass, then split back apart by position.

    Args:
        train_dataset: Training documents. Must support ``+`` concatenation
            with ``test_dataset`` and ``len()`` (presumably a list of
            strings -- confirm against the caller).
        test_dataset: Documents to predict on; same type as ``train_dataset``.
        train_labels: Labels passed to ``train_EnsClf`` alongside the
            vectorized training documents.
        test_labels: Optional labels for ``test_dataset``. When given, the
            accuracy reported by ``score_EnsClf`` is printed.

    Returns:
        Whatever ``predict_EnsClf`` returns for the vectorized test documents.
    """
    n_train = len(train_dataset)

    # Vectorize train and test together so both halves go through one
    # tfidf call and can be split back by row position afterwards.
    total_dataset = train_dataset + test_dataset
    seg_dataset = prep.seg_words(total_dataset)
    seg_dataset = prep.eliminate_noise(seg_dataset, "，。、\t “”；")
    vec_dataset = prep.tfidf(seg_dataset)

    vec_train_dataset = vec_dataset[:n_train]
    vec_test_dataset = vec_dataset[n_train:]

    EnsClf_model = train_EnsClf(vec_train_dataset, train_labels)
    res = predict_EnsClf(vec_test_dataset, EnsClf_model)

    # Identity check, not `!= None`: with array-like labels (e.g. NumPy) the
    # inequality is evaluated element-wise and the `if` would raise on an
    # ambiguous truth value.
    if test_labels is not None:
        print("accuracy: {0}".format(score_EnsClf(vec_test_dataset, test_labels, EnsClf_model)))

    return res

예제 #2

파일 보기

        with connection.cursor() as cursor:
                # Read a single record
                sql = "select r.`user_id`, r.`patent_id`, r.`ranking` from `patent_info` i join `patent_rank` r on i.`patent_id`=r.`patent_id` where i.`publication` like %s and i.`query` = %s"
                cursor.execute(sql, (hold_st, 1 ))     
                for result in cursor.fetchall():
                    rank_patent_id_h.append(result['patent_id'])
                    user_id_h.append(result['user_id'])
                    ranking_h.append(result['ranking'])
    finally:
        connection.close()
    return rank_patent_id_h, user_id_h, ranking_h
"""

# Prepare the input data for the recommender.
# Abstract values indexed by patent id (presumably raw abstract text -- confirm
# where row_abstract is built).
abstract = Series(row_abstract, index = all_patent_id)
ab_vector = preprocess.tfidf(abstract)
# Attach the patent ids as an explicit column so it can serve as the merge key.
ab_vector['item_id'] = all_patent_id
#print(ab_vector)
# Same treatment for the titles.
title = Series(row_title, index = all_patent_id)
t_vector = preprocess.tfidf(title)
#t_vector.index.name = 'item_id'
t_vector['item_id'] = all_patent_id
#print(t_vector)

# Title feature matrix, with missing values replaced by 0.
t_factor = t_vector.fillna(0)
# Abstract feature matrix, with missing values replaced by 0.
a_factor = ab_vector.fillna(0)
# Build the full feature matrix by outer-joining both on 'item_id'; columns
# present in both frames get the '_t' (title) / '_a' (abstract) suffix.
factor = pd.merge(t_factor, a_factor, on = 'item_id', suffixes = ('_t', '_a'), 
                  how = 'outer')
# Input the raw rating data, three columns.

예제 #3

파일 보기

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
"""
This is an unsupervised method to check if two sentences are similar.

Find the tfidf for each sentence, then calculate the cosine similarity for sentences that are in the same line. If the similarity
is at least the threshold, label the two sentences as similar; otherwise label them as not similar.
"""

threshold_similarity = 0.85

df = pd.read_csv("proccessed.csv")
labels = df['is_duplicate']

#calculate tfidf for each sentence
questions = tfidf()

size = questions.shape[0]
total_questions = int(size / 2)

#Label if two sentences are similar or not
correct_answers = 0
for i in range(total_questions):
    similarity = cosine_similarity(questions[i],
                                   questions[total_questions + i])
    if similarity >= threshold_similarity:
        label = 1
    else:
        label = 0
    #check if the label given from the model is correct
    if label == labels[i]: