예제 #1
0
def experiment(train_dataset, test_dataset, train_labels, test_labels=None):
    """Train the ensemble classifier on the training texts and predict the test texts.

    The training and test samples are concatenated and run through the
    preprocessing pipeline (word segmentation, noise removal, TF-IDF) in a
    single pass, then the vectorized result is split back at the original
    train/test boundary.
    NOTE(review): presumably this is done so both halves are vectorized with
    the same vocabulary -- confirm against ``prep.tfidf``.

    Args:
        train_dataset: Training samples; must support ``+`` with
            ``test_dataset`` and ``len()``.
        test_dataset: Samples to predict.
        train_labels: Labels passed to ``train_EnsClf`` with the training vectors.
        test_labels: Optional labels for the test samples. When given, the
            value returned by ``score_EnsClf`` is printed as the accuracy.

    Returns:
        The value returned by ``predict_EnsClf`` for the test vectors.
    """
    n_train = len(train_dataset)

    total_dataset = train_dataset + test_dataset
    seg_dataset = prep.seg_words(total_dataset)
    seg_dataset = prep.eliminate_noise(seg_dataset, ",。、\t “”;")
    vec_dataset = prep.tfidf(seg_dataset)

    # Split the jointly vectorized data back into its train / test parts.
    vec_train_dataset = vec_dataset[0:n_train]
    vec_test_dataset = vec_dataset[n_train:]

    EnsClf_model = train_EnsClf(vec_train_dataset, train_labels)
    res = predict_EnsClf(vec_test_dataset, EnsClf_model)
    # Identity check: `!= None` would compare element-wise for array-like
    # labels (NumPy arrays, pandas Series) and make the `if` raise.
    if test_labels is not None:
        print("accuracy: {0}".format(score_EnsClf(vec_test_dataset, test_labels, EnsClf_model)))

    return res
예제 #2
0
        with connection.cursor() as cursor:
                # Read a single record
                sql = "select r.`user_id`, r.`patent_id`, r.`ranking` from `patent_info` i join `patent_rank` r on i.`patent_id`=r.`patent_id` where i.`publication` like %s and i.`query` = %s"
                cursor.execute(sql, (hold_st, 1 ))     
                for result in cursor.fetchall():
                    rank_patent_id_h.append(result['patent_id'])
                    user_id_h.append(result['user_id'])
                    ranking_h.append(result['ranking'])
    finally:
        connection.close()
    return rank_patent_id_h, user_id_h, ranking_h
"""

# Prepare the input data for the recommender.
# Series built from row_abstract, indexed by all_patent_id.
abstract = Series(row_abstract, index = all_patent_id)
# NOTE(review): preprocess.tfidf is defined elsewhere; its result is used like a
# DataFrame below (column assignment, fillna) -- confirm its return type.
ab_vector = preprocess.tfidf(abstract)
# Store the ids in an 'item_id' column; it is the join key of the merge below.
ab_vector['item_id'] = all_patent_id
#print(ab_vector)
# Same steps for the titles: Series built from row_title, indexed by all_patent_id.
title = Series(row_title, index = all_patent_id)
t_vector = preprocess.tfidf(title)
#t_vector.index.name = 'item_id'
t_vector['item_id'] = all_patent_id
#print(t_vector)

# Title feature matrix with missing values replaced by 0.
t_factor = t_vector.fillna(0)
# Abstract feature matrix with missing values replaced by 0.
a_factor = ab_vector.fillna(0)
# Build the full feature matrix with an outer join on 'item_id'; overlapping
# column names receive the '_t' (title) / '_a' (abstract) suffix.
factor = pd.merge(t_factor, a_factor, on = 'item_id', suffixes = ('_t', '_a'), 
                  how = 'outer')
# Input the raw rating data (three columns).
예제 #3
0
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
"""
This is an unsupervised method to check if two sentences are similar.

Compute the TF-IDF vector of each sentence, then calculate the cosine similarity of the two sentences on the same line. If the
similarity is at least the threshold, label the two sentences as similar; otherwise label them as not similar.
"""

# Cosine similarity at or above which a sentence pair is labelled as similar.
threshold_similarity = 0.85

# Load the data set; its 'is_duplicate' column serves as the reference labels.
df = pd.read_csv("proccessed.csv")
labels = df['is_duplicate']

# Calculate the TF-IDF representation of every sentence.
# NOTE(review): tfidf() is neither defined nor imported in the visible part of
# this file -- confirm where it comes from and what it reads.
questions = tfidf()

# Row i is compared with row total_questions + i further down, so the number
# of sentence pairs is half the number of rows.
size = questions.shape[0]
total_questions = int(size / 2)

#Label if two sentences are similar or not
correct_answers = 0
for i in range(total_questions):
    similarity = cosine_similarity(questions[i],
                                   questions[total_questions + i])
    if similarity >= threshold_similarity:
        label = 1
    else:
        label = 0
    #check if the label given from the model is correct
    if label == labels[i]: