def experiment(train_dataset, test_dataset, train_labels, test_labels=None):
    """Train the ensemble classifier on the training texts and predict the test texts.

    The training and test texts are joined, word-segmented, stripped of the
    listed noise characters and TF-IDF vectorised in a single pass; the
    resulting vectors are then split back into their train and test parts.

    Args:
        train_dataset: Training texts. Combined with ``test_dataset`` via
            ``+`` (assumes both are list-like sequences for which ``+``
            concatenates -- TODO confirm against callers).
        test_dataset: Texts to predict.
        train_labels: Labels passed to ``train_EnsClf`` together with the
            training vectors.
        test_labels: Optional labels for ``test_dataset``. When given, the
            value returned by ``score_EnsClf`` is printed as the accuracy.

    Returns:
        Whatever ``predict_EnsClf`` returns for the vectorised test texts.
    """
    # Pre-process train and test in one batch so that both go through the
    # same prep.tfidf call (presumably to share a single vocabulary).
    total_dataset = train_dataset + test_dataset
    seg_dataset = prep.seg_words(total_dataset)
    seg_dataset = prep.eliminate_noise(seg_dataset, ",。、\t “”;")
    vec_dataset = prep.tfidf(seg_dataset)

    # The first len(train_dataset) rows belong to the training texts.
    n_train = len(train_dataset)
    vec_train_dataset = vec_dataset[:n_train]
    vec_test_dataset = vec_dataset[n_train:]

    model = train_EnsClf(vec_train_dataset, train_labels)
    res = predict_EnsClf(vec_test_dataset, model)

    # Identity check: `!= None` is evaluated elementwise on NumPy arrays and
    # pandas Series, which makes the `if` raise "truth value is ambiguous".
    if test_labels is not None:
        print("accuracy: {0}".format(score_EnsClf(vec_test_dataset, test_labels, model)))
    return res
with connection.cursor() as cursor: # Read a single record sql = "select r.`user_id`, r.`patent_id`, r.`ranking` from `patent_info` i join `patent_rank` r on i.`patent_id`=r.`patent_id` where i.`publication` like %s and i.`query` = %s" cursor.execute(sql, (hold_st, 1 )) for result in cursor.fetchall(): rank_patent_id_h.append(result['patent_id']) user_id_h.append(result['user_id']) ranking_h.append(result['ranking']) finally: connection.close() return rank_patent_id_h, user_id_h, ranking_h """ # 准备推荐输入数据 abstract = Series(row_abstract, index = all_patent_id) ab_vector = preprocess.tfidf(abstract) ab_vector['item_id'] = all_patent_id #print(ab_vector) title = Series(row_title, index = all_patent_id) t_vector = preprocess.tfidf(title) #t_vector.index.name = 'item_id' t_vector['item_id'] = all_patent_id #print(t_vector) t_factor = t_vector.fillna(0) #input abstract feature metrix a_factor = ab_vector.fillna(0) #get the all feature metrix by merge factor = pd.merge(t_factor, a_factor, on = 'item_id', suffixes = ('_t', '_a'), how = 'outer') # 输入原始数据评分,三列
from sklearn.metrics.pairwise import cosine_similarity import pandas as pd """ This is an unsupervised method to check whether two sentences are similar. Compute the tf-idf vector of each sentence, then calculate the cosine similarity between the two sentences that are on the same line. If the similarity is at least the threshold, label the two sentences as similar; otherwise label them as not similar. """ threshold_similarity = 0.85 df = pd.read_csv("proccessed.csv") labels = df['is_duplicate'] #calculate tfidf for each sentence questions = tfidf() size = questions.shape[0] total_questions = int(size / 2) #Label if two sentences are similar or not correct_answers = 0 for i in range(total_questions): similarity = cosine_similarity(questions[i], questions[total_questions + i]) if similarity >= threshold_similarity: label = 1 else: label = 0 #check if the label given from the model is correct if label == labels[i]: