def add_tfidf():
    """Read the per-file tf-idf scores and attach them as node features."""
    filenames = read_file('../dataset/KDD_filelist').split(',')
    tfidf_path = '../data_preparation/data_temp/KDD/tfidfByfile/'
    save_path = '../dataset/KDD/node_features/'
    for filename in filenames:
        filename = filename.strip()
        filepath = tfidf_path + filename
        lines = read_file(filepath).split('\n')
        word_tfidf = {}
        for line in lines:
            if len(line) != 0:
                # Each non-empty line holds a "word score" pair.
                word, tfidf = line.split(' ')
                word_tfidf[word] = float(tfidf)
        add_feature(save_path + filename, word_tfidf)
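
# The "word score" lines consumed by add_tfidf() come from the data-preparation
# step. The helper below is only a hedged sketch of how such per-document lines
# could be generated with scikit-learn (>= 1.0 for get_feature_names_out); the
# sample documents, the helper name and the exact formatting are assumptions,
# not the project's actual pipeline.
def _demo_tfidf_by_file():
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ['graph based keyphrase extraction',
            'keyphrase extraction with node features']
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(docs)
    vocab = vectorizer.get_feature_names_out()
    for i, _ in enumerate(docs):
        row = matrix[i].toarray()[0]
        # One block of "word score" lines per document, mirroring the format
        # that add_tfidf() parses with line.split(' ').
        lines = ['%s %f' % (vocab[j], row[j]) for j in range(len(vocab)) if row[j] > 0]
        print('\n'.join(lines) + '\n')
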
def add_position():
    """Attach a positional weight to every word of each abstract."""
    filenames = read_file('../dataset/KDD_filelist').split(',')
    abstr_path = '../dataset/KDD/abstracts/'
    save_path = '../dataset/KDD/node_features/'
    for filename in filenames:
        filename = filename.strip()  # keep consistent with add_tfidf()
        words = filter_text(read_file(abstr_path + filename)).split()
        word_position = {}
        for i, word in enumerate(words):
            # Every occurrence at 0-based index i contributes 1 / (i + 1),
            # so words appearing early (and often) get larger weights.
            word_position[word] = word_position.get(word, 0.0) + 1.0 / (i + 1)
        add_feature(save_path + filename, word_position)
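
# A minimal, self-contained sketch of the positional weighting used in
# add_position(): each occurrence of a word at 0-based index i adds 1 / (i + 1)
# to that word's feature value. The token list and the helper name are made up
# for illustration only.
def _demo_position_weighting():
    tokens = ['graph', 'based', 'keyphrase', 'extraction', 'graph']
    word_position = {}
    for i, word in enumerate(tokens):
        word_position[word] = word_position.get(word, 0.0) + 1.0 / (i + 1)
    # 'graph' occurs at indices 0 and 4, so its weight is 1/1 + 1/5 = 1.2.
    print(word_position)
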
def del_feature():
    """Strip the scores from the tfidfByfile files, keeping only the words."""
    filenames = read_file('../dataset/KDD_filelist').split(',')
    tfidf_path = '../data_preparation/data_temp/KDD/tfidfByfile/'
    for filename in filenames:
        filename = filename.strip()
        filepath = tfidf_path + filename
        lines = read_file(filepath).split('\n')
        words = []
        for line in lines:
            if len(line) != 0:
                # Each non-empty line holds a "word,score" pair; keep the word.
                word, _ = line.split(',')
                words.append(word)
        with open(filepath, 'w', encoding='utf-8') as f:
            for word in words:
                f.write(word + '\n')
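
# A tiny sketch of the rewrite del_feature() performs, run on in-memory lines
# instead of the KDD tfidfByfile files (the sample lines are made up; the real
# files are assumed to hold "word,score" pairs, one per line).
def _demo_strip_scores():
    raw_lines = ['learning,0.42', 'graph,0.17', '']
    kept = [line.split(',')[0] for line in raw_lines if line]
    print(kept)  # ['learning', 'graph'] -- only the words are written back
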
def get_phrases(pr, graph, abstr_path, file_name, ng=2):
    """Return a list like [('large numbers', 0.0442255866192), ('Internet criminal', 0.0440296017801)]."""
    text = rm_tags(read_file(abstr_path + file_name))
    tokens = word_tokenize(text.lower())
    edges = graph.edge
    phrases = set()
    for n in range(2, ng + 1):
        for ngram in ngrams(tokens, n):
            # For each n-gram, if all tokens are words, and if the normalized
            # head and tail are found in the graph -- i.e. if both are nodes
            # connected by an edge -- this n-gram is a candidate key phrase.
            if all(is_word(token) for token in ngram):
                head, tail = normalized_token(ngram[0]), normalized_token(ngram[-1])
                if head in edges and tail in edges[head] and pos_tag([ngram[-1]])[0][1] != 'JJ':
                    phrase = ' '.join(normalized_token(word) for word in ngram)
                    phrases.add(phrase)
    if ng == 2:
        # Chain overlapping bigrams ("a b" + "b c") into trigram candidates.
        phrase2to3 = set()
        for p1 in phrases:
            for p2 in phrases:
                if p1.split()[-1] == p2.split()[0] and p1 != p2:
                    phrase = ' '.join([p1.split()[0]] + p2.split())
                    phrase2to3.add(phrase)
        phrases |= phrase2to3
    phrase_score = {}
    for phrase in phrases:
        score = sum(pr.get(word, 0) for word in phrase.split())
        phrase_len = len(phrase.split())
        # Scale the phrase score according to the phrase length.
        if phrase_len == 1:
            phrase_score[phrase] = score
        elif phrase_len == 2:
            phrase_score[phrase] = score * 0.6
        else:
            phrase_score[phrase] = score / 3
        # phrase_score[phrase] = score / len(phrase.split())
    sorted_phrases = sorted(phrase_score.items(), key=lambda d: d[1], reverse=True)
    sorted_word = sorted(pr.items(), key=lambda d: d[1], reverse=True)
    out_sorted = sorted(sorted_phrases + sorted_word, key=lambda d: d[1], reverse=True)
    return out_sorted
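
# A self-contained toy illustration of the candidate test used in get_phrases():
# a bigram qualifies when its head and tail are connected by an edge in the word
# graph and the tail is not tagged as an adjective. The sentence and adjacency
# dict are made up, normalization is skipped, and the plain dict only stands in
# for the NetworkX 1.x graph.edge mapping used above.
def _demo_bigram_candidates():
    from nltk import pos_tag, word_tokenize
    from nltk.util import ngrams

    tokens = word_tokenize('semantic graph ranking for keyphrase extraction'.lower())
    edges = {'semantic': {'graph': {}}, 'graph': {'semantic': {}, 'ranking': {}}}
    candidates = set()
    for head, tail in ngrams(tokens, 2):
        if head in edges and tail in edges[head] and pos_tag([tail])[0][1] != 'JJ':
            candidates.add(head + ' ' + tail)
    print(candidates)  # {'semantic graph', 'graph ranking'}
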
def evaluate_extraction(dataset, method_name, ngrams=2, damping=0.85, omega=None, phi=None,
                        alter_topn=None, alter_edge=None, alter_node=None):
    """Evaluate the extraction results.

    For omega and phi: [0] means no feature is used and the weight is set to 1,
    None means a simple sum of all features, and [-1] uses only the last feature.
    """
    if dataset == 'KDD':
        abstr_dir = '../dataset/KDD/abstracts/'
        out_dir = '../result/'
        gold_dir = '../dataset/KDD/gold/'
        edge_dir = '../dataset/KDD/edge_features/'
        node_dir = '../dataset/KDD/node_features/'
        file_names = read_file('../dataset/KDD_filelist').split(',')
        topn = 4
    elif dataset == 'WWW':
        abstr_dir = '../dataset/WWW/abstracts/'
        out_dir = '../result/'
        gold_dir = '../dataset/WWW/gold/'
        edge_dir = '../dataset/WWW/edge_features/'
        node_dir = '../dataset/WWW/node_features/'
        file_names = read_file('../dataset/WWW_filelist').split(',')
        topn = 5
    else:
        raise ValueError('wrong dataset name: ' + dataset)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if alter_edge:
        edge_dir = alter_edge
    if alter_node:
        node_dir = alter_node
    if alter_topn:
        topn = alter_topn

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for file_name in file_names:
        pr, graph = wpr(edge_dir + file_name, node_dir + file_name,
                        omega=omega, phi=phi, d=damping)
        gold = read_file(gold_dir + file_name)
        keyphrases = get_phrases(pr, graph, abstr_dir, file_name, ng=ngrams)
        top_phrases = []
        for phrase in keyphrases:
            # Skip candidates already covered by a phrase that was kept.
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        golds = gold.split('\n')
        if golds[-1] == '':
            golds = golds[:-1]
        golds = list(' '.join(normalized_token(w) for w in g.split()) for g in golds)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in golds:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(golds)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(golds)

    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(file_names)
    prcs_micro /= len(file_names)
    recall_micro /= len(file_names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(prcs, recall, f1, mrr)
    tofile_result = method_name + ',' + str(prcs) + ',' + str(recall) + ',' + str(f1) + ',' \
        + str(mrr) + ',' + str(prcs_micro) + ',' + str(recall_micro) + ',' + str(f1_micro) + ',\n'
    with open(out_dir + dataset + '_RESULTS.csv', mode='a', encoding='utf8') as f:
        f.write(tofile_result)
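
# A minimal sketch of the metric bookkeeping in evaluate_extraction(), run on
# two hand-made (extracted, gold) phrase lists instead of real KDD/WWW output;
# the data and the helper name are illustrative only.
def _demo_metrics():
    docs = [
        (['data mining', 'graph model', 'ranking'], ['data mining', 'text ranking']),
        (['neural network', 'word embedding'], ['word embedding']),
    ]
    count = gold_count = extract_count = mrr = 0.0
    for top_phrases, golds in docs:
        hits = [i for i, p in enumerate(top_phrases) if p in golds]
        count += len(hits)
        if hits:
            mrr += 1 / (hits[0] + 1)  # reciprocal rank of the first correct phrase
        gold_count += len(golds)
        extract_count += len(top_phrases)
    prcs = count / extract_count      # pooled over all documents, as in prcs above
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    print(prcs, recall, f1, mrr / len(docs))  # 0.4 0.666... 0.5 0.75
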