Example #1
def add_tfidf():
    # Read each document's per-file TF-IDF scores and attach them to its
    # node-feature file via add_feature.
    filenames = read_file('../dataset/KDD_filelist').split(',')
    tfidf_path = '../data_preparation/data_temp/KDD/tfidfByfile/'
    save_path = '../dataset/KDD/node_features/'
    for filename in filenames:
        filename = filename.strip()
        filepath = tfidf_path + filename
        lines = read_file(filepath).split('\n')
        word_tfidf = {}
        for line in lines:
            if len(line) != 0:
                # Each non-empty line is "<word> <tfidf>".
                word, tfidf = line.split(' ')
                word_tfidf[word] = float(tfidf)
        add_feature(save_path + filename, word_tfidf)
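add_tfidf and the following examples call helper functions read_file and add_feature that are defined elsewhere in the repository and not shown here. A minimal sketch of their assumed behavior (node-feature files holding one "word feature1 feature2 ..." record per line) might look like the following; the file format is an assumption, not the original code:

def read_file(path):
    # Assumed helper: read a whole text file into one string.
    with open(path, encoding='utf-8') as f:
        return f.read()


def add_feature(path, word_feature):
    # Assumed helper: append one feature column to every word's line in a
    # node-feature file; words missing from the dict get 0 for the new column.
    lines = read_file(path).split('\n')
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            if len(line) != 0:
                word = line.split()[0]
                f.write(line + ' ' + str(word_feature.get(word, 0)) + '\n')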
Example #2
def add_position():
    # Compute a positional feature: each word accumulates 1 / (i + 1) over its
    # occurrences, so earlier occurrences contribute more weight, and attach it
    # to the node-feature file via add_feature.
    filenames = read_file('../dataset/KDD_filelist').split(',')
    filepath = '../dataset/KDD/abstracts/'
    save_path = '../dataset/KDD/node_features/'
    for filename in filenames:
        filtered_text = filter_text(read_file(filepath + filename))
        filtered_text = filtered_text.split()
        word_position = {}
        for i, word in enumerate(filtered_text):
            word_position[word] = word_position.get(word, 0.0) + 1.0 / (i + 1)
        add_feature(save_path + filename, word_position)
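As a quick sanity check of the positional weighting (toy tokens, not from the dataset): a word seen at 0-based positions 0 and 2 accumulates 1/1 + 1/3.

# Toy illustration of the 1/(i+1) accumulation used in add_position.
tokens = ['graph', 'ranking', 'graph', 'model']
word_position = {}
for i, word in enumerate(tokens):
    word_position[word] = word_position.get(word, 0.0) + 1.0 / (i + 1)
# word_position == {'graph': 1.333..., 'ranking': 0.5, 'model': 0.25}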
Example #3
def del_feature():
    # Strip the score column from each per-file TF-IDF file, keeping only the
    # words (one per line) and overwriting the file in place.
    filenames = read_file('../dataset/KDD_filelist').split(',')
    tfidf_path = '../data_preparation/data_temp/KDD/tfidfByfile/'
    for filename in filenames:
        filename = filename.strip()
        filepath = tfidf_path + filename
        lines = read_file(filepath).split('\n')
        words = []
        for line in lines:
            if len(line) != 0:
                # Each non-empty line is "<word>,<score>"; keep only the word.
                word, _ = line.split(',')
                words.append(word)
        with open(filepath, 'w', encoding='utf-8') as f:
            for word in words:
                f.write(word + '\n')
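In other words, assuming a tfidfByfile entry stores comma-separated "word,score" lines, del_feature rewrites the file so that only the words remain:

# Assumed before/after of one tfidfByfile entry:
#   before:  network,0.31        after:  network
#            ranking,0.27                ranking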
Example #4
from nltk import pos_tag, word_tokenize
from nltk.util import ngrams


def get_phrases(pr, graph, abstr_path, file_name, ng=2):
    """Return a list of (phrase, score) tuples, e.g.
    [('large numbers', 0.0442255866192), ('Internet criminal', 0.0440296017801)].
    """
    text = rm_tags(read_file(abstr_path + file_name))
    tokens = word_tokenize(text.lower())
    edges = graph.edge  # adjacency dict of dicts (networkx 1.x API)
    phrases = set()
    phrases = set()

    for n in range(2, ng + 1):
        for ngram in ngrams(tokens, n):

            # For each n-gram, if all tokens are words, and if the normalized
            # head and tail are found in the graph -- i.e. if both are nodes
            # connected by an edge -- this n-gram is a key phrase.
            if all(is_word(token) for token in ngram):
                head, tail = normalized_token(ngram[0]), normalized_token(
                    ngram[-1])

                if head in edges and tail in edges[head] and pos_tag(
                    [ngram[-1]])[0][1] != 'JJ':
                    phrase = ' '.join(
                        list(normalized_token(word) for word in ngram))
                    phrases.add(phrase)

    if ng == 2:
        phrase2to3 = set()
        for p1 in phrases:
            for p2 in phrases:
                if p1.split()[-1] == p2.split()[0] and p1 != p2:
                    phrase = ' '.join([p1.split()[0]] + p2.split())
                    phrase2to3.add(phrase)
        phrases |= phrase2to3

    phrase_score = {}
    for phrase in phrases:
        score = 0
        for word in phrase.split():
            score += pr.get(word, 0)
        plength = len(phrase.split())
        if plength == 1:
            phrase_score[phrase] = score
        elif plength == 2:
            phrase_score[phrase] = score * 0.6  # scale the phrase score by its length
        else:
            phrase_score[phrase] = score / 3  # scale the phrase score by its length
        # phrase_score[phrase] = score/len(phrase.split())
    sorted_phrases = sorted(phrase_score.items(),
                            key=lambda d: d[1],
                            reverse=True)
    # print(sorted_phrases)
    sorted_word = sorted(pr.items(), key=lambda d: d[1], reverse=True)
    # print(sorted_word)
    out_sorted = sorted(sorted_phrases + sorted_word,
                        key=lambda d: d[1],
                        reverse=True)
    return out_sorted
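get_phrases also depends on rm_tags, is_word and normalized_token from elsewhere in the repository. A rough sketch of their assumed behavior (strip POS tags, keep alphabetic tokens, lowercase and stem) could be:

from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')


def rm_tags(text):
    # Assumed: drop trailing POS tags, e.g. 'graph_NN' -> 'graph'.
    return ' '.join(token.split('_')[0] for token in text.split())


def is_word(token):
    # Assumed: keep purely alphabetic tokens longer than one character.
    return token.isalpha() and len(token) > 1


def normalized_token(word):
    # Assumed: lowercase and stem so n-gram tokens match graph node names.
    return stemmer.stem(word.lower())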
Example #5
import os


def evaluate_extraction(dataset,
                        method_name,
                        ngrams=2,
                        damping=0.85,
                        omega=None,
                        phi=None,
                        alter_topn=None,
                        alter_edge=None,
                        alter_node=None):
    """
    Evaluate the extraction results.

    omega, phi: [0] means no feature is used and the weights are set to 1;
    None means a simple sum of all features; [-1] uses only the last feature.
    """
    if dataset == 'KDD':
        abstr_dir = '../dataset/KDD/abstracts/'
        out_dir = '../result/'
        gold_dir = '../dataset/KDD/gold/'
        edge_dir = '../dataset/KDD/edge_features/'
        node_dir = '../dataset/KDD/node_features/'
        file_names = read_file('../dataset/KDD_filelist').split(',')
        topn = 4
    elif dataset == 'WWW':
        abstr_dir = '../dataset/WWW/abstracts/'
        out_dir = '../result/'
        gold_dir = '../dataset/WWW/gold/'
        edge_dir = '../dataset/WWW/edge_features/'
        node_dir = '../dataset/WWW/node_features/'
        file_names = read_file('../dataset/WWW_filelist').split(',')
        topn = 5
    else:
        raise ValueError('wrong dataset name: ' + dataset)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if alter_edge:
        edge_dir = alter_edge
    if alter_node:
        node_dir = alter_node
    if alter_topn:
        topn = alter_topn

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for file_name in file_names:
        # print(file_name)
        pr, graph = wpr(edge_dir + file_name,
                        node_dir + file_name,
                        omega=omega,
                        phi=phi,
                        d=damping)

        gold = read_file(gold_dir + file_name)
        keyphrases = get_phrases(pr, graph, abstr_dir, file_name, ng=ngrams)
        top_phrases = []
        for phrase in keyphrases:
            # Skip candidates already contained in a selected phrase.
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        golds = gold.split('\n')
        if golds[-1] == '':
            golds = golds[:-1]
        golds = list(' '.join(list(normalized_token(w) for w in g.split()))
                     for g in golds)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in golds:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position != []:
            mrr += 1 / (position[0] + 1)
        gold_count += len(golds)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(golds)

    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(file_names)
    prcs_micro /= len(file_names)
    recall_micro /= len(file_names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(prcs, recall, f1, mrr)

    tofile_result = ','.join([method_name,
                              str(prcs), str(recall), str(f1), str(mrr),
                              str(prcs_micro), str(recall_micro),
                              str(f1_micro)]) + ',\n'
    with open(out_dir + dataset + '_RESULTS.csv', mode='a',
              encoding='utf8') as f:
        f.write(tofile_result)
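A typical invocation (hypothetical method label; feature weights left at their defaults) evaluates top-4 extraction on KDD and appends one row to ../result/KDD_RESULTS.csv:

# Hypothetical call; 'tfidf+position' is only an illustrative method name.
evaluate_extraction('KDD', 'tfidf+position', ngrams=2, damping=0.85,
                    omega=None, phi=None)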