import os
from configparser import ConfigParser

import gensim
import networkx as nx

# Project-local helpers (read_file, filter_text, get_edge_freq, cite_edge_freq,
# dict2list, build_graph, read_lda, stem2word, calc_force, calc_dice, rm_tags,
# position_sum, get_term_freq, get_phrases, read_pr, id2word, normalized_token,
# save_feature) are assumed to be imported from elsewhere in this repository.


def citetextrank(name, dataset):
    """CiteTextRank: PageRank over a word co-occurrence graph whose edge
    weights combine in-document, citing-context, and cited-context
    co-occurrence frequencies."""

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    abstract_dir = cfg.get('dataset', 'abstract')
    doc_weight = int(cfg.get('ctr', 'doc_weight'))
    citing_weight = int(cfg.get('ctr', 'citing_weight'))
    cited_weight = int(cfg.get('ctr', 'cited_weight'))
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))

    text = filter_text(read_file(os.path.join(abstract_dir, name)),
                       with_tag=with_tag)
    edge_f = get_edge_freq(text, window=window)
    citing_edge_freq = cite_edge_freq(name, dataset, 'citing')
    cited_edge_freq = cite_edge_freq(name, dataset, 'cited')

    edge_weight = dict()
    for edge in edge_f:
        edge_weight[edge] = doc_weight * edge_f.get(edge, 0) \
                          + citing_weight * citing_edge_freq.get(edge, 0) \
                          + cited_weight * cited_edge_freq.get(edge, 0)
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
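
# All extractors below rely on the project's get_edge_freq helper. For
# reference, a minimal sketch of what such a sliding-window co-occurrence
# counter typically looks like (hypothetical; the real helper may normalize
# tokens or handle window boundaries differently):
def get_edge_freq_sketch(text, window=2):
    """Count co-occurrences of token pairs within a sliding window."""
    from collections import defaultdict
    freq = defaultdict(int)
    tokens = text.split()
    for i, word1 in enumerate(tokens):
        # Pair word1 with the next (window - 1) tokens.
        for word2 in tokens[i + 1:i + window]:
            if word1 != word2:
                freq[tuple(sorted((word1, word2)))] += 1
    return dict(freq)
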
def singletpr(name, dataset):
    """Single TPR: PageRank personalized with per-word LDA topic weights."""

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset+'.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    lda_dir = cfg.get('dataset', 'lda')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    lda = read_lda(os.path.join(lda_dir, name))
    pr = nx.pagerank(graph, alpha=damping, personalization=lda)
    return pr, graph
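
# nx.pagerank's `personalization` argument expects a dict mapping nodes to
# restart weights, so read_lda presumably returns per-word topic weights such
# as {'neural': 0.12, 'network': 0.09}. A sketch of a reader for a
# hypothetical "word weight"-per-line file format:
def read_lda_sketch(path):
    weights = {}
    with open(path, encoding='utf-8') as file:
        for line in file:
            word, weight = line.split()
            weights[word] = float(weight)
    return weights
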
def extract_cossim(dataset):
    """Build co-occurrence edge features for every document in the dataset's
    file list; the loaded word vectors feed a cosine-similarity step (see the
    sketch after this function)."""
    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset+'.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')
    filelist = cfg.get('dataset', 'filelist')

    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')

    names = read_file(filelist).split()
    wvmodel = gensim.models.Word2Vec.load(vec_path)
    for name in names:
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        save_feature(edge_freq)
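
# extract_cossim loads word vectors but the snippet stops at saving raw edge
# frequencies; the similarity step it is named for presumably looks like the
# sketch below (gensim keeps the vectors on the model's .wv attribute, and
# out-of-vocabulary words fall back to a similarity of 0):
def cossim_features_sketch(edge_freq, wvmodel):
    features = {}
    for word1, word2 in edge_freq:
        try:
            features[(word1, word2)] = float(wvmodel.wv.similarity(word1, word2))
        except KeyError:
            features[(word1, word2)] = 0.0
    return features
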
def wordattractionrank(name, dataset):
    """WordAttractionRank: weight co-occurrence edges by attraction force
    (from embedding distance) times the Dice coefficient, then run PageRank."""

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join('./config', dataset + '.ini'))
    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    cfg.read('./config/global.ini')
    vec_path = cfg.get('embedding', 'wiki_vec')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    stemdict = stem2word(text)
    text_candidate = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidate, window=window)
    wvmodel = gensim.models.KeyedVectors.load_word2vec_format(vec_path,
                                                              binary=False)
    words = text_candidate.split()
    edge_weight = {}
    for edge in edge_freq:
        word1, word2 = edge
        try:
            # Embedding-space distance between the words behind the stems;
            # fall back to the maximum distance when a word is missing from
            # the vocabulary.
            distance = 1 - wvmodel.similarity(stemdict[word1], stemdict[word2])
        except KeyError:
            distance = 1
        tf1 = words.count(word1)
        tf2 = words.count(word2)
        cf = edge_freq[edge]
        force = calc_force(tf1, tf2, distance)
        dice = calc_dice(tf1, tf2, cf)
        edge_weight[edge] = force * dice
    edges = dict2list(edge_weight)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
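
# calc_force and calc_dice are project helpers. In the Word Attraction Rank
# formulation (Wang et al., 2015) they are commonly defined as below; a
# sketch, in case the repo's versions differ:
def calc_force_sketch(tf1, tf2, distance):
    # Gravity-style attraction: frequency product over squared semantic
    # distance; callers should keep distance > 0.
    return tf1 * tf2 / (distance ** 2)


def calc_dice_sketch(tf1, tf2, cf):
    # Dice coefficient of the pair's co-occurrence count against the two
    # words' term frequencies.
    return 2 * cf / (tf1 + tf2)
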
def kee(name, dataset):
    """KEE: PageRank baseline over the word co-occurrence graph."""

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset+'.ini'))

    abstract_dir = cfg.get('dataset', 'abstract')
    window = int(cfg.get('graph', 'window'))
    with_tag = cfg.getboolean('dataset', 'with_tag')
    damping = float(cfg.get('graph', 'damping'))

    cfg.read('./config/kee.ini')
    feature_select = cfg.get('kee', 'features')

    text = read_file(os.path.join(abstract_dir, name))
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    tf = get_term_freq(text)
    edges = dict2list(edge_freq)
    graph = build_graph(edges)
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
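
# get_term_freq is another project helper; a plausible minimal version simply
# counts whitespace-separated tokens (the real one may strip POS tags or
# normalize tokens first):
def get_term_freq_sketch(text):
    from collections import Counter
    return dict(Counter(text.split()))
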
def positionrank(name, dataset):
    """PositionRank: PageRank personalized by each word's summed inverse
    occurrence positions."""

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    candidates = filter_text(text, with_tag=with_tag)
    edges = dict2list(get_edge_freq(candidates, window=window))
    graph = build_graph(edges)
    nodes = graph.nodes()
    if with_tag:
        text = rm_tags(text)
    pos_sum = position_sum(text, nodes)
    pr = nx.pagerank(graph, alpha=damping, personalization=pos_sum)
    return pr, graph
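
# In PositionRank (Florescu & Caragea, 2017) a word's restart weight is the
# sum of the inverse positions of its occurrences, normalized over all
# candidates. A sketch of position_sum under that definition (the real helper
# presumably normalizes tokens to match the graph's stemmed nodes):
def position_sum_sketch(text, nodes):
    scores = {node: 0.0 for node in nodes}
    for pos, word in enumerate(text.split(), start=1):
        if word in scores:
            scores[word] += 1.0 / pos
    total = sum(scores.values()) or 1.0
    return {word: score / total for word, score in scores.items()}
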
def textrank(name, dataset):
    """TextRank: PageRank over the word co-occurrence graph; edge weights can
    be disabled via the global config."""

    dataset = dataset.lower()
    cfg = ConfigParser()
    cfg.read(os.path.join("./config", dataset + '.ini'))

    window = int(cfg.get('graph', 'window'))
    damping = float(cfg.get('graph', 'damping'))
    abstract_dir = cfg.get('dataset', 'abstract')
    with_tag = cfg.getboolean('dataset', 'with_tag')

    cfg.read('./config/global.ini')
    use_edge_weight = cfg.getboolean('textrank', 'use_edge_weight')

    doc_path = os.path.join(abstract_dir, name)
    text = read_file(doc_path)
    text_candidates = filter_text(text, with_tag=with_tag)
    edge_freq = get_edge_freq(text_candidates, window=window)
    if not use_edge_weight:
        edge_freq = {e: 1 for e in edge_freq}
    edges = dict2list(edge_freq)
    graph = build_graph(edges)
    # nx.pagerank_numpy was removed in NetworkX 3.0; the standard solver
    # converges to the same ranking.
    pr = nx.pagerank(graph, alpha=damping)
    return pr, graph
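
# Usage sketch for any ranker above (document and dataset names are
# hypothetical; the matching ./config/<dataset>.ini must exist):
#
#   pr, graph = textrank('some_doc', 'kdd')
#   top10 = sorted(pr, key=pr.get, reverse=True)[:10]
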
def evaluate(dataset):
    """
    Evaluate ranking result.

    :param dataset: name of dataset
    :param pr: dict, key is stemmed word, value is score
    """

    method_name = 'pagerank_zf'
    dataset = dataset.upper()
    abstract_dir = os.path.join('./data', dataset, 'abstracts')
    gold_dir = os.path.join('./data', dataset, 'gold')

    extracted = os.path.join('./result', dataset, 'extracted_zf')
    pr_type = 'a1b1'  # alpha=1, beta=1
    pr_dir = os.path.join('./data', dataset, 'rank_zf', pr_type)
    vocabulary_path = os.path.join('./data', dataset, 'rank_zf',
                                   'vocabulary')  # id->word map for the pr files
    damping = 0.85  # other values tried: 0.2, 0.5, 0.8

    with_tag = True
    topn = 4
    window = 2
    ngrams = 2
    weight2 = 0.6
    weight3 = 0.3

    names = [
        name for name in os.listdir(pr_dir)
        if os.path.isfile(os.path.join(pr_dir, name))
    ]
    vocabulary = id2word(vocabulary_path)

    count = 0
    gold_count = 0
    extract_count = 0
    mrr = 0
    prcs_micro = 0
    recall_micro = 0
    for name in names:
        pr = read_pr(os.path.join(pr_dir, name), vocabulary, damping)
        doc_path = os.path.join(abstract_dir, name)
        text = read_file(doc_path)
        text_candidates = filter_text(text, with_tag=with_tag)
        edge_freq = get_edge_freq(text_candidates, window=window)
        edges = dict2list(edge_freq)
        graph = build_graph(edges)
        keyphrases = get_phrases(pr,
                                 graph,
                                 doc_path,
                                 ng=ngrams,
                                 pl2=weight2,
                                 pl3=weight3,
                                 with_tag=with_tag)
        top_phrases = []
        for phrase in keyphrases:
            # Crude dedup: keep a phrase only if it does not already occur as
            # a substring of the phrases collected so far.
            if phrase[0] not in str(top_phrases):
                top_phrases.append(phrase[0])
            if len(top_phrases) == topn:
                break
        if not os.path.exists(extracted):
            os.makedirs(extracted)
        with open(os.path.join(extracted, name), encoding='utf-8',
                  mode='w') as file:
            file.write('\n'.join(top_phrases))

        standard = read_file(os.path.join(gold_dir, name)).split('\n')
        if standard[-1] == '':
            standard = standard[:-1]
        # Stem the gold phrases so they match the stemmed extracted phrases.
        standard = list(' '.join(list(normalized_token(w) for w in g.split()))
                        for g in standard)
        count_micro = 0
        position = []
        for phrase in top_phrases:
            if phrase in standard:
                count += 1
                count_micro += 1
                position.append(top_phrases.index(phrase))
        if position:
            mrr += 1 / (position[0] + 1)
        gold_count += len(standard)
        extract_count += len(top_phrases)
        prcs_micro += count_micro / len(top_phrases)
        recall_micro += count_micro / len(standard)

    prcs = count / extract_count
    recall = count / gold_count
    f1 = 2 * prcs * recall / (prcs + recall)
    mrr /= len(names)
    prcs_micro /= len(names)
    recall_micro /= len(names)
    f1_micro = 2 * prcs_micro * recall_micro / (prcs_micro + recall_micro)
    print(dataset, method_name, count, prcs, recall, f1, mrr)

    eval_result = ','.join([
        method_name + pr_type + str(damping) + '@' + str(topn), dataset,
        str(prcs), str(recall), str(f1), str(mrr), str(prcs_micro),
        str(recall_micro), str(f1_micro)
    ]) + ',\n'
    with open(os.path.join('./result', method_name + '.csv'),
              mode='a',
              encoding='utf8') as file:
        file.write(eval_result)
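
# Typical driver: run evaluate once per dataset; each call appends one CSV row
# to ./result/<method_name>.csv (dataset names here are hypothetical):
#
#   for ds in ('kdd', 'www'):
#       evaluate(ds)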