Exemplo n.º 1
0
def depart_type(sql, type_list):
    """Print, for each group of article ids, how many articles have
    institution info and the frequency of each institution.

    sql       -- query returning (article_id, institution) rows for articles
                 that already have institution info
    type_list -- list of groups; each group is an iterable of article ids
    """
    type_depart = {}  # article id -> institution string
    for line in getdata(sql):
        type_depart[line[0]] = line[1].strip()

    for a, group in enumerate(type_list):
        # Count how often each institution appears within this group.
        a_types = {}
        ake = 0  # articles in this group that have institution info
        for b in group:
            if b in type_depart:
                ake += 1
                dep = type_depart[b]
                a_types[dep] = a_types.get(dep, 0) + 1
        print("第" + str(a + 1) + "组:")
        print(str(len(group)) + "篇文章共中有" + str(ake) + "篇有机构信息")
        if len(a_types) > 0:
            # Descending by count; reversed(sorted(...)) kept so tie order
            # matches the original implementation exactly.
            for line in reversed(sorted(a_types.items(), key=lambda kv: kv[1])):
                print(line)
        print()
Exemplo n.º 2
0
def get_test_corpus(name, dict1):
    """Build and serialize the test corpus (articles lacking institution info).

    name  -- table name inside the `topics` schema
    dict1 -- gensim Dictionary mapping tokens to ids

    Writes the bag-of-words corpus, rescaled to per-document relative
    frequencies, to ./data/test_corpus.mm.
    """
    sql = "select title,keywords,abstract from topics.{0} where author_unique is NULL;".format(
        name)
    test_docs = []
    for DOC in getdata(sql):
        temp = "{};{};{}".format(DOC[0], DOC[1], DOC[2])
        # Round-trip each document through LTP word segmentation via temp
        # files (absolute paths are machine-specific).
        with open("./data/temp1.txt", "w", encoding='utf-8') as f:
            f.write(temp.rstrip("\n"))
        ltp(
            r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp1.txt",
            r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp2.txt"
        )
        test_docs.append(' '.join(parse_xml("./data/temp2.txt")))

    texts = [t.split() for t in test_docs]  # split() already yields the token list
    corpus_temp = [dict1.doc2bow(text) for text in texts]
    # Rescale each term count by the length of the document's bow.
    # NOTE(review): the denominator is the number of *distinct* terms, not
    # the total word count -- confirm this is the intended relative frequency.
    for doc in corpus_temp:
        for a, (term_id, freq) in enumerate(doc):
            doc[a] = (term_id, freq / len(doc))
    corpora.MmCorpus.serialize('./data/test_corpus.mm', corpus_temp)
Exemplo n.º 3
0
def depart_type(sql, type_list):
    """For every group in type_list, report how many of its articles carry
    institution info and print each institution with its occurrence count."""
    id_to_dept = {}  # article id -> institution string
    for row in getdata(sql):
        id_to_dept[row[0]] = row[1].strip()

    for idx in range(len(type_list)):
        group = type_list[idx]
        counts = {}  # institution -> occurrences inside this group
        matched = 0
        for art_id in group:
            if art_id not in id_to_dept:
                continue
            matched += 1
            dept = id_to_dept[art_id]
            if dept not in counts:
                counts[dept] = 1
            else:
                counts[dept] = counts[dept] + 1
        print("第" + str(idx + 1) + "组:")
        print(str(len(group)) + "篇文章共中有" + str(matched) + "篇有机构信息")
        if len(counts) > 0:
            ordered = reversed(sorted(counts.items(), key=lambda item: item[1]))
            for entry in ordered:
                print(entry)
        print()
Exemplo n.º 4
0
def fill_new_table(sql, name, ins1, ins2):
    """Copy rows whose author list contains `name` into a new per-author table.

    sql  -- select over the source table; row indexes 1..7 hold title,
            authors, mname, index_terms, keywords, abstract, pdate
    name -- target author name to match
    ins1 -- insert template for author entries that carry institution info
            (the part after '^c')
    ins2 -- insert template for author entries without institution info
    """
    row_id = 0  # running primary key for the new table (avoids shadowing builtin `id`)
    for row in getdata(sql):
        title = row[1].strip().strip('\n').strip('\r')
        authors = row[2]
        mname = row[3]
        index_terms = row[4]
        # keywords may be NULL in the database; keep it as-is in that case.
        keywords = row[5].strip().strip('\n').strip('\r') if row[5] is not None else row[5]
        abstract = row[6].strip().strip('\n').strip('\r')
        pdate = row[7]
        try:
            # Find the author entry whose name equals `name`.
            for author in authors.split(";"):
                new_author = re.split(r'\^c', author.strip())
                if new_author[0] != name:
                    continue
                row_id += 1
                # SECURITY(review): values are interpolated straight into the
                # SQL text via str.format -- injection-prone; prefer
                # parameterized queries.
                if len(new_author) > 1:  # author entry carries institution info
                    stmt = ins1.format(row_id, title, new_author[1], authors,
                                       mname, index_terms, keywords, abstract,
                                       pdate, name)
                else:
                    stmt = ins2.format(row_id, title, authors, mname,
                                       index_terms, keywords, abstract,
                                       pdate, name)
                handledata(stmt)
        except Exception as e:
            # Best-effort: report and continue with the next row.
            print(e)
    print("填充新表" + "\"" + name + "\"" + "完成!")
Exemplo n.º 5
0
def update_departs(sql, lis, name):
    """Reset author_unique to NULL for rows whose department is not in `lis`."""
    for record in getdata(sql):
        record_id = record[0]
        department = record[1]
        if department in lis:
            continue
        stmt = "update %s set author_unique=NULL where id =%d" % (name, record_id)
        handledata(stmt)
Exemplo n.º 6
0
def update_departs(sql, lis, name):
    """Reset author_unique to NULL for every row whose current department
    value is not in the allowed list.

    sql  -- query returning (id, department) rows
    lis  -- departments that should be kept
    name -- table name interpolated into the UPDATE statement
    """
    data = getdata(sql)
    for a in data:
        dd = a[0]  # row id
        depart = a[1]  # current department value
        if depart not in lis:
            # NOTE(review): SQL built by string interpolation -- injection-prone.
            temp = "update %s set author_unique=NULL where id =%d" % (name, dd)
            handledata(temp)
Exemplo n.º 7
0
def get_paper(name):
    """Append one "title;keywords;abstract" line per article in the table
    to data/temp1.txt (training data for later segmentation)."""
    sql = 'select title,keywords,abstract from topics.{0};'.format(name)
    data = getdata(sql)
    if not data:
        # Preserve original behavior: no rows -> the file is never touched.
        return
    # Open the output once instead of re-opening it in append mode for
    # every single row (original behavior, same resulting file contents).
    with open('data/temp1.txt', 'a', encoding='utf-8') as f:
        for DOC in data:
            temp = '{};{};{}'.format(DOC[0], DOC[1], DOC[2])
            f.write('%s\n' % temp.strip())
Exemplo n.º 8
0
def get_author_paper(sql, num_coauthor):
    """Cluster articles by shared coauthor names and print clustering stats.

    sql          -- query returning (id, authors) rows; ids are 1-based and
                    assumed contiguous (author_list is indexed by id - 1)
    num_coauthor -- two articles are linked when the number of shared author
                    names is strictly greater than this threshold
    """
    # Build the per-article author-name lists.
    author_list = []
    data = getdata(sql)
    for line in data:
        author_list.append([])
        id = int(line[0]) - 1
        temp = line[1].split(';')
        for authors in temp:
            author = authors.split('^c')
            # Keep only the name; '^c' separates name from institution.
            author_list[id].append(author[0].strip())
    # print(author_list)

    # Link articles that share coauthors.
    # NOTE(review): range(len - 1) means the LAST article never gets its own
    # entry in related_list -- confirm this is intentional.
    related_list = []
    for a in range(len(author_list) - 1):
        related_list.append([])  # articles judged same-author via shared coauthors
        related_list[a].append(a + 1)  # the article's own 1-based id
        for b in range(a + 1, len(author_list)):  # compare with every later article
            same = 0
            for item in author_list[a]:
                if item in author_list[b]:
                    same += 1
            if same > num_coauthor:  # tunable; minimum useful value is 1 (self + one coauthor)
                related_list[a].append(b + 1)  # record the linked article id
    # print(related_list)

    # Merge overlapping clusters: the earlier cluster is folded into the
    # later one and emptied (one merge per `a`, hence the break).
    for a in range(len(author_list) - 1):
        for b in range(a + 1, len(author_list) - 1):
            tempa = related_list[a]
            tempb = related_list[b]
            if set(tempa) & set(tempb):  # clusters share at least one article
                related_list[b] = related_list[a] + related_list[b]
                related_list[a] = []
                break

    # Report the results.
    new_list = []
    id2 = 0  # clusters containing two or more articles
    id1 = 0  # singleton articles (no coauthor co-occurrence)
    num = 0  # total articles covered by multi-article clusters
    for item in related_list:
        if len(item) > 1:
            id2 += 1
            num += len(set(item))
            print(sorted(list(set(item))))
            # new_list.append(sorted(list(set(item))))
        if len(item) == 1:
            id1 += 1
            # print(list(set(item)))
    print("拥有合著者共现的类别:" + str(id2) + "个;  包括文章:" + str(num) + "篇")
    print("没有合著者共现的文章:" + str(id1))
    print("总文章数:" + str(len(author_list)))
Exemplo n.º 9
0
def get_author_paper(sql, num_coauthor):
    """Cluster articles by shared coauthor names and print clustering stats.

    sql          -- query returning (id, authors) rows; ids are 1-based and
                    assumed contiguous (author_list is indexed by id - 1)
    num_coauthor -- two articles are linked when the number of shared author
                    names is strictly greater than this threshold
    """
    # Build the per-article author-name lists.
    author_list = []
    data = getdata(sql)
    for line in data:
        author_list.append([])
        id = int(line[0]) - 1
        temp = line[1].split(';')
        for authors in temp:
            author = authors.split('^c')
            # Keep only the name; '^c' separates name from institution.
            author_list[id].append(author[0].strip())
    # print(author_list)

    # Link articles that share coauthors.
    # NOTE(review): range(len - 1) means the LAST article never gets its own
    # entry in related_list -- confirm this is intentional.
    related_list = []
    for a in range(len(author_list) - 1):
        related_list.append([])  # articles judged same-author via shared coauthors
        related_list[a].append(a + 1)  # the article's own 1-based id
        for b in range(a + 1, len(author_list)):  # compare with every later article
            same = 0
            for item in author_list[a]:
                if item in author_list[b]:
                    same += 1
            if same > num_coauthor:  # tunable; minimum useful value is 1 (self + one coauthor)
                related_list[a].append(b + 1)  # record the linked article id
    # print(related_list)

    # Merge overlapping clusters: the earlier cluster is folded into the
    # later one and emptied (one merge per `a`, hence the break).
    for a in range(len(author_list) - 1):
        for b in range(a + 1, len(author_list) - 1):
            tempa = related_list[a]
            tempb = related_list[b]
            if set(tempa) & set(tempb):  # clusters share at least one article
                related_list[b] = related_list[a] + related_list[b]
                related_list[a] = []
                break

    # Report the results.
    new_list = []
    id2 = 0  # clusters containing two or more articles
    id1 = 0  # singleton articles (no coauthor co-occurrence)
    num = 0  # total articles covered by multi-article clusters
    for item in related_list:
        if len(item) > 1:
            id2 += 1
            num += len(set(item))
            print(sorted(list(set(item))))
            # new_list.append(sorted(list(set(item))))
        if len(item) == 1:
            id1 += 1
            # print(list(set(item)))
    print("拥有合著者共现的类别:" + str(id2) + "个;  包括文章:" + str(num) + "篇")
    print("没有合著者共现的文章:" + str(id1))
    print("总文章数:" + str(len(author_list)))
Exemplo n.º 10
0
def get_keywords(name):
    """Collect unique keywords (length > 1) from the table and append them
    to data/user_dict.txt, one per line, preserving first-seen order."""
    sql = 'select keywords from topics.{0};'.format(name)
    # Split on both CJK and ASCII delimiters; compiled once, outside the loop.
    splitter = re.compile(r',|,|;|;|:|:|“|”|"')
    keywords = []
    seen = set()  # O(1) membership test (was an O(n) scan of the list per key)
    for doc in getdata(sql):
        keyword = doc[0]
        if keyword != 'None' and keyword != '':
            for key in splitter.split(keyword):
                if len(key) > 1 and key not in seen:
                    seen.add(key)
                    keywords.append(key)
    with open('data/user_dict.txt', 'a', encoding='utf-8') as f:
        f.write('\n'.join(keywords))
Exemplo n.º 11
0
def get_noun(sql, name, dep_lis):
    """Return one space-joined, LTP-segmented document per department.

    sql     -- query template taking (table name, department)
    name    -- table name in the `topics` schema
    dep_lis -- department strings; one output document is produced for each
    """
    all_docs = []
    for dep in dep_lis:
        rows = getdata(sql.format(name, dep))
        # Build the document with join instead of quadratic `+=` concatenation.
        lines = ["{};{};{}".format(DOC[0], DOC[1], DOC[2]) for DOC in rows]
        with open("./data/temp1.txt", "w", encoding='utf-8') as f:
            f.write("\n".join(lines).rstrip("\n"))
        # LTP segments temp1.txt into XML at temp2.txt (paths are machine-specific).
        ltp(r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp1.txt",
            r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp2.txt")
        documents = parse_xml("./data/temp2.txt")
        all_docs.append(' '.join(documents))
    return all_docs
def coauthor_dict_match(sql, dictionary, name):
    """Assign institution info to papers by matching their coauthors against
    the coauthor lists of already-disambiguated authors.

    sql        -- query returning (authors, id) rows for papers without info
    dictionary -- institution -> list of known coauthor names
    name       -- the ambiguous author's name (also the table for UPDATEs)
    """
    data = getdata(sql)  # papers that currently lack institution info
    only_one_type = 0  # papers matched to exactly one institution

    for line in data:
        departs = {}  # institution -> number of matched coauthors
        paper_id = line[1]
        for coauthor in line[0].split(";"):
            author = coauthor.split("^c")
            if author[0] != name:  # skip the ambiguous author himself
                for depart, known_coauthors in dictionary.items():
                    if author[0] in known_coauthors:
                        departs[depart] = departs.get(depart, 0) + 1

        # Accept only unambiguous matches backed by at least two coauthors.
        if len(departs) == 1:
            for key in departs:
                if departs[key] > 1:    # at least two coauthors
                    print("%d:%s\n" % (paper_id, key))
                    with open('./data/depart_coauthor_match.txt', 'a', encoding='utf-8') as f:
                        f.write("%d:%s\n" % (paper_id, key))
                    only_one_type += 1
                    # SECURITY(review): SQL built via str.format -- injection-prone.
                    aa = "update {0} set author_unique = '{1}' where id ={2};".format(name, key, int(paper_id))
                    handledata(aa)

    print("匹配到一个机构信息且至少两个合著者的文章个数:" + str(only_one_type))
def get_coauthor_dict(sql, name):
    """Build {author_unique: [coauthor names]} from (author_unique, authors)
    rows, excluding `name` itself.

    Fix: the original only de-duplicated coauthors when merging into an
    existing entry; a brand-new entry could contain duplicates coming from a
    single author string. Both paths now de-duplicate consistently.
    """
    dicts = {}  # author_unique identifier -> list of coauthor names
    for line in getdata(sql):
        coauthor_list = dicts.setdefault(line[0], [])
        for coauthor in line[1].split(";"):
            author = coauthor.split("^c")
            if author[0] != name and author[0] not in coauthor_list:
                coauthor_list.append(author[0])
    return dicts
Exemplo n.º 14
0
def get_noun(sql, name, dep_lis):
    """Return one space-joined, LTP-segmented document per department.

    sql     -- query template taking (table name, department)
    name    -- table name in the `topics` schema
    dep_lis -- department strings; one output document is produced for each
    """
    all_docs = []
    for dep in dep_lis:
        data = getdata(sql.format(name, dep))
        ss = ""
        for DOC in data:
            title = DOC[0]
            keywords = DOC[1]
            abstract = DOC[2]
            temp = "{};{};{}".format(title, keywords, abstract)
            ss += temp + "\n"
        with open("./data/temp1.txt", "w", encoding='utf-8') as f:
            f.write(ss.rstrip("\n"))
        # LTP segments temp1.txt into XML at temp2.txt (paths are machine-specific).
        ltp(
            r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp1.txt",
            r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp2.txt"
        )
        documents = parse_xml("./data/temp2.txt")
        all_docs.append(' '.join(documents))
    return all_docs
Exemplo n.º 15
0
def get_depart_list(sql, name):
    """Return the distinct, most-detailed department strings from the query.

    When one department string is a prefix of another (e.g. "成都理工大学" vs
    "成都理工大学信息管理学院"), only the longer, more detailed one is kept.
    If any entry was dropped, rows still pointing at a dropped department
    have their author_unique reset via update_departs.
    """
    list1 = []
    for depart in getdata(sql):
        if depart[0] not in list1:
            list1.append(depart[0])
    # Pairwise prefix comparison; dropped entries are marked None in place.
    for a in range(len(list1) - 1):
        if list1[a] is None:
            continue
        for b in range(a + 1, len(list1)):
            if list1[b] is None:
                continue
            if list1[a].startswith(list1[b]):
                list1[b] = None  # b is a prefix of a -> b is less detailed
            elif list1[b].startswith(list1[a]):
                list1[a] = None  # a is a prefix of b -> a is less detailed
                break
    list2 = [a for a in list1 if a is not None]
    if len(list2) != len(list1):
        temp = "select id,author_unique from topics.{0} where author_unique is not NUll;".format(name)
        update_departs(temp, list2, name)
    return list2
Exemplo n.º 16
0
def get_depart_list(sql, name):
    """Return the distinct, most-detailed department strings from the query.

    When one department string is a prefix of another, only the longer one
    is kept; if anything was dropped, update_departs resets author_unique
    on rows that referenced a dropped department.
    """
    departs = getdata(sql)
    list1 = []
    for depart in departs:
        if depart[0] not in list1:
            list1.append(depart[0])
    # Keep the most detailed institution string: e.g. of "成都理工大学信息管理学院"
    # and its prefix "成都理工大学", keep the former. Dropped entries become None.
    for a in range(len(list1) - 1):
        if list1[a] is not None:
            for b in range(a + 1, len(list1)):
                if list1[b] is not None:
                    if list1[a].startswith(list1[b]):
                        list1[b] = None
                        continue
                    elif list1[b].startswith(list1[a]):
                        list1[a] = None
                        break
    list2 = [a for a in list1 if a is not None]
    if len(list2) != len(list1):
        temp = "select id,author_unique from topics.{0} where author_unique is not NUll;".format(
            name)
        update_departs(temp, list2, name)
    return list2
Exemplo n.º 17
0
def fill_new_table(sql, name, ins1, ins2):
    """Copy rows whose author list contains `name` into a new per-author table.

    sql  -- select over the source table; row indexes 1..7 hold title,
            authors, mname, index_terms, keywords, abstract, pdate
    name -- target author name to match
    ins1 -- insert template for author entries carrying institution info
    ins2 -- insert template for author entries without institution info
    """
    data = getdata(sql)
    id = 0  # running primary key for the new table (shadows builtin `id`)
    for row in data:
        title = row[1].strip().strip('\n').strip('\r')
        authors = row[2]
        mname = row[3]
        index_terms = row[4]
        if row[5] is not None:
            keywords = row[5].strip().strip('\n').strip('\r')
        else:
            keywords = row[5]  # keep NULL keywords as-is
        abstract = row[6].strip().strip('\n').strip('\r')
        pdate = row[7]
        try:
            # Find the author entry whose name equals `name`.
            authorlist = authors.split(";")
            for author in authorlist:
                new_author = re.split(r'\^c', author.strip())
                if new_author[0] == name:
                    if len(new_author) > 1:  # entry carries institution info after '^c'
                        temp = new_author[1]
                        id += 1
                        # NOTE(review): SQL built via str.format -- injection-prone.
                        sql = ins1.format(id, title, temp, authors, mname,
                                          index_terms, keywords, abstract,
                                          pdate, name)
                        handledata(sql)
                    else:
                        id += 1
                        sql = ins2.format(id, title, authors, mname,
                                          index_terms, keywords, abstract,
                                          pdate, name)
                        handledata(sql)
        except Exception as e:
            # Best-effort: report and continue with the next row.
            print(e)
    print("填充新表" + "\"" + name + "\"" + "完成!")
Exemplo n.º 18
0
def get_test_corpus(name, dict1):
    """Build and serialize the test corpus (articles lacking institution info).

    name  -- table name inside the `topics` schema
    dict1 -- gensim Dictionary mapping tokens to ids

    Writes the rescaled bag-of-words corpus to ./data/test_corpus.mm.
    """
    sql = "select title,keywords,abstract from topics.{0} where author_unique is NULL;".format(name)
    test_docs = []
    data = getdata(sql)
    for DOC in data:
        title = DOC[0]
        keywords = DOC[1]
        abstract = DOC[2]
        temp = "{};{};{}".format(title, keywords, abstract)
        # Round-trip each document through LTP word segmentation via temp files.
        with open("./data/temp1.txt", "w", encoding='utf-8') as f:
            f.write(temp.rstrip("\n"))
        ltp(r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp1.txt",
            r"D:\MyProject\pythonProjects\TopicMine\GetData\geology\data\temp2.txt")
        documents = parse_xml("./data/temp2.txt")
        test_docs.append(' '.join(documents))

    texts = [[word for word in t.split()] for t in test_docs]
    corpus_temp = [dict1.doc2bow(text) for text in texts]
    # Rescale each term count by the length of the document's bow.
    # NOTE(review): the denominator is the number of *distinct* terms, not
    # the total word count -- confirm this is the intended relative frequency.
    for v in range(len(corpus_temp)):
        for a in range(len(corpus_temp[v])):
            temp1 = corpus_temp[v][a][1]
            temp2 = corpus_temp[v][a][0]
            corpus_temp[v][a] = (temp2, temp1 / len(corpus_temp[v]))
    corpora.MmCorpus.serialize('./data/test_corpus.mm', corpus_temp)
Exemplo n.º 19
0
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

"""
@author: ake
@software: PyCharm Community Edition
@time: 2016/4/24 16:51
"""
from GetData.geology.evaluate import getlist
from GetData.preprocess import getdata


if __name__ == '__main__':
    # Match papers lacking author_unique against a known department list via
    # the institution strings attached to their coauthors.
    departlist = getlist('data/departs.txt')  # known departments, one per line
    name1 = '李勇'
    sql0 = "select id,authors from {0} where author_unique is NUll;".format(name1)
    data = getdata(sql0)
    for line in data:
        id1 = line[0]
        departs = []
        doc = line[1].strip().split(';')
        for author in doc:
            aa = author.split('^c')
            # Collect institutions of coauthors (not name1) that appear in
            # the known-department list.
            if aa[0] != name1 and len(aa) > 1 and aa[1] in departlist:
                departs.append(aa[1])
        # Accept only papers that match exactly one known institution.
        if len(departs) == 1:
            print('%d:%s' % (id1, departs[0]))
            with open('data/合著者机构匹配.txt', 'a', encoding='utf-8') as f:
                f.write('%d:%s' % (id1, departs[0]) + '\n')
Exemplo n.º 20
0
    # 相对词频 = 某词在文章中出现的次数/文章的总词数
    for v in range(len(new_corpus)):
        for a in range(len(new_corpus[v])):
            temp1 = new_corpus[v][a][1]
            temp2 = new_corpus[v][a][0]
            new_corpus[v][a] = (temp2, temp1 / len(new_corpus[v]))
    corpora.MmCorpus.serialize('./data/type.mm', new_corpus)

    # 生成并保存相似矩阵
    index = similarities.Similarity('./data',
                                    new_corpus,
                                    num_features=len(dictionary1),
                                    num_best=5)
    index.save('./data/sim.index')


if __name__ == '__main__':
    name1 = "李勇"
    # Collect the distinct institution labels already assigned to articles.
    depart_list = []
    sql1 = "select author_unique from {0} where author_unique is not NUll;".format(
        name1)
    for depart in getdata(sql1):
        if depart[0] not in depart_list:
            depart_list.append(depart[0])
    with open("./data/departs.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(depart_list))

    # Build and persist the dictionary plus the per-institution corpus.
    sql2 = "select title,keywords,abstract from topics.{0} where author_unique = '{1}';"
    get_corpus(sql2, name1, depart_list)
        #             print(a)
        #     print()

        if len(departs) == 1:
            for key in departs:
                if departs[key] > 1:    # 至少两个合著者
                    print("%d:%s\n" % (paper_id, key))
                    with open('./data/depart_coauthor_match.txt', 'a', encoding='utf-8') as f:
                        f.write("%d:%s\n" % (paper_id, key))
                    only_one_type += 1
                    aa = "update {0} set author_unique = '{1}' where id ={2};".format(name, key, int(paper_id))
                    handledata(aa)

    # print("可能匹配到机构信息的文章个数:" + str(paper_get_type))
    print("匹配到一个机构信息且至少两个合著者的文章个数:" + str(only_one_type))


if __name__ == '__main__':
    name1 = "李勇"

    # Count papers that still lack institution info.
    sql3 = "select author_unique from topics.{0} where author_unique is NUll;".format(name1)
    num_no_depart = len(getdata(sql3))
    print("没有机构信息的文章有%d篇" % num_no_depart)

    # Build the coauthor dictionary of authors that already have institution info.
    sql1 = "select author_unique,authors from topics.{0} where author_unique is not NUll;".format(name1)
    coauthor_dict = get_coauthor_dict(sql1, name1)

    # Match the remaining papers against the coauthor dictionary.
    sql2 = "select authors,id from topics.{0} where author_unique is NUll;".format(name1)
    coauthor_dict_match(sql2, coauthor_dict, name1)
Exemplo n.º 22
0
        new_dict.append(aa)
    dictionary1 = corpora.Dictionary(new_dict)
    dictionary1.save('./data/type.dict')  # 保存为本地词典

    new_corpus = [dictionary1.doc2bow(text) for text in texts]
    # 相对词频 = 某词在文章中出现的次数/文章的总词数
    for v in range(len(new_corpus)):
        for a in range(len(new_corpus[v])):
            temp1 = new_corpus[v][a][1]
            temp2 = new_corpus[v][a][0]
            new_corpus[v][a] = (temp2, temp1 / len(new_corpus[v]))
    corpora.MmCorpus.serialize('./data/type.mm', new_corpus)

    # 生成并保存相似矩阵
    index = similarities.Similarity('./data', new_corpus, num_features=len(dictionary1), num_best=5)
    index.save('./data/sim.index')

if __name__ == '__main__':
    name1 = "李勇"
    # Collect the distinct institution labels already assigned to articles.
    depart_list = []
    sql1 = "select author_unique from {0} where author_unique is not NUll;".format(name1)
    for depart in getdata(sql1):
        if depart[0] not in depart_list:
            depart_list.append(depart[0])
    with open("./data/departs.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(depart_list))

    # Build and persist the dictionary plus the per-institution corpus.
    sql2 = "select title,keywords,abstract from topics.{0} where author_unique = '{1}';"
    get_corpus(sql2, name1, depart_list)