Пример #1
0
Файл: test.py Проект: haha8x/eds
def keywords_save():
    """Write every non-empty ``keyword`` of ``paper_clean1`` to ``keywords.txt``.

    Rows are fetched through ``dbs.getDics`` one page at a time using
    ``LIMIT offset,count`` with a page size of 1,000,000. Paging stops at the
    first empty page, or once the offset reaches 90,000,000. Each keyword is
    written on its own line. The current offset and a timestamp are printed
    as progress output.
    """
    page_size = 1000000
    max_offset = 90000000
    time_format = '%Y-%m-%d %H:%M:%S'

    # The context manager guarantees the file is closed even when a query or
    # a write raises part-way through the export.
    with open('keywords.txt', encoding='utf-8', mode='w') as keywords:
        print(time.strftime(time_format, time.localtime(time.time())))
        for i in range(0, max_offset, page_size):
            print(i)
            sql = 'select keyword from paper_clean1 limit %d,%d' % (
                i, page_size)
            paper_list = dbs.getDics(sql)
            if not paper_list:
                # An empty page means the table has been fully read.
                break
            print(time.strftime(time_format, time.localtime(time.time())))
            for paper in paper_list:
                # Skip rows whose keyword is empty or missing.
                if paper['keyword']:
                    keywords.write(paper['keyword'] + '\n')
Пример #2
0
Файл: test.py Проект: haha8x/eds
# Stop words, one per line. Reading inside a context manager closes the file
# handle as soon as the list has been built.
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]
# Set copy used for the per-token membership test below; scanning the list
# for every token would be linear in the number of stop words.
stopword_lookup = set(stopwords)

# Part-of-speech flags whose words are kept by the filter below.
fill = [
    'vn', 'n', 'nr', 'nr1', 'nr2', 'nrj', 'nrf', 'ns', 'nsf', 'nt', 'nz', 'nl',
    'ng'
]
print('词典更新')
jieba.load_userdict('userdict.txt')
# NOTE(review): `f` is neither written to nor closed in this part of the
# script; presumably that happens further down — confirm.
f = open('data/paperfenci.txt', 'w', encoding='utf8')
# Page through paper_clean1 in chunks of 500,000 rows.
for i in range(0, 10000000, 500000):

    sql = 'select id,name,abstract,keyword from paper_clean1 limit ' + str(
        i) + ',500000'
    paper_list = dbs.getDics(sql)
    if not paper_list:
        # An empty page means there are no more rows to segment.
        break
    print('分词:' + str(i))
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    # One list of kept words per paper of the current page.
    DocWord = []
    for paper in paper_list:
        # Title, abstract and keyword joined by single spaces, each with
        # surrounding newlines and then surrounding tabs stripped.
        line = ' '.join(
            paper[column].strip('\n').strip('\t')
            for column in ('name', 'abstract', 'keyword'))
        seg_list = pseg.cut(line)
        # Keep words with a wanted part-of-speech flag that are not stop words.
        words = [
            word for word, flag in seg_list
            if flag in fill and word not in stopword_lookup
        ]
        DocWord.append(words)
Пример #3
0
from algorithm.base import dbs

sql = "SELECT _id,discipline_subject FROM `journal`"

temp = dbs.getDics(sql)
# Maps a journal `_id` to the first '-'-separated part of its
# `discipline_subject`.
dic = {}
for t in temp:
    sub = t['discipline_subject']
    # A dedicated name is used here instead of rebinding the builtin `list`.
    subject_parts = sub.split('-')
    # NOTE(review): this tests the subject prefix against keys that are
    # `_id` values; `t["_id"] not in dic` may have been intended — confirm
    # before changing. The original condition is kept as-is.
    if subject_parts[0] not in dic:
        dic[t["_id"]] = subject_parts[0]

sql = "SELECT id,org_id,author_id FROM paper_clean1"
paper = dbs.getDics(sql)
# Maps author_id -> {org_id: number of that author's papers with that org_id}.
teacher = {}
i = 0
for p in paper:
    # Rows with org_id == -1 are ignored.
    if p['org_id'] != -1:
        item = {}
        item["father_id"] = p["id"]
        item["type"] = "1"
        # Raises KeyError when the org_id has no entry in `dic`.
        item["prefix"] = dic[p['org_id']]
        records = {"table": "LdaPrefix", "params": item}
        dbs.insertItem(records)
        # Count this paper under its (author, org) pair.
        org_counts = teacher.setdefault(p["author_id"], {})
        org_counts[p['org_id']] = org_counts.get(p['org_id'], 0) + 1
Пример #4
0
    ]
    for t in result:
        if t['fields'] is not None:
            list.append(t)
            continue
        f = []
        for j in range(random.randint(0, 4) + 1):
            f.append(field[random.randint(0, len(field)) - 1])
        t['fields'] = f
        list.append(t)
    return list


print('导出数据')
# Papers joined with their author's teacher record; the radar columns
# (citation, paper_num, h_index) come from a LEFT JOIN inside the subquery.
sql = (
    'SELECT paper_clean1.id,paper_clean1.author_id,paper_clean1.`name` as title,'
    'paper_clean1.abstract,t.name,t.school,t.institution,t.citation,t.paper_num,'
    't.h_index,t.fields from paper_clean1 JOIN ('
    'select radar.author_id,radar.citation,radar.paper_num,radar.h_index,'
    'teacher.name,teacher.school,teacher.institution,teacher.fields '
    'from teacher LEFT JOIN radar on teacher.id =radar.author_id ) as t '
    'on paper_clean1.author_id=t.author_id'
)
list = dbs.getDics(sql)
print(len(list))

# Every row of school_info; searched by name in getScool().
sql = "SELECT * FROM school_info"
schools = dbs.getDics(sql)


def getScool(name, candidates=None):
    """Return the school records whose ``'name'`` value contains *name*.

    Args:
        name: Substring to look for in each record's ``'name'`` value.
        candidates: Optional iterable of records indexable by ``'name'``.
            When omitted, the module-level ``schools`` collection is
            searched, which is what existing callers rely on.

    Returns:
        A new list with the matching records in their original order; the
        list is empty when nothing matches.
    """
    if candidates is None:
        candidates = schools
    return [school for school in candidates if name in school['name']]


for l in list:
Пример #5
0
# Fallback codes, keyed by the `xueke2` value, for rows that have no code.
xueDic = {
    "中国史": "06",
    "农业资源与环境": "09",
    "图书情报与档案管理": "12",
    "城乡规划学": "12",
    "安全科学与工程": "0819",
    "戏剧与影视学": "05",
    "考古学": "06",
    "艺术学理论": "05",
    "草学": "09",
    "设计学": "05",
    "软件工程": "0812",
    "音乐与舞蹈学": "05",
    "风景园林学": "05",
}
# `sql` is assigned before this point, outside the visible part of the script.
xue = dbs.getDics(sql)
for row in xue:
    code = row["code"]
    if code is None:
        # No code stored: take it from the fallback table (KeyError if absent).
        row["code"] = xueDic[row["xueke2"]]
    else:
        # Codes starting with 07 or 08 keep four characters, the rest two.
        keep = 4 if code[:2] in ("07", "08") else 2
        row["code"] = code[:keep]

sql = 'select * from teacher_dis_code'
list = dbs.getDics(sql)
# Parameterised query; the %s placeholder is filled in per row further down.
sql = "SELECT * FROM school_info where name like %s"

for l in list:
    params = (str(l["school"]), )
Пример #6
0
def get_words():
    """Return the distinct ``topic_value`` rows of ``lda`` for matching topics.

    The query keeps rows whose ``topic`` contains the text ``计算机`` and
    returns whatever ``dbs.getDics`` yields for it.
    """
    query = "SELECT DISTINCT topic_value FROM `lda` where topic like '%计算机%'"
    return dbs.getDics(query)
Пример #7
0
import json
from algorithm.base import dbs

print('导出数据')
# Papers joined with the teacher row of their author_id.
sql = (
    'SELECT paper.id,paper.author_id,paper.author,teacher.name,teacher.institution '
    'from paper  JOIN  teacher  on paper.author_id=teacher.id'
)
list = dbs.getDics(sql)
# All teachers, used below to match co-author names.
sql = 'SELECT id,name,school,institution from  teacher'
teacher = dbs.getDics(sql)
print('开始关联')
# Remove spaces from teacher names so they compare equal to the
# space-stripped names used in the matching loop.
for row in teacher:
    row['name'] = row['name'].replace(' ', '')
# Filled by the matching loop that follows.
egoNet = dict()
for l in list:
    id = str(l['author_id'])
    try:
        author = json.loads(l['author'])
    except:
        print(l)
    name = l['name'].replace(' ', '')
    for a in author:
        if len(a) >= 1 and name != a['name']:
            find = False
            for t in teacher:
                if a['name'] == t['name'] and (len(
                        a['org']) == 0 or a['org'].find(t['school']) >= 0):
                    find = True
                    key = id + '_' + str(t['id']) + '_' + a['name']
                    if key in egoNet.keys():
                        if l['id'] not in egoNet[key]:
                            egoNet[key].append(l['id'])
                    else: