Пример #1
0
def updateAuthorIDfromWholeData(authorID):
    """
    更新原库作者信息
    :param authorID:
    :return: 作者ID,作者机构ID
    """
    server = dbIO(originDBStr)
    sql = "select id,affillist from authorlist where aid = '%s'" % authorID
    wholerows = server.load(sql)
    if len(wholerows) > 0:
        singleID = wholerows[0][0]
        affilid = wholerows[0][1].split("|")[0]
    else:
        server2 = dbIO(newDBStr)
        sql = "select * from authortest where aid = '%s'" % authorID
        print(sql)
        alterdata = server2.load(sql)[0]
        affilid = alterdata[9]
        # print(affilid,alterdata)
        if len(affilid) > 9:
            affilid = selectAffilName(alterdata[9].split("|"))

        result = updateAffilIDfromWholeData(affilid)
        if result == -1: affilid = ''
        sql = "insert into authorlist (aid,url,fullname,simname,firstname,lastname,simlastname," \
              "articlelist,affillist)values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')"\
              % (alterdata[1], alterdata[2], alterdata[3].replace("'", "''"), alterdata[4].replace("'", "''"),
                 alterdata[5].replace("'", "''"), alterdata[6].replace("'", "''"), alterdata[7], '', affilid)
        # print(sql)
        server.save(sql)
    return authorID, affilid
Пример #2
0
def getAlterArticleData():
    """
    为原库更新论文信息,插入信息到articlelist
    :return:
    """
    server = dbIO(newDBStr)
    sql = "select * from articletest"
    articleData = server.load(sql)
    for item in articleData:
        authorlist = item[3].split("|")
        affillist = item[4].split("|")
        if len(authorlist) > 200 or checkTitleInWholeData(
                item[5]):  # pass the reviewers paper 作者过多或出现过的论文无用
            continue
        # print(item,len(authorlist),len(affillist))
        newAuthorlist = []
        newAffillist = []
        for author in authorlist:
            if author == '': continue
            authorID, affilID = updateAuthorIDfromWholeData(author)
            newAuthorlist.append(authorID)
            newAffillist.append(affilID)
        server2 = dbIO(originDBStr)
        sql = """insert into articlelist (sid,doi,authorlist,affillist,
            title,abstract,keywords,date,journalName,articleType,abstractLang,citation) 
            values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')""" \
            % (item[1], item[2], "|".join(newAuthorlist), "|".join(newAffillist),
               item[5].replace("'", "''"), item[6].replace("'", "''"), item[7].replace("'", "''"),
               item[8], item[9].replace("'", "''"), item[10], item[11], item[12])
        # print(sql)
        server2.save(sql)
Пример #3
0
def updateSql(author, afil, pid):
    """
    更新作者表与机构表内容
    :param author: 作者信息字典
    :param afil: 机构信息字典
    :param pid: 文章ID
    :return:
    """
    server = dbIO()
    sql = "select * from authorlist where aid='%s'" % (author["id"])
    if server.count(sql) <= 0:
        if not afil:
            affiliation = ""
        else:
            affiliation = author["affiliation"]
        sql = "insert into authorlist (aid,url,fullname,simname,firstname,lastname,simlastname,articlelist,affillist)" \
              " values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
              % (author["id"], author["url"], author["fullname"], author["simname"], author["firstname"],
                 author["lastname"], author["simlastname"], pid, affiliation)
        # print sql
        server.save(sql)
        if afil:
            sql = "select * from affillist where afid='%s'" % (afil["id"])
            if server.count(sql) <= 0:
                sql = "insert into affillist (afid,name,city,country,url) values ('%s','%s','%s','%s','%s')" \
                      % (afil["id"], afil["name"], afil["city"], afil["country"], afil["url"])
                server.save(sql)
Пример #4
0
def getTittle():
    """
    :return: 标题与flag情况,对于scopus ID为空的数据(Scopus信息未更新或是找不到)
    """
    server = dbIO()
    sql = "select title,flag from searchlist2 where sid=''"
    return server.load(sql)
Пример #5
0
def getArticleInfo():
    if True:  # try:
        server = dbIO()
        sql = "select sid from searchlist2 where flag = 0  and" \
              " id > (SELECT max(id) FROM paper_data.searchlist2 where flag = 1)"
        totalNum = server.count(sql)
        if totalNum > 0:
            finishNum = 0
            passNum = 0
            print('totalNum:' + str(totalNum))
            keIndex = checkScopusKeyIndex()
            for row in server.load(sql):
                sid = row[0]
                passNum += 1
                try:
                    infodict, authorList, affilList = getInfofromAbstractAPI(
                        int(sid), keIndex)
                    if infodict == []:
                        continue
                    flag = saveWholeArticletoDB(infodict, authorList,
                                                affilList)
                    if flag > -1:
                        sql = "update searchlist2 set flag = 1 where sid = " + sid
                        finishNum += 1
                        print("No." + str(finishNum) + " Finish: " + sid +
                              " PassNum:" + str(passNum - finishNum))
                        server.save(sql)

                except Exception as e:  # ConnectionResetError
                    print("Wrong: " + sid + "*" * 20)
                    print(e)
                    continue
Пример #6
0
def singlePaperRun(paperID, keyIndex):
    """
    每篇论文的处理,对应于一个singleThread对象
    :param paperID: 论文ID
    :param keyIndex:
    :return:  异常或没有信息返回,则返回0,更新论文信息并更新列表(searchlist)的flag =1,返回1
    """
    try:
        global onceSuccess
        start = time.clock()
        server = dbIO()
        infodict, authorList, affilList = getInfofromAbstractAPI(paperID, keyIndex)
        if infodict is []:
            return 0
        flag = saveWholeArticletoDB(infodict, authorList, affilList)
        if flag == 1:
            sql = "select id from searchlist where sid = " + paperID
            sql = "update searchlist set flag = 1 where id = " + str(server.load(sql)[0][0])
            server.save(sql)
        end = time.clock()
        onceSuccess = onceSuccess + 1
        print("Thread%s:Finished_CostTime:%d" % (paperID, (end - start)))
        return 1
    except ConnectionResetError as e:
        print("wrongThread%s:%s" % (paperID, str(e)))
        return 0
Пример #7
0
def udpateSearchlist():
    """
    从新库中获取新的文章信息添加到原库中,更新检索列表
    :return:
    """
    server = dbIO(newDBStr)
    sql = """select title,url,year,publication_id,flag,sid from searchlist"""
    datarows = server.load(sql)
    server = dbIO(originDBStr)
    print(len(datarows))
    for data in datarows:
        title, url, year, publication_id, flag, sid = data
        if not sid: sid = ""
        sql = """insert into searchlist2 (title,url,year,publication_id,flag,sid) 
              values ('%s','%s','%s','%d','%d','%s')""" \
              % (title.replace("'", "''"), url.replace("'", "''"), year.replace("'", "''"),
                 publication_id, flag, sid.replace("'", "''"))
        server.save(sql)
Пример #8
0
def loadTaskList(key):
    """
    :param key: 关键词
    :return: 查询未被处理的任务对应列表
    """
    dbIOserver = dbIO()
    sql = "select id, totalNum, query from tasklist where keyword = '" + key + "' and flag != 1"
    data = dbIOserver.load(sql)
    taskList = [[int(item[0]), item[2]] for item in data]
    return taskList
Пример #9
0
def getMaxFrequentAffilName(affillist):
    """
    for mutil-affil get the most frequent one (模糊获取高频机构)
    :param affillist: 机构列表
    :return: 打印机构与对应文章数
    """
    server = dbIO(originDBStr)
    for affil in affillist:
        sql = "select count(*) from articlelist where affillist like '%" + affil + "%'"
        num = server.load(sql)[0][0]
        print(affil, num)
Пример #10
0
def selectAffilName(affillist):
    """
    选取机构名称,优先选取第一机构
    :param affillist:
    :return: 机构的ID
    """
    server = dbIO(newDBStr)
    for affilId in affillist:
        sql = "select * from affiltest where afid = '%s'" % affilId
        if server.count(sql) > 0:
            return affilId
Пример #11
0
def getAuthorlist(pid):
    """
    :param pid: 文章ID
    :return: 获取同一文章的全部作者列表
    """
    server = dbIO()
    authorlist = []
    sql = "select aid from authorlist where articlelist='%s'" % pid
    datarows = server.load(sql)
    for row in datarows:
        authorlist.append(row[0])
    return authorlist
Пример #12
0
def checkTitleInWholeData(title):
    """
    检查论文是否在原库出现过
    :param title: 文章标题
    :return: True表示文章出现过,否则未出现
    """
    server = dbIO(originDBStr)
    sql = "select * from articlelist where title = '%s'" % (title.replace(
        "'", "''"))
    if server.count(sql) > 0:
        return True
    else:
        return False
Пример #13
0
def authoraffil(authorlist):
    """
    :param authorlist: 作者ID列表
    :return:  作者对应机构ID列表
    """
    server = dbIO()
    affillist = []
    for aid in authorlist:
        sql = "select affillist from authorlist where aid='%s'" % aid
        datarows = server.load(sql)
        for row in datarows:
            affillist.append(row[0])
    return affillist
Пример #14
0
def getArticleInfo():
    """
    对于searchlist中的flag,如果flag为1,那么有scopus id,并且完成操作
    如果flag为2可能,在插入有scops id数据时出现错误
    如果flag为3,则进入补充数据Supply data阶段,表示数据补充完成,但其sid为semantic id
    :return:
    """
    if True:
        # 获取没有scopus id的数据部分
        server = dbIO()
        sql = "select title,flag,sid from searchlist2 where flag=0 and sid is NULL"
        totalNum = server.count(sql)
        if totalNum > 0:
            finishNum = 0
            passNum = 0
            print('totalNum:'+str(totalNum))
            keIndex = checkScopusKeyIndex()
            for row in server.load(sql):
                title, flag, sid = row
                passNum += 1
                # try: 依据标题获取scopus id
                sid = dealInfo(titleGetRes(title), title)
                print("sid:", sid)
                # 有sid存在时
                if sid:
                    print(title, flag)
                    # 获取文章信息,作者信息,机构信息
                    infodict, authorList, affilList = getInfofromAbstractAPI(int(sid), keIndex)

                    if infodict is []:
                        continue
                    flag = saveWholeArticletoDB(infodict, authorList, affilList)
                    if flag > -1:
                        sql = "update searchlist2 set flag = 1 where sid = '%s'" % sid
                        finishNum += 1
                        print("No."+str(finishNum)+" Finish: "+sid+" PassNum:"+str(passNum-finishNum))
                        server.save(sql)
                # 不存在sid,从semantic进行数据补充处理
                else:
                    print(title, flag)
                    m = getJsonDict(title)
                    # semantic也不存在与该标题一致的数据,依据作者信息搜索相似文章
                    if not m:
                        continue
                    else:
                        for author in m["authors"]:
                            if len(author["ids"]) == 0:
                                authorGetRes(author["name"], [], m["id"])
                            else:
                                authorGetRes(author["name"], author["ids"][0], m["id"])
                        findArticle(m)
Пример #15
0
def updateAffilIDfromWholeData(affilID):
    """
    原库中没有该机构而新库中存在,查询到机构的信息并插入到原库的机构表中
    :param affilID: 机构的ID
    :return: 原库存在该机构,返回其ID,新库不存在该机构返回-1,否则返回0
    """
    server = dbIO(originDBStr)
    sql = "select id from affillist where afid = '%s'" % affilID
    wholerows = server.load(sql)
    if len(wholerows) > 0:
        singleID = wholerows[0][0]
        return singleID
    else:
        server2 = dbIO(newDBStr)
        sql = "select * from affiltest where afid = '%s'" % affilID
        data = server2.load(sql)
        if len(data) == 0: return -1
        alterdata = data[0]
        sql = "insert into affillist (afid,name,city,country,url) values ('%s','%s','%s','%s','%s')" \
              % (alterdata[1], alterdata[2].replace("'", "''"), alterdata[3].replace("'", "''"),
                 alterdata[4].replace("'", "''"), alterdata[5].replace("'", "''"))
        # print(sql)
        server.save(sql)
        return 0
Пример #16
0
def saveTaskList(taskList, key):
    """
    :param taskList: [任务数量,Scopus高级子命令],getTaskQueryList计算获得
    :param key: 关键词(例如人工智能)
    :return: 保存到数据库的任务列表的数目
    """
    dbIOserver = dbIO()
    insertNum = 0
    for task in taskList:
        sql = "select * from tasklist where query = '" + task[1] + "'"
        if dbIOserver.count(sql) == 0:
            sql = "INSERT INTO tasklist (keyword,query,totalNum,taskType) VALUES " \
                  "('" + key + "','" + task[1] + "','" + str(task[0]) + "','test')"
            if dbIOserver.save(sql) > 0:
                insertNum += 1
    return insertNum
Пример #17
0
def getTasks(key):
    """
    入口函数
    :param key:关键词
    :return: 没有关键词对应的任务,则获取依关键词查找的全部子命令,并存成任务列表到数据库,有对应任务则查看,返回任务列表
    """
    dbIOserver = dbIO()
    sql = "SELECT * FROM tasklist where keyword = '" + key + "'"
    # print(sql)
    taskList = []
    if dbIOserver.count(sql) <= 0:
        queryList = buildOriginTask(key)
        taskList = getTaskQueryList(queryList)
        saveTaskList(taskList, key)
    else:
        taskList = loadTaskList(key)
    return taskList
Пример #18
0
def main(mThread):
    """
    构建Scopus ID对应的线程池,其中线程的个数由searchNum决定,不断停止与启动其中线程资源,在mainThread对象中run调用开启多线程
    :param mThread: 当前线程
    :return: 已有scopus ID的未更新数据的条数
    """
        
    global onceSum, onceSuccess, control_flag
    onceSum = 0
    onceSuccess = 0

    # 计数并查询searchNum条更新的Scupus库中数据信息的论文信息
    sql = r"""select id,sid from searchlist where flag = 0 and sid != '' order by rand()
          limit 0,%d;""" % searchNum
    
    threadList = []
    server = dbIO()
    onceSum = server.count(sql)

    keyIndex = checkScopusKeyIndex()
    print("Available Key Index:", keyIndex, " Rent Keys:", len(Settings.apikeyList) - keyIndex)
    
    ISOTIMEFORMAT = '%X'
    print("Get %s Papers Into Cawler %s" % (onceSum, time.strftime(ISOTIMEFORMAT, time.localtime())))
           
    if onceSum > 0:
        # control thread working
        control_flag = 1
        maxID = '0'
        minID = '0'

        for row in server.load(sql):
            threadList.append(singleThread(row[1], keyIndex))
            if minID == '0':
                minID = row[0]
        maxID = row[0]
        _thread.start_new_thread(checkThreadActive, (threadList, mThread))
        for single in threadList:
            single.start()
            time.sleep(0.25)

        print("Threads %s-%s Running" % (minID, maxID))
    return onceSum
Пример #19
0
def searchArticlesByQuery(query, taskID):
    """
    :param query: Scopus高级检索命令语句
    :param taskID: 任务序号
    :return: 插入文章信息到searchlist数据库中
    """
    dbIOserver = dbIO()
    totalNum = getTotalNumFromSearchAPI(query)
    print("searchResults:" + str(totalNum))
    sql = "SELECT * FROM searchlist where taskID = " + str(taskID)
    dataLength = dbIOserver.count(sql)
    print(dataLength)
    repeatNum = 0
    lastNum = 0
    maxRepeatNum = 5
    while (totalNum > 0 and dataLength < totalNum
           and dataLength < 5000) and repeatNum <= maxRepeatNum:
        if lastNum != totalNum + dataLength:
            repeatNum = 0
            lastNum = totalNum + dataLength
        else:
            repeatNum += 1
        articlesList = getArticlefromAPI(query, totalNum, dataLength)
        insertNum = 0
        for article in articlesList:
            sql = "select * from searchlist where sid = " + article['id']
            if dbIOserver.count(sql) == 0:
                sql = "INSERT INTO searchlist (sid,doi,flag,taskID) VALUES ('" + str(article['id'])\
                      + "','" + article['doi'] + "',0,'"+str(taskID)+"')"
                if dbIOserver.save(sql) > 0:
                    insertNum += 1
        dataLength += len(articlesList)
        print("dataLength:" + str(dataLength))

    if repeatNum > 3:
        sql = "update tasklist set flag = 0 where id = " + str(taskID)
        dbIOserver.save(sql)
        return
    sql = "update tasklist set flag = 1 where id = " + str(taskID)
    dbIOserver.save(sql)
Пример #20
0
def addArticle(dict):
    """
    :param dict: 文章信息字典(包含作者ID列表与机构ID列表)
    :return: 文章数据库更新,从semantic补充,故不含scopus ID
    """
    authors = ""
    authorlist = dict["authorlist"]
    for author in authorlist:
        authors += author + "|"
    # print (authors[:-1])
    keywords = ""
    keywordlist = dict["keywords"]
    for keyword in keywords:
        keywords += keyword + "|"
    affils = ""
    affillist = dict["affillist"]
    for affil in affillist:
        affils += affil + "|"
    # print (affils[:-1])
    # add in datebase
    server = dbIO()
    sql = "select * from articlelist where sid='%s'" % (dict["sid"])
    if server.count(sql) <= 0:
        sql = "insert into articlelist(sid,doi,authorlist,affillist,title,abstract,keywords,date," \
              "journalName,articletype,abstractLang,citation)" \
              "values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%d')" \
              % (dict["sid"], dict["doi"], authors[:-1], affils[:-1], dict["title"].replace("'", "''"),
                 dict["abstract"].replace("'", "''").replace("\n\n\n", " |").replace("\n", "| "),
                 keywords[:-1].replace("'", "''"), dict["date"], dict["journalName"],
                 dict["articleType"], dict["abstractLang"], dict["citation"])
        server.save(sql)
        sql = "select flag from searchlist2 where title='%s'" % (
            dict["title"].replace("'", "''"))
        # print (dict["title"],server.load(sql))
        server.load(sql)
        flag = server.load(sql)[0][0]
        if flag == 0:
            sql = "update searchlist2 set flag=3 where title='%s'" % (
                dict["title"].replace("'", "''"))
            server.save(sql)
Пример #21
0
def dealInfo(dict, title):
    """
    更新检索列表searchlist2,flag=2为模糊匹配的标题,flag=1 为完全匹配的标题
    :param dict: sid与rel字典
    :param title: 标题
    :return: 有sid则返回sid,没有则返回None
    """
    if "sid" in dict:
        server = dbIO()
        # sql = "select * from searchlist2 where sid='%s'" % (dict["sid"])
        if True:  # server.count(sql) <= 0:
            if dict["rel"] == 0:
                sql = "update searchlist2 set flag=1,sid='%s'where title='%s'" % (
                    dict["sid"], title.replace("'", "''"))
                server.save(sql)
            elif dict["rel"] <= 0.1:
                sql = "update searchlist2  set flag=2,sid='%s'where title='%s'" % (
                    dict["sid"], title.replace("'", "''"))
                server.save(sql)
        print("contains sid dict:", dict)
        return dict["sid"]
    else:
        return None
Пример #22
0
def saveWholeArticletoDB(infoDict, authorList, affilList):
    """
    保存爬取到的内容到数据库中
    :param infoDict: 文章信息字典
    :param authorList: 对应的作者信息列表(可能多作者)
    :param affilList: 对应的机构信息列表(可能多机构)
    :return:
    """
    server = dbIO()
    sql = "select * from articlelist where sid = '%s'" % (infoDict['id'])
    if server.count(sql) != 0:
        return 0
    # save the affilication data
    affils = ""
    for affil in affilList:
        affils += affil['id'] + "|"
        sql = "select * from affillist where afid = " + affil['id']
        if server.count(sql) == 0:
            sql = "insert into affillist (afid,name,country,city,url) VALUES ('" + affil[
                'id'] + "','" + affil['name'] + "','" + affil[
                    'country'] + "','" + affil['city'] + "','" + affil[
                        'url'] + "')"
            sta = server.save(sql)
        """
        else:
            sql = "udpate affillist set name = '" + affil['name'] + "',city = '" + affil['city'] + "',country = '" + affil['country'] + "', url = '" + affil['url'] + "' where afid = " + affil['id']
            sta = cur.execute(sql)"""
    affils = affils[:-1]

    # save the author data
    authors = ""
    for author in authorList:
        authors += author['id'] + "|"
        sql = "select * from authorlist where aid = " + author['id']
        if server.count(sql) == 0:
            affiliation = author['affiliation']
            affilids = ""
            for affil in affiliation:
                affilids += affil + "|"
            articles = infoDict['id']
            sql = "insert into authorlist (aid,url,fullname,simname,firstname,lastname,simlastname,affillist,articlelist) VALUES ('" +\
                  author['id'] + "','" + author['url'] + "','" + author['fullname'] + "','" + author['simname'] + "','" +\
                  author['firstname'] + "','" + author['lastname'] + "','" + author['simlastname'] +\
                  "','" + affilids[:-1] + "','" + articles + "')"
            sta = server.save(sql)
        else:
            articles = server.load(sql)[0][8]
            articleStrs = articles.split("|")
            if infoDict['id'] not in articleStrs:
                articles += "|" + infoDict['id']
                sql = "select id from authorlist where aid = " + author['id']
                if server.count(sql) == 0:
                    sql = "update authorlist set articlelist = '" + articles + "' where id = " + server.load(
                        sql)[0][0]
                    sta = server.save(sql)
    authors = authors[:-1]

    # save the article data
    keywords = ""
    for key in infoDict['authorKeywords']:
        keywords += key.replace("'", "''") + "|"
    keywords = keywords[:-1]
    #print(infoDict)
    sql = "insert into articlelist (sid,doi,title,abstract,journalName,articleType,date,citation,abstractLang,keywords,authorlist,affillist) VALUES ('" + \
          infoDict['id'] + "','" + infoDict['doi'] + "','" + infoDict['title'] + "','" + infoDict['abstract'] + "','" + \
          infoDict['journal'] + "','" + infoDict['articleType'] + "','" + infoDict['date'] + \
          "','" + infoDict['citation'] + "','" + infoDict['abstractLang'] + "','" + keywords + \
          "','" + authors + "','" + affils + "')"
    sta = server.save(sql)
    return 1