def compareCoauthor(xID, yID):
    """Return the share of co-author ids that two experts have in common.

    For each expert, the ``eid`` of every ``paper`` row that shares a
    ``paperid`` with one of the expert's own rows is fetched. Both result
    lists are passed to ``compareLen`` (presumably it returns the shorter
    list first -- confirm against its definition). Entries of the first list
    are matched against the second by ``eid``; rows belonging to ``xID`` or
    ``yID`` themselves are ignored on both sides.

    Returns:
        -1 when either query returns no rows, otherwise the number of matched
        entries divided by the length of the first list, rounded to four
        decimals.
    """
    sqlHead = 'select eid from paper where paperid in (select paperid from paper where eid='
    xRows = getResult(sqlHead + str(xID) + ')', cur)
    yRows = getResult(sqlHead + str(yID) + ')', cur)
    if 0 in (len(xRows), len(yRows)):
        return -1

    shortRows, longRows = compareLen(xRows, yRows)
    selfIds = (xID, yID)
    matched = 0
    for shortRow in shortRows:
        if shortRow['eid'] in selfIds:  # skip the two experts themselves
            continue
        # A single hit in the other list is enough; any() stops at the first.
        if any(longRow['eid'] == shortRow['eid']
               for longRow in longRows
               if longRow['eid'] not in selfIds):
            matched += 1

    return round(matched / len(shortRows), 4)
def compareSame(xID, yID):
    """Collect the similarity scores for a pair of experts.

    The names of both experts are read from ``dlurl1`` and rated with
    ``nameSameOrNot``. When that rate is 0 no further comparison is made.

    Returns:
        An empty dict when the name rate is 0; otherwise a dict with the keys
        ``nameRate``, ``paper``, ``coauthor``, ``institu`` and ``topic``
        holding the results of the respective compare functions.
    """
    nameQuery = 'select name from dlurl1 where id='
    xName = getResult(nameQuery + str(xID), cur)[0]['name']
    yName = getResult(nameQuery + str(yID), cur)[0]['name']

    # Design note kept from the original author: the overall result should be
    # an N*N matrix whose cells hold a 4-element vector; the database row is
    #   id xid yid nameThreashold(0-1) nameRate(0-1) threshold(0-1)
    #   paper coauthor institu topic
    # with the last four expressed as 0/1.
    # Intended rule: identical names get a slightly lower match rate,
    # merely similar names a somewhat higher one.
    nameSameRate = nameSameOrNot(xName, yName)
    if nameSameRate == 0:
        return {}  # names too different: nothing else is compared

    return {
        'nameRate': nameSameRate,
        'paper': comparePaper(xID, yID),
        'coauthor': compareCoauthor(xID, yID),
        'institu': compareInstit(xID, yID),
        'topic': compareTopic(xID, yID),
    }
def countInstitu2():
    """Store institution scores for every member pair of each name pair.

    Reads all rows of ``namecompare``. For each row the ``name2name`` rows
    whose ``gid`` equals the row's ``xid`` and ``yid`` are fetched, and for
    every combination of their ``yid`` values ``compareInstit`` is evaluated
    and one row is inserted into ``name2compare`` (committed immediately).
    ``tag`` is the name rate multiplied by 100 and truncated to an integer.
    """
    pairRows = getResult('select id,xid,yid,nameRate from namecompare', cur)
    print('read completed')

    memberQuery = 'select id,yid from name2name where gid='
    insertHead = 'insert into name2compare (xid,yid,nameRate,tag,institu) values ('
    for pairRow in pairRows:
        xGroup = str(pairRow['xid'])
        yGroup = str(pairRow['yid'])
        nameRate = str(pairRow['nameRate'])
        tag = str(int(float(nameRate) * 100))
        xMembers = getResult(memberQuery + xGroup, cur)
        yMembers = getResult(memberQuery + yGroup, cur)
        for xMember in xMembers:
            for yMember in yMembers:
                value = compareInstit(xMember['yid'], yMember['yid'])
                xMemberId = str(xMember['yid'])
                yMemberId = str(yMember['yid'])
                fields = [xMemberId, yMemberId, nameRate, tag, str(value)]
                cur.execute(insertHead + ','.join(fields) + ')')
                conn.commit()
                print('completed: ' + xMemberId + ' ' + yMemberId)
        print('completedID: ' + str(pairRow['id']))
def comparePaper2(xID, yID):
    """Return the ratio of papers two experts share, by DOI or year+title.

    Two paper rows count as the same paper when their stripped DOIs are equal,
    or when their ``time`` (compared as ints) and ``title`` are both equal.
    Both result lists are passed to ``compareLen`` (presumably it returns the
    shorter list first -- confirm against its definition); each entry of the
    first list is counted once if any entry of the second list matches it.

    Returns:
        -1 when either expert has no paper rows, otherwise
        ``matches / len(first list)``.
    """
    xCheckSQL = 'select title,doi,time from paper where eid=' + str(xID)
    yCheckSQL = 'select title,doi,time from paper where eid=' + str(yID)
    xResult = getResult(xCheckSQL, cur)
    yResult = getResult(yCheckSQL, cur)
    if len(xResult) * len(yResult) == 0:
        return -1
    sResult, lResult = compareLen(xResult, yResult)

    def samePaper(sR, lR):
        # NOTE(review): two rows with empty DOIs also compare equal here;
        # confirm whether empty DOIs should be excluded.
        if sR['doi'].strip() == lR['doi'].strip():
            return True
        # Fixed: the title used to be compared against the list literal
        # ['title'], so this year+title fallback could never match.
        return (int(sR['time']) == int(lR['time'])
                and sR['title'] == lR['title'])

    single = 0
    for sR in sResult:
        # Stop at the first match; one hit per entry is enough.
        if any(samePaper(sR, lR) for lR in lResult):
            single += 1
    return single / len(sResult)
def cleanPaperInstituNull():
    """Delete experts that have no paper rows or no experience rows.

    Every distinct ``id`` of ``dlurl1`` is checked. When the expert has no
    row in ``paper``, the expert is deleted from ``dlurl1``, ``experience1``
    and ``topic``. Otherwise, when the expert has no row in ``experience1``,
    the expert is deleted from ``dlurl1``, ``paper`` and ``topic``. Each
    delete is committed on its own and progress is printed per expert.
    """
    expertRows = getResult('select distinct id from dlurl1', cur)
    total = len(expertRows)
    # (table that must contain rows, delete targets used when it does not)
    checks = (
        ('paper',
         ('dlurl1 where id=', 'experience1 where eid=', 'topic where eid=')),
        ('experience1',
         ('dlurl1 where id=', 'paper where eid=', 'topic where eid=')),
    )
    for i, expertRow in enumerate(expertRows, start=1):
        expertId = str(expertRow['id'])
        progress = str(round(i / total, 3))
        for checkedTable, purgeTargets in checks:
            found = getResult(
                'select id from ' + checkedTable + ' where eid=' + expertId,
                cur)
            if len(found) > 0:
                continue
            for target in purgeTargets:
                cur.execute('delete from ' + target + expertId)
                conn.commit()
            print('deleted: ' + progress + ' ||  id:' + expertId)
            break
        else:
            # Both checks found rows: the expert is kept.
            print('completed check: ' + progress + ' ||  id:' + expertId)
# Example #6
# 0
def cleanPaper():
    """Assign a per-expert ``paperid`` to paper rows, merging duplicates.

    For every expert id in ``dlurl1`` the expert's ``paper`` rows are loaded.
    The first unassigned row at position ``i`` receives ``paperid = i``; every
    later unassigned row with the same ``doi``, or the same ``time`` and
    ``title``, receives that same ``paperid``. Each update is committed
    immediately.

    Fixes relative to the previous version:
      * the per-expert work list is created for each expert (it was never
        initialised);
      * the "already assigned" flag is really set (``==`` was used instead of
        ``=``), and it is checked before a row is given a new paperid so that
        a duplicate does not get overwritten;
      * doi/time/title are fetched with the row list in one query instead of
        through a per-pair query that lacked its table name;
      * the cursor is actually closed (``cur.close`` was never called), also
        on error.

    NOTE(review): the original comment says this one-off job has already
    been run.
    """
    conn, cur = getCursor()
    try:
        eidList = getResultList('select id from dlurl1', 'id', cur)
        for eid in eidList:
            papers = getResult(
                'select id,doi,time,title from paper where eid=' + str(eid),
                cur)
            print('start ' + str(eid))
            assigned = [False] * len(papers)
            for i, first in enumerate(papers):
                if assigned[i]:
                    # Already merged into an earlier paper; keep that id.
                    continue
                cur.execute('update paper set paperid=' + str(i) +
                            ' where id=' + str(first['id']))
                conn.commit()
                assigned[i] = True
                for j in range(i + 1, len(papers)):
                    if assigned[j]:
                        continue
                    other = papers[j]
                    sameDoi = first['doi'] == other['doi']
                    sameYearTitle = (first['time'] == other['time']
                                     and first['title'] == other['title'])
                    if sameDoi or sameYearTitle:
                        assigned[j] = True
                        cur.execute('update paper set paperid=' + str(i) +
                                    ' where id=' + str(other['id']))
                        conn.commit()
    finally:
        cur.close()
        conn.close()
def buidPaperNet():
    """Build the paper co-occurrence network between all experts in ``idList``.

    For every expert the ``id`` -> ``time`` mapping of its ``paper`` rows is
    loaded. For every expert pair (i < j) the ids present in both mappings
    are inserted into ``papernet`` (xid, yid, perid, year), counted in
    ``paperNet[i][j]``, stored in ``paperNetYear[i][j]`` and added as an edge
    to a networkx graph. The three structures are serialised at the end.

    Fixes relative to the previous version:
      * the per-expert query used ``str()`` with no argument, producing
        ``... where eid=``; it now uses the expert id;
      * the insert statement was built but never executed.

    NOTE(review): the intersection is keyed by ``paper.id``; confirm whether
    a shared key such as ``paperid`` was intended, since row ids may never
    overlap between experts.
    """
    paperNet = readSeriz(expertNet_pickle)  # serves as the initial template
    paperNetYear = readSeriz(expertNet_pickle)  # serves as the initial template
    paperGraph = nx.Graph()

    # Pre-load one (id -> time dict, id set) tuple per expert.
    totalSet = []
    for expertId in idList:
        xResult = getResult(
            'select id,time from paper where eid=' + str(expertId), cur)
        xDict = {x['id']: x['time'] for x in xResult}
        totalSet.append((xDict, set(xDict)))

    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            coauthorList = list(totalSet[i][1] & totalSet[j][1])
            if not coauthorList:
                continue
            # Persist every shared paper of this pair.
            for cid in coauthorList:
                year = totalSet[i][0][cid]
                insertSQL = ('insert into papernet (xid,yid,perid,year) values(' +
                             str(idList[i]) + ',' + str(idList[j]) + ',' +
                             str(cid) + ',' + str(year) + ')')
                cur.execute(insertSQL)
            conn.commit()
            paperNet[i][j] = len(coauthorList)
            paperNetYear[i][j] = coauthorList
            paperGraph.add_edge(i, j, coauthoryear=coauthorList)

    constructSeriz(paperNet_pickle, paperNet)
    constructSeriz(paperNetYear_pickle, paperNetYear)
    constructSeriz(paperGraph_pickle, paperGraph)
def cleanRedunOthers():
    """Score institution similarity for a batch of candidate pairs.

    Runs on top of the name-based de-duplication: rows of ``new_table`` with
    ``tag=60`` and ``id>74331`` (at most 30000) are read, ``compareInstit``
    is evaluated for each ``xid``/``yid`` pair and the result is handed to
    ``insertMySQLInstitu`` as a dict with the string values ``institu`` and
    ``id``. The same dict object is reused for every pair.
    """
    pairRows = getResult(
        'select id,xid,yid from new_table where tag=60 and id>74331 limit 30000',
        cur)
    valueSet = {}
    for pairRow in pairRows:
        print('now compare id: ' + str(pairRow['id']))
        # Institution comparison; per the original note, pairs whose
        # institution flag is 0 are additionally checked for containment of
        # the institution names.
        valueSet['institu'] = str(compareInstit(pairRow['xid'], pairRow['yid']))
        valueSet['id'] = str(pairRow['id'])
        insertMySQLInstitu(valueSet)
        print(str(valueSet['institu']))
def cleanTopic():
    """Replace every word of each stored topic by its WordNet base form.

    Each ``topic`` value is split on whitespace; every word is replaced by
    ``wn.morphy(word)`` when that returns a value and kept unchanged when it
    returns ``None``. The words are re-joined with single spaces and written
    back, one committed update per row.

    Changes relative to the previous version:
      * the update is parameterised, so topics containing a double quote no
        longer break the statement (assumes the cursor uses the ``%s``
        DB-API paramstyle, as the MySQL drivers do -- confirm);
      * the ``None`` result of ``wn.morphy`` is tested explicitly instead of
        relying on a broad ``except Exception`` around the concatenation.
    """
    selectResult = getResult('select id,topic from topic', cur)
    print('read completed')
    total = len(selectResult)
    updateSQL = 'update topic set topic=%s where id=%s'
    for i, sr in enumerate(selectResult, start=1):
        words = []
        for word in sr['topic'].split():
            base = wn.morphy(word)
            words.append(word if base is None else base)
        params = (' '.join(words), sr['id'])
        print(updateSQL, params)
        cur.execute(updateSQL, params)
        conn.commit()
        print('completed: ' + str(round(i / total, 3)) + ' ||  id:' +
              str(sr['id']))
def mainFunction():
    """Fill the work queues and start five page workers.

    User agents and proxy addresses returned by ``getHttpUa`` are put on
    ``uaQueue`` and ``ipQueue``; five daemon ``pageWorker`` threads are
    started; finally every row returned for the module-level ``selectSQL`` is
    put on ``dlQueue``.

    The cursor and connection obtained here are now closed once the rows have
    been queued (they were previously left open).
    """
    http, uag = getHttpUa()

    for ua in uag:
        uaQueue.put(ua)

    for ip in http:
        ipQueue.put(ip)

    for _ in range(5):
        pWorker = pageWorker(uaQueue, ipQueue, dlQueue)
        pWorker.daemon = True
        pWorker.start()

    conn, cur = getCursor()
    try:
        # An earlier variant read the list from a CSV file (expertList_path)
        # with csv.DictReader instead of the database.
        dlList = getResult(selectSQL, cur)
        for dl in dlList:
            dlQueue.put(dl)
    finally:
        cur.close()
        conn.close()
def findPage():
    """Fetch the colleague page of every selected record and store its links.

    For each row returned for ``sltCollNotNull``: optionally switch proxy and
    header (when ``ChangeOrNot()`` equals ``True``), sleep a random 1-12
    seconds, download the URL in the row's ``colleage`` field, and -- unless
    the page text is exactly a single space -- pass every entry returned by
    ``analysisPage`` to ``addInfo``.
    """
    http, ua = getHttpUa()
    conn, cur = getCursor()
    for dl in getResult(sltCollNotNull, cur):
        if ChangeOrNot() == True:
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))

        html = str(getPage(dl['colleage']))  # download the colleague URL
        if html == ' ':
            continue
        for nl in analysisPage(html):
            addInfo(conn, cur, nl)
        print('Now is ' + str(dl['id']))

    cur.close()
    conn.close()
def prepareNULL():
    """Bucket the papers listed in the null-DOI pickle by year and title.

    Every id read from ``nullDOI_pickle`` is looked up in ``paper``. Rows with
    an empty title or a non-positive ``time`` are skipped. The remaining rows
    are appended as ``(id, time, title)`` to ``yearTitle[yidx][nidx]``, where
    ``yidx`` is the position of the year in ``yearList`` (in order of first
    appearance) and ``nidx`` comes from ``assignNum(title)``. The grid has 60
    year slots and 27 title slots. ``yearList`` and every non-empty bucket are
    serialised at the end.
    """
    nullDOI = readSeriz(nullDOI_pickle)
    yearTitle = [[[] for _ in range(27)] for _ in range(60)]
    yearList = []
    num = 0
    for nid in nullDOI:
        row = getResult('select time,title from paper where id=' + str(nid),
                        cur)[0]
        if not len(row['title']) > 0 or not row['time'] > 0:
            continue
        if row['time'] not in yearList:
            yearList.append(row['time'])
        yidx = yearList.index(row['time'])
        nidx = assignNum(row['title'])
        yearTitle[yidx][nidx].append((nid, row['time'], row['title']))
        print('now is ' + str(num) + ' id is ' + str(nid))
        num += 1
    print('begin saving')

    constructSeriz(yearList_pickle_null, yearList)
    for yidx in range(len(yearList)):
        for nidx, bucket in enumerate(yearTitle[yidx]):
            path = idyeartitle_path_null + str(yidx) + '_' + str(nidx) + '.pickle'
            if len(bucket) > 0:
                constructSeriz(path, bucket)
def mainFunction():
    """Fill the queues and start the analysis, page and MySQL workers.

    Order of operations: proxies and user agents are queued, one daemon
    ``analysisWorker`` and four daemon ``pageWorker`` threads are started,
    the rows returned for ``sltDLNotCom`` are put on ``dlQueue`` (the database
    handles are closed afterwards), and one daemon ``mysqlWorker`` is started.
    ``ok1``/``ok2``/``ok3`` are printed after the respective stages.
    """
    http, uag = getHttpUa()

    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)

    aWorker = analysisWorker(htmlQueue, infoQueue)
    aWorker.daemon = True
    aWorker.start()
    print('ok1')

    pageWorkerCount = 4
    for _ in range(pageWorkerCount):
        pWorker = pageWorker(ipQueue, uaQueue, dlQueue, htmlQueue)
        pWorker.daemon = True
        pWorker.start()
    print('ok2')

    conn, cur = getCursor()
    # Per the original note, the result is a 2-D array of URL records.
    for dl in getResult(sltDLNotCom, cur):
        dlQueue.put(dl)
    cur.close()
    conn.close()
    print('ok3')

    mWorker = mysqlWorker(infoQueue)
    mWorker.daemon = True
    mWorker.start()
# Example #14
# 0
def prepareInstitution():
    """Build the ``institution`` table and link ``experience1`` rows to it.

    The distinct ``institution`` values of ``experience1`` are numbered from
    0 and inserted into ``institution``; afterwards ``experience1.tem`` of
    every row is set to the number of its institution. Every statement is
    committed on its own.

    Changes relative to the previous version:
      * the number is looked up in a dict instead of ``list.index`` behind an
        ``if`` that could leave ``idx`` undefined or stale;
      * the insert is parameterised, so institution names containing a double
        quote no longer break the statement (assumes the ``%s`` DB-API
        paramstyle of the MySQL drivers -- confirm);
      * cursor and connection are closed even when a statement fails.

    NOTE(review): the column name ``instutition`` is kept exactly as in the
    original statement; confirm it matches the table definition. Per the
    original note, ``institution.id`` must not be auto-increment.
    """
    conn, cur = getCursor()
    try:
        institResult = getResult('select institution,id from experience1', cur)

        print('get ready')
        institList = list({tr['institution'] for tr in institResult})
        institIndex = {name: i for i, name in enumerate(institList)}

        print('begin insert')
        insertSQL = 'insert into institution (id, instutition) values (%s, %s)'
        for i, name in enumerate(institList):
            cur.execute(insertSQL, (i, name))
            conn.commit()
        print('complete1')

        for ir in institResult:
            idx = institIndex[ir['institution']]
            updateSQL = 'update experience1 set tem=' + str(
                idx) + ' where id=' + str(ir['id'])
            cur.execute(updateSQL)
            conn.commit()
        print('complete2')
    finally:
        cur.close()
        conn.close()
def buildTopic():
    """Build the topic-similarity network between all experts in ``idList``.

    For every expert the ``other`` -> ``num`` mapping of its ``topic`` rows is
    loaded. For every expert pair (i < j) with at least one shared ``other``
    value, ``calculateTopic`` is called on the shared values, the score is
    inserted into ``topicnet`` (xid, yid, score) and recorded in
    ``topicNet``, ``topicNetMore`` and a networkx graph. The three structures
    are serialised at the end.

    Fixes relative to the previous version:
      * the query now also selects ``num``, which the loop reads from every
        row (assumes ``topic`` has a ``num`` column -- confirm);
      * ``xid``/``yid`` used in the insert were undefined; they are now the
        string forms of the two expert ids;
      * the graph object was called like a function; the edge is now added
        with ``add_edge``.

    NOTE(review): per the original comment, ``other`` is not populated yet and
    is meant to act as a topic id.
    """
    topicNet = readSeriz(expertNet_pickle)
    topicNetMore = readSeriz(expertNet_pickle)
    topicGraph = nx.Graph()

    # Pre-load one (other -> num dict, key set) tuple per expert.
    totalSet = []
    for expertId in idList:
        xResult = getResult(
            'select other,num from topic where eid=' + str(expertId), cur)
        xDict = {x['other']: x['num'] for x in xResult}
        totalSet.append((xDict, set(xDict)))

    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            topicList = list(totalSet[i][1] & totalSet[j][1])
            if not topicList:
                continue
            score = calculateTopic(topicList)
            xid = str(idList[i])
            yid = str(idList[j])
            insertSQL = ('insert into topicnet (xid,yid,score) values(' +
                         xid + ',' + yid + ',' + str(score) + ')')
            cur.execute(insertSQL)
            conn.commit()
            topicNet[i][j] = score  # keep an in-memory record as well
            topicGraph.add_edge(i, j, similar=score)
            topicNetMore[i][j] = topicList

    constructSeriz(topicNet_pickle, topicNet)
    constructSeriz(topicGraph_pickle, topicGraph)
    constructSeriz(topicNetMore_pickle, topicNetMore)
def linkP2D():
    """Set ``dlurl1.tem`` to 3 for every expert id returned by ``selectSQL``.

    Each update is committed on its own and reported on stdout. The cursor
    and connection opened here are now closed when the loop ends or fails
    (they were previously left open).
    """
    conn, cur = getCursor()
    try:
        resuID = getResult(selectSQL, cur)
        for re in resuID:
            updateSQL = 'update dlurl1 set tem=3 where id=' + str(re['eid'])
            cur.execute(updateSQL)
            conn.commit()
            print('complete: ' + str(re['eid']))
    finally:
        cur.close()
        conn.close()
def cleanUrlMySQL():
    """Split the stored URLs and mark the rows that were processed.

    For every row returned for ``sltCollNotNull`` the ``url`` field is passed
    to ``analysisRecord`` and the resulting ``(url, userid)`` to ``addInfo``.
    When ``addInfo`` returns 1, ``dlurl1.tem`` is set to 1 for that row.

    The cursor and connection opened here are now closed when the loop ends
    or fails (they were previously left open).
    """
    conn, cur = getCursor()  # database connection and cursor
    try:
        dlList = getResult(sltCollNotNull, cur)
        for dl in dlList:
            url, userid = analysisRecord(dl['url'])
            result = addInfo(url, userid, dl['id'], cur, conn)
            if result == 1:
                # Mark the row as handled.
                cur.execute('update dlurl1 set tem=1 where id=' +
                            str(dl['id']))
                conn.commit()
                print('Now is ' + str(dl['id']))
    finally:
        cur.close()
        conn.close()
def comparePaper(xID, yID):
    """Return the ratio of ``paperid`` values two experts have in common.

    Both experts' ``paperid`` rows are passed to ``compareLen`` (presumably
    it returns the shorter list first -- confirm against its definition).
    Each entry of the first list counts once if any entry of the second list
    has the same ``paperid``.

    Returns:
        -1 when either expert has no paper rows, otherwise
        ``matches / len(first list)`` rounded to four decimals.
    """
    query = 'select paperid from paper where eid='
    xRows = getResult(query + str(xID), cur)
    yRows = getResult(query + str(yID), cur)
    if 0 in (len(xRows), len(yRows)):
        return -1

    shortRows, longRows = compareLen(xRows, yRows)
    matched = sum(
        1 for shortRow in shortRows
        if any(shortRow['paperid'] == longRow['paperid']
               for longRow in longRows))
    return round(matched / len(shortRows), 4)
def compareInstit2(xID, yID):
    """Return the ratio of equal ``tem`` rows of two experts in ``experience1``.

    Rows are compared as whole result rows. Both lists are passed to
    ``compareLen`` (presumably it returns the shorter list first -- confirm
    against its definition); each entry of the first list counts once if an
    equal entry exists in the second list.

    Returns:
        -1 when either expert has no rows (so callers can tell "missing"
        from "no overlap"), otherwise ``matches / len(first list)``.
    """
    query = 'select tem from experience1 where eid='
    xRows = getResult(query + str(xID), cur)
    yRows = getResult(query + str(yID), cur)
    if 0 in (len(xRows), len(yRows)):
        return -1

    shortRows, longRows = compareLen(xRows, yRows)
    matched = sum(
        1 for shortRow in shortRows
        if any(shortRow == longRow for longRow in longRows))
    return matched / len(shortRows)
def compareInstit(xID, yID):
    """Return the ratio of institution names shared by two experts.

    Two institution strings match when one is contained in the other: the
    pair is ordered with ``compareLen`` and the first value is tested with
    ``in`` against the second. The row lists themselves are ordered with
    ``compareLen`` too (presumably shorter first -- confirm against its
    definition); each entry of the first list counts once if any entry of
    the second list matches it.

    Returns:
        -1 when either expert has no ``experience1`` rows, otherwise
        ``matches / len(second list)``.

    NOTE(review): the sibling compare functions divide by the length of the
    first list; confirm that dividing by the second list is intended here.
    """
    query = 'select institution from experience1 where eid='
    xRows = getResult(query + str(xID), cur)
    yRows = getResult(query + str(yID), cur)
    if 0 in (len(xRows), len(yRows)):
        return -1

    shortRows, longRows = compareLen(xRows, yRows)

    def contained(shortRow, longRow):
        # Order the two names, then test containment.
        shortI, longI = compareLen(shortRow['institution'],
                                   longRow['institution'])
        return shortI in longI

    matched = sum(
        1 for shortRow in shortRows
        if any(contained(shortRow, longRow) for longRow in longRows))
    return matched / len(longRows)
def compareTopic(xID, yID):
    """Return the ratio of topics shared by two experts.

    Two topic strings match when one is contained in the other: the pair is
    ordered with ``compareLen`` and the first value is tested with ``in``
    against the second. The row lists are ordered with ``compareLen`` as well
    (presumably shorter first -- confirm against its definition); each entry
    of the first list counts once if any entry of the second list matches.

    Returns:
        -1 when either expert has no ``topic`` rows, otherwise
        ``matches / len(first list)``.
    """
    query = 'select topic from topic where eid='
    xRows = getResult(query + str(xID), cur)
    yRows = getResult(query + str(yID), cur)
    if 0 in (len(xRows), len(yRows)):
        return -1

    shortRows, longRows = compareLen(xRows, yRows)

    def contained(shortRow, longRow):
        # Containment counts as "same topic".
        sTopic, lTopic = compareLen(shortRow['topic'], longRow['topic'])
        return sTopic in lTopic

    matched = sum(
        1 for shortRow in shortRows
        if any(contained(shortRow, longRow) for longRow in longRows))
    return matched / len(shortRows)
def countInstituXXXX():
    """Store institution scores for every ``name2name`` row.

    Rows whose ``gid`` equals their ``yid`` are skipped. For every other row
    ``compareInstit(gid, yid)`` is evaluated and inserted into
    ``name2compare`` with the fixed values ``nameRate=1.0`` and ``tag=100``;
    each insert is committed immediately.
    """
    rows = getResult('select id,gid,yid from name2name', cur)
    print('read completed')
    insertHead = 'insert into name2compare (xid,yid,nameRate,tag,institu) values ('
    for row in rows:
        gid, yid = row['gid'], row['yid']
        if gid == yid:
            continue
        value = compareInstit(gid, yid)
        cur.execute(insertHead + str(gid) + ',' + str(yid) + ',1.0,100,' +
                    str(value) + ')')
        conn.commit()
        print('completed: ' + str(row['id']))
# Example #23
# 0
def addInstitution(instInfo, cur, conn):
    """Insert the given institutions for the expert with the highest ``eid``.

    The expert is the first row of ``expert`` ordered by ``eid`` descending.
    One ``experience`` row is inserted and committed per entry of
    ``instInfo``. A failing insert is reported on stdout and the remaining
    entries are still processed.

    Changes relative to the previous version:
      * the insert is parameterised, so institution names containing a double
        quote no longer fail (assumes the ``%s`` DB-API paramstyle of the
        MySQL drivers -- confirm);
      * the error line now includes the parameters and the exception.

    Args:
        instInfo: iterable of institution names.
        cur: database cursor used for all statements.
        conn: connection used to commit each insert.
    """
    result = getResult('select * from expert order by eid desc limit 1', cur)
    eid = result[0]['eid']

    insertSQL = 'insert into experience (eid,institution) values(%s, %s)'
    for inst in instInfo:
        try:
            cur.execute(insertSQL, (eid, inst))
            conn.commit()
        except Exception as err:
            # Deliberately best-effort: report and continue with the next one.
            print('error:' + insertSQL + ' ' + repr((eid, inst)) + ' ' +
                  repr(err))
# Example #24
# 0
def linkP2D():
    """Set ``dlurl1.status`` to 2 for every expert id returned by ``selectSQL``.

    Each update is committed on its own. A failing update is reported on
    stdout and the loop continues with the next id. Cursor and connection are
    closed after the loop.
    """
    conn, cur = getCursor()
    for row in getResult(selectSQL, cur):
        eidText = str(row['eid'])
        try:
            cur.execute('update dlurl1 set status=2 where id=' + eidText)
            conn.commit()
            print('complete: ' + eidText)
        except Exception:
            print('error: ' + eidText)

    cur.close()
    conn.close()
# Example #25
# 0
def preparePublication():
    """Number the publications and write the numbers to ``paper.paperid``.

    Every ``pid`` of ``publication`` is read; the row at position ``i`` causes
    ``paper.paperid`` to be set to ``i`` for all paper rows with that ``pid``.
    Each update is committed on its own.

    An earlier, disabled variant first filled ``publication`` from the
    distinct pids and then updated ``paper`` through an id lookup; per the
    original note, ``publication.id`` must not be auto-increment.
    """
    conn, cur = getCursor()
    paperResult = getResult('select pid from publication', cur)
    print('read completed')

    resumeFrom = -1  # positions below this are skipped; -1 skips nothing
    for i, row in enumerate(paperResult):
        if i < resumeFrom:
            continue
        pid = row['pid']
        print(str(pid))
        cur.execute('update paper set paperid=' + str(i) + ' where pid="' +
                    pid + '"')
        conn.commit()
        print('completed: ' + str(i))

    cur.close()
    conn.close()
def combine():
    """Run ``updateCombine`` for every pair selected from ``name2compare``.

    The active selection picks pairs with ``institu`` between -1 and 0.1
    (exclusive), ``paper>=0`` and ``coauthor>0``. Progress and the outcome of
    each ``updateCombine`` call are printed.
    """
    # Earlier selections, kept for reference:
    # 'select id,xid,yid from name2compare where institu>0.5 and nameRate>0.75'
    # 'select id,xid,yid from name2compare where institu>-1 and institu<0.5 and paper>0.2 and nameRate>0.6'
    # 'select id,xid,yid from name2compare where paper<0.2 and paper>0 and coauthor >0.01 and nameRate>0.6'
    # 'select id,xid,yid from name2compare where coauthor<0.01 and topic>0.06 and nameRate>0.6'
    selectSQL = 'select id,xid,yid from name2compare where institu<0.1 and institu>-1 and paper>=0 and coauthor>0'

    pairRows = getResult(selectSQL, cur)
    total = len(pairRows)
    for i, pairRow in enumerate(pairRows, start=1):
        if updateCombine(pairRow['xid'], pairRow['yid']):
            label = 'completed: '
        else:
            label = 'somewhere error: '
        print(label + str(round(i / total, 3)) + ' ||  id:' +
              str(pairRow['id']))
# Example #27
# 0
def getID(html):
    """Return the ``dlurl1`` id of the profile whose URL is embedded in *html*.

    The URL is the text between ``<![CDATA[`` and the following
    ``]]></fullpath>``. It is split with ``extractUserID`` and looked up among
    the ``dlurl1`` rows whose ``status`` is not 2.

    Changes relative to the previous version: a marker at position 0 is now
    recognised (the test was ``start > 0``), the closing tag is searched after
    the opening marker, and a missing closing tag yields -1 instead of a
    wrongly sliced URL.

    Returns:
        The id as int when exactly one row matches; -1 when a marker is
        missing or the number of matching rows is not one (``exist`` is
        printed in the latter case).
    """
    eid = -1  # default: not found
    indx = '<![CDATA['
    start = html.find(indx)
    if start == -1:
        return eid
    urlStart = start + len(indx)
    end = html.find(']]></fullpath>', urlStart)
    if end == -1:
        return eid

    subjectURL = html[urlStart:end]
    url, userid = extractUserID(subjectURL)  # split the address from the URL
    # Look the URL up in the database.
    selectSQL = 'select t.id from (select id,url from dlurl1 where status<>2) t where t.url="' + url + '"'
    result = getResult(selectSQL, cur)

    if len(result) == 1:
        eid = int(result[0]['id'])
    else:
        print('exist')
    return eid
def getID(html):
    """Return the ``dlurl1`` ids whose ``subject`` equals the URL in *html*.

    The URL is the text between ``<![CDATA[`` and the following
    ``]]></fullpath>``. It is compared with the ``subject`` column of the
    ``dlurl1`` rows whose ``status`` is not 2.

    Changes relative to the previous version: a marker at position 0 is now
    recognised (the test was ``start > 0``), the closing tag is searched after
    the opening marker, and a missing closing tag yields -1 instead of a
    wrongly sliced URL.

    Returns:
        A list of matching ids (possibly empty) when both markers are
        present, otherwise the int -1.
    """
    eid = -1  # default: markers not found
    indx = '<![CDATA['
    start = html.find(indx)
    if start == -1:
        return eid
    urlStart = start + len(indx)
    end = html.find(']]></fullpath>', urlStart)
    if end == -1:
        return eid

    subjectURL = html[urlStart:end]
    # Look the subject URL up in the database.
    selectSQL = 'select tem.id from (select id,subject from dlurl1 where status<>2) tem where tem.subject="' + subjectURL + '"'
    result = getResult(selectSQL, cur)
    eid = [r['id'] for r in result]
    print('len len is ' + str(len(eid)))
    return eid
def analysisNull():
    """Bucket all papers that have a ``paperid`` by year and title.

    Rows of ``paper`` with a non-null ``paperid`` are read. Rows with an empty
    title or a non-positive ``time`` are skipped. The remaining rows are
    appended as ``(id, time, title)`` to ``yearTitle[yidx][nidx]``, where
    ``yidx`` is the position of the year in ``yearList`` (in order of first
    appearance) and ``nidx`` comes from ``assignNum(title)``. The grid has
    100 year slots and 27 title slots. ``yearList`` and every bucket of a
    seen year -- empty ones included -- are serialised at the end.

    A disabled earlier step reset ``paperid`` to null for the ids stored in
    ``nullDOI_pickle``.
    """
    yearTitle = [[[] for _ in range(27)] for _ in range(100)]
    yearList = []

    rows = getResult(
        'select id,time,title from paper where paperid is not null', cur)
    print('read completed, the total is ' + str(len(rows)))
    num = 0
    for row in rows:
        if not len(row['title']) > 0 or not row['time'] > 0:
            continue
        if row['time'] not in yearList:
            yearList.append(row['time'])
        yidx = yearList.index(row['time'])
        nidx = assignNum(row['title'])

        yearTitle[yidx][nidx].append((row['id'], row['time'], row['title']))
        print('now is ' + str(num) + ' id is ' + str(row['id']))
        num += 1

    print('begin saving')
    constructSeriz(yearList_pickle, yearList)
    for yidx in range(len(yearList)):
        for nidx, bucket in enumerate(yearTitle[yidx]):
            path = idyeartitle_path + str(yidx) + '_' + str(nidx) + '.pickle'
            constructSeriz(path, bucket if len(bucket) > 0 else [])
def dlInfo(html, soup):
    """Write id, name and advisor CSV URL of the profile embedded in *html*.

    The profile URL is the text between ``<![CDATA[`` and the following
    ``]]></fullpath>``. It is split with ``extractUserID`` and looked up in
    ``dlurl1`` by ``userid`` and ``url``. When exactly one row matches, the
    value of ``getCsvUrl`` is written together with the row's id and name via
    ``writeMetrx``; otherwise ``error or exist`` is printed.

    Changes relative to the previous version: a marker at position 0 is now
    recognised (the test was ``start > 0``), the closing tag is searched after
    the opening marker, a missing closing tag makes the function return
    without a lookup, and the local no longer shadows the builtin ``id``.

    Args:
        html: page source as a string.
        soup: parsed page, passed through to ``getCsvUrl``.
    """
    indx = '<![CDATA['
    start = html.find(indx)
    if start == -1:
        return
    urlStart = start + len(indx)
    end = html.find(']]></fullpath>', urlStart)
    if end == -1:
        return

    url, userid = extractUserID(html[urlStart:end])
    selectSQL = 'select t.id,t.name from (select id,name,url from dlurl1 where userid=' + str(
        userid) + ') t where t.url="' + url + '"'
    result = getResult(selectSQL, cur)
    if len(result) == 1:
        expertId = int(result[0]['id'])
        name = result[0]['name']
        advisorcsv = getCsvUrl(soup, html, expertId)
        # writeMetrx is given a 2-D list: one row with three columns.
        writeMetrx(writePath, [[expertId, name, advisorcsv]])
        print('complete:' + str(expertId))
    else:
        print('error or exist')