示例#1
0
def prepareInstitution():
    """Build the `institution` lookup table from `experience1` and
    back-fill experience1.tem with each row's institution id.

    NOTE(review): relies on `institution.id` NOT being auto-increment
    (per the original comment) so ids match list positions used below.
    """
    conn, cur = getCursor()
    institResult = getResult('select institution,id from experience1', cur)

    print('get ready')
    # De-duplicate institution names; each name's list position becomes its id.
    institList = list({tr['institution'] for tr in institResult})
    print('begin insert')
    for instId, name in enumerate(institList):
        # Parameterized insert (same %s style used elsewhere in this file)
        # instead of string concatenation, which broke on names containing
        # double quotes.  Column name kept as-is — 'instutition' looks like
        # a schema typo; confirm against the actual table before renaming.
        cur.execute(
            'insert into institution (id, instutition) values (%s, %s)',
            (instId, name))
        conn.commit()
    print('complete1')
    # O(1) dict lookup replaces list.index() (O(n) per row).
    idByName = {name: i for i, name in enumerate(institList)}
    for ir in institResult:
        cur.execute('update experience1 set tem=%s where id=%s',
                    (idByName[ir['institution']], ir['id']))
        conn.commit()
    print('complete2')
    cur.close()
    conn.close()
def mainFunction():
    """Prime the proxy and user-agent queues, start the worker threads,
    then enqueue the pending dl records for processing."""
    proxies, agents = getHttpUa()

    for proxy in proxies:
        ipQueue.put(proxy)
    for agent in agents:
        uaQueue.put(agent)

    # One analysis worker.
    for _ in range(1):
        worker = analysisWorker(htmlQueue, infoQueue)
        worker.daemon = True
        worker.start()
    print('ok1')

    # Four page-fetching workers.
    for _ in range(4):
        worker = pageWorker(ipQueue, uaQueue, dlQueue, htmlQueue)
        worker.daemon = True
        worker.start()
    print('ok2')

    conn, cur = getCursor()
    # 2-D array of url records still to be completed.
    records = getResult(sltDLNotCom, cur)
    for record in records:
        dlQueue.put(record)
    cur.close()
    conn.close()
    print('ok3')

    # One MySQL writer worker.
    for _ in range(1):
        worker = mysqlWorker(infoQueue)
        worker.daemon = True
        worker.start()
def findPage():
    """Fetch each pending colleague page and store the extracted
    name/link pairs via addInfo.

    FIX: the inner loop body (addInfo call) mixed tabs with the
    surrounding spaces, which raises TabError under Python 3; the
    indentation is now uniform spaces.
    """
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        if ChangeOrNot():  # randomly rotate proxy / user agent
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))  # random pause between requests

        html = str(getPage(dl['colleage']))  # fetch the colleague-page url
        if html != ' ':  # getPage appears to signal failure with ' ' — confirm
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn, cur, nl)
            print('Now is ' + str(dl['id']))

    cur.close()
    conn.close()
def mainFunction():
    """Fill the UA/proxy queues, spin up five page workers, and enqueue
    every record returned by `selectSQL`."""
    proxies, agents = getHttpUa()

    for agent in agents:
        uaQueue.put(agent)

    for proxy in proxies:
        ipQueue.put(proxy)

    workers = [pageWorker(uaQueue, ipQueue, dlQueue) for _ in range(5)]
    for worker in workers:
        worker.daemon = True
        worker.start()

    conn, cur = getCursor()

    dlList = getResult(selectSQL, cur)
    '''
    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dlList.append(row)
    '''

    for record in dlList:
        dlQueue.put(record)
def linkP2D():
    """Set tem=3 on the dlurl1 row matching each result's eid."""
    conn, cur = getCursor()
    rows = getResult(selectSQL, cur)
    for row in rows:
        cur.execute('update dlurl1 set tem=3 where id=' + str(row['eid']))
        conn.commit()
        print('complete: ' + str(row['eid']))
def checkSame(url, userID):
    """Return True if `url` is already stored in dlurl1 for `userID`.

    FIXES: the original concatenated `userID` directly into the SQL
    string, raising TypeError for non-str ids (the other version of
    this helper at the bottom of the file already wraps it in str());
    the cursor and connection were also never closed.
    """
    conn, cur = getCursor()
    try:
        # Parameterized query — same %s binding style used elsewhere in
        # this file; also avoids injection via userID.
        number = cur.execute(
            'select id,url from dlurl1 where userid=%s', (userID,))
        if number > 0:  # at least one record for this user
            for c in cur.fetchall():  # scan every stored url
                if c['url'] == url:
                    return True
        return False
    finally:
        cur.close()
        conn.close()
示例#7
0
def mainFunction():
    """Run insertPaperSQL on every file found under files_path."""
    conn, cur = getCursor()
    for path in readFiles(files_path):
        insertPaperSQL(path)

    cur.close()
    conn.close()
示例#8
0
def cleanPaper():
    """Assign sequential paperid values per expert (eid) in `paper`,
    giving duplicate papers (same doi, or same time+title) the id of
    their first occurrence.

    Fixes over the original:
      * pidList is now created per eid (it was never initialized or
        reset, so rows leaked across experts / raised NameError);
      * the duplicate-check SELECTs were missing the table name
        ('select ... from where id=...') and could never succeed —
        'paper' restored, matching the UPDATE targets;
      * the "seen" flags were compared (`== True`) instead of assigned
        (`= True`), so duplicates were never actually marked;
      * `cur.close` was referenced but never called;
      * row i's fields are fetched once instead of on every inner pass.
    """
    conn, cur = getCursor()
    eidList = getResultList('select id from dlurl1', 'id', cur)

    for eid in eidList:
        idPidList = getResult('select id from paper where eid=' + str(eid), cur)
        # [row id, already-merged flag] — fresh for every expert.
        pidList = [[ip['id'], False] for ip in idPidList]
        print('start ' + str(eid))
        for i in range(len(pidList)):
            # Give row i its own paperid.
            cur.execute('update paper set paperid=' + str(i) +
                        ' where id=' + str(pidList[i][0]))
            conn.commit()
            if pidList[i][1]:
                continue  # already merged into an earlier paper
            iR = getResult('select doi,time,title from paper where id=' +
                           str(pidList[i][0]), cur)
            for j in range(i + 1, len(pidList)):
                if pidList[j][1]:
                    continue
                jR = getResult('select doi,time,title from paper where id=' +
                               str(pidList[j][0]), cur)
                # Same doi, or same publication time and title → duplicate.
                if (iR[0]['doi'] == jR[0]['doi'] or
                        (iR[0]['time'] == jR[0]['time'] and
                         iR[0]['title'] == jR[0]['title'])):
                    pidList[j][1] = True
                    cur.execute('update paper set paperid=' + str(i) +
                                ' where id=' + str(pidList[j][0]))
                    conn.commit()
            pidList[i][1] = True
    cur.close()
    conn.close()
def cleanUrlMySQL():
    """Parse each pending dlurl1 url, store the extracted info, and
    mark successfully handled rows with tem=1."""
    conn, cur = getCursor()  # database connection and cursor
    for record in getResult(sltCollNotNull, cur):
        url, userid = analysisRecord(record['url'])
        if addInfo(url, userid, record['id'], cur, conn) == 1:
            # Flag the row as processed.
            cur.execute('update dlurl1 set tem=1 where id=' +
                        str(record['id']))
            conn.commit()
            print('Now is ' + str(record['id']))
def mainFunction():
    """Replay the SQL statements saved in insertSQL.csv against the
    database, printing progress and logging any statement that fails."""
    sqlList = readList('E:/Code/Data/insertSQL.csv')
    conn, cur = getCursor()  # database connection and cursor
    print('total is ' + str(len(sqlList)))
    for i, statement in enumerate(sqlList):
        try:
            cur.execute(statement)
            conn.commit()
            print('now is ' + str(i))
        except Exception:
            print('error' + statement)
    cur.close()
    conn.close()
示例#11
0
def linkP2D():
    """Set status=2 on the dlurl1 row matching each result's eid,
    reporting success or failure per row."""
    conn, cur = getCursor()
    for row in getResult(selectSQL, cur):
        sql = 'update dlurl1 set status=2 where id=' + str(row['eid'])
        try:
            cur.execute(sql)
            conn.commit()
            print('complete: ' + str(row['eid']))
        except Exception:
            print('error: ' + str(row['eid']))

    cur.close()
    conn.close()
示例#12
0
def preparePublication():
    """Number every publication: write each pid's position in the
    `publication` result set into paper.paperid (matched on pid).

    Changes from the original: the dead `if i < -1: continue` resume
    hook (never true) is removed, the update is parameterized so pids
    containing double quotes cannot break the statement, and a large
    block of commented-out legacy code was deleted.
    """
    conn, cur = getCursor()
    paperResult = getResult('select pid from publication', cur)
    print('read completed')
    for i, row in enumerate(paperResult):
        print(str(row['pid']))
        # Parameterized update — pid is a string and may contain quotes.
        cur.execute('update paper set paperid=%s where pid=%s',
                    (i, row['pid']))
        conn.commit()
        print('completed: ' + str(i))
    cur.close()
    conn.close()
def addInfo(nl, id):
    """Insert one (name, url, userID) triple into dlurl1.

    Returns True on success; on failure prints the record and returns
    False so the caller can re-queue it.

    FIX: the original contained a stray, empty `with open(''):`
    statement inside the try block — a syntax error that made the
    module unimportable; it has been removed.  The cursor/connection
    cleanup is now in a finally clause instead of being duplicated in
    both branches.
    """
    conn, cur = getCursor()
    insertSQL = ('insert into dlurl1 (name,url,status,tem,userID) values("' +
                 nl[0] + '","' + nl[1] + '",0,1,' + str(nl[2]) + ');')
    try:
        cur.execute(insertSQL)
        conn.commit()
        print('Complete:' + nl[0] + '  id:' + str(id))
        return True
    except Exception:
        print('insert error:')
        print(nl)
        return False
    finally:
        cur.close()
        conn.close()
示例#14
0
def _downloadCSV(url, dl_id, httpProxies, headers):
    """Download `url` and write it to file_path/<dl_id>.csv.
    Propagates requests exceptions to the caller for retry handling."""
    r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
    if r.status_code == 200:
        csv_path = file_path + str(dl_id) + '.csv'
        # 'fh', not 'csv': the original shadowed the csv module name.
        with open(csv_path, 'wb') as fh:
            fh.write(r.content)
            print('Now is ' + str(dl_id))

def findCSV():
    """Download the paper-csv file of every pending dlurl1 record,
    rotating proxy/user-agent identities and retrying once on a
    network error.

    Changes from the original: the identical download-and-save logic
    was duplicated in the retry path — extracted into _downloadCSV —
    and the file handle no longer shadows the `csv` module name.
    """
    httpProxies = commHttpProxies
    headers = commHeaders

    http, ua = getHttpUa()  # candidate proxy / user-agent pools
    conn, cur = getCursor()  # database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # 2-D array of url records

    for dl in dlList:
        if ChangeOrNot():  # randomly triggered identity rotation
            httpProxies = editeProxies(http, httpProxies)  # switch proxy
            headers = editeHeader(ua, headers)  # switch user agent
        time.sleep(random.randint(1, 12))  # random pause

        url = dl['papercsv']
        if url is not None and len(url) > 15:
            try:
                _downloadCSV(url, dl['id'], httpProxies, headers)
            except requests.RequestException as e:
                print(e)
                # Rotate identity, wait, and retry exactly once.
                httpProxies = editeProxies(http, httpProxies)
                headers = editeHeader(ua, headers)
                time.sleep(random.randint(1, 12))
                try:
                    _downloadCSV(url, dl['id'], httpProxies, headers)
                except Exception:
                    print('another try is failed! id:' + str(dl['id']))

    cur.close()
    conn.close()
def checkSame(url, userID):
    """Return True if dlurl1 already holds `url` for `userID`; on any
    query error, err on the side of True (treat as already present)."""
    conn, cur = getCursor()
    try:
        count = cur.execute(
            'select id,url from dlurl1 where userid=' + str(userID))
        found = False
        if count > 0:  # result set is non-empty
            for record in cur.fetchall():  # examine every stored record
                if record['url'] == url:  # same url already saved?
                    found = True
                    break
        cur.close()
        conn.close()
        return found
    except Exception:
        print('check error!:' + url)
        cur.close()
        conn.close()
        return True
 def run(self):
     """Worker loop: pull (record, id) pairs off the queue, persist
     them via addInfo, and clear the in-progress marker on dlurl1.

     FIX: the original read from the module-global `infoQueue` but put
     failed items back onto `self.infoQueue`; the instance queue is
     now used consistently.  The local `id` (shadowing the builtin)
     was renamed.
     """
     while True:
         nl, recId = self.infoQueue.get()
         flag = addInfo(nl, recId)  # True on successful insert
         if not flag:
             self.infoQueue.put((nl, recId))  # re-queue for another try
         else:
             print('mysql:' + str(recId))
         # Clear the in-progress marker regardless of outcome.
         conn, cur = getCursor()
         cur.execute('update dlurl1 set tem=0 where id=' + str(recId))
         conn.commit()
         cur.close()
         conn.close()
         self.infoQueue.task_done()
示例#17
0
def findPage():
    """Iterate the pending dlurl1 records, printing the current
    disguise (proxy / user-agent / cookie) for each one.

    NOTE(review): the actual fetch-analyse-store logic is disabled —
    it lives in the inert triple-quoted block below — so as written
    this function only exercises the identity-rotation code.
    """
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    # Fetch the list of urls to visit from the database and loop over them.
    http, ua = getHttpUa()  # candidate proxy / user-agent pools
    conn, cur = getCursor()  # database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # 2-D array of url records
    #for i in range(15):
    #i = 0
    for dl in dlList:
        # this is test!!!! read from a txt
        #html = readTXT('E:/Code/Test Data/Hsinchun Chen.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')

        if ChangeOrNot() == True:  # randomly triggered identity rotation
            httpProxies = editeProxies(http, httpProxies)  # switch http proxy
            headers = editeHeader(ua, headers, dl['name'])  # switch user agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(2, 12))  # random pause between iterations

        print(str(httpProxies['https']))
        print(str(headers['User-Agent']))
        print(str(headers['Referer']))
        print(str(cookies['CFID']))
        print()
        '''
        html = str(getPage(dl['url'],httpProxies,headers,cookies))#取出url
        
        if html != ' ':
            infoSet = analysisPage(html,int(dl['id']))#分析页面
            addInfo(conn,cur,infoSet,dl)#存入数据库
            cur.execute('update dlurl1 set status=1 where id='+str(dl['id']))#标记已抽取
            conn.commit()
            print('Now is '+str(dl['id']))            
        #break#only run one time
        '''
    cur.close()
    conn.close()
示例#18
0
def prepareTopic():
    """Build the de-duplicated `topiclist` table (topics occurring at
    least twice, with summed counts) and point each `topic` row at its
    topiclist id via the `other` column (-1 for single-occurrence
    topics).

    FIX: the original executed `cur.execute(insertSQL)` OUTSIDE the
    `numList[i] > 1` guard, so a skipped topic either re-ran the
    previous INSERT (duplicate-key error) or raised NameError when the
    very first topic was skipped.  The execute/commit now sit inside
    the guard.
    """
    conn, cur = getCursor()
    topicsResult = getResult('select id,topic,num from topic', cur)
    print('get ready')
    topicList = list({tr['topic'] for tr in topicsResult})
    # Sum occurrence counts per distinct topic.
    numList = [0] * len(topicList)
    print('begin count')
    for tr in topicsResult:
        numList[topicList.index(tr['topic'])] += int(tr['num'])
    print('begin insert')

    for i in range(len(topicList)):
        # ids are NOT auto-increment: position i doubles as the id so
        # the topic.other update below can reuse it (original design).
        if numList[i] > 1:  # ignore topics that occur only once
            insertSQL = ('insert into topiclist (id,topic,num) values (' +
                         str(i) + ',"' + topicList[i] + '",' +
                         str(numList[i]) + ')')
            cur.execute(insertSQL)
            conn.commit()
    print('complete1')
    for eachTopic in topicsResult:
        idx = -1
        if eachTopic['topic'] in topicList:
            idx = topicList.index(eachTopic['topic'])
            if numList[idx] < 2:
                idx = -1  # single-occurrence topic: no topiclist entry
        cur.execute('update topic set other=' + str(idx) + ' where id=' +
                    str(eachTopic['id']))
        conn.commit()
    print('complete2')
    cur.close()
    conn.close()
示例#19
0
def cleanTopic():
    """Delete duplicate topic rows per expert: for each eid, keep the
    first row of every distinct topic string and remove the rest.

    Fixes over the original: `cur.close` was referenced without being
    called (cursor leak); the O(n^2) pairwise duplicate scan is
    replaced by a single pass over the rows with a 'seen' set, which
    collects exactly the same set of later-occurrence ids.
    """
    conn, cur = getCursor()
    eidList = getResultList('select id from dlurl1', 'id', cur)

    for eid in eidList:
        idPidList = getResult(
            'select id,topic from topic where eid=' + str(eid), cur)
        topics = [ip['topic'] for ip in idPidList]
        print('start ' + str(eid))
        if len(set(topics)) == len(topics):  # no duplicates for this eid
            print('not dublicate ' + str(eid))
            continue
        # Keep the first occurrence of each topic; every later repeat's
        # id is collected for deletion.
        seen = set()
        sameEid = []
        for row in idPidList:
            if row['topic'] in seen:
                sameEid.append(row['id'])
            else:
                seen.add(row['topic'])
        for sid in sameEid:
            try:
                cur.execute('delete from topic where id=' + str(sid))
                conn.commit()
                print('complete delete:' + str(sid))
            except Exception:
                print('delete error ' + str(sid))
    cur.close()
    conn.close()
'''
	合并一些可以确定的
	update 表 a1 set a1.d=(select d from ((select d from 表 where k23=3) as a2)) where a1.k23=1;
	(1)所有姓名在0.8以上的,机构在0.5以上的
	'select xid,yid from name2compare where institu>-1 and institu<0.5 and paper>0.2 and nameRate>0.6'
	'select xid,yid from name2compare where institu>-1 and institu<0.5 and paper<0.2 and coauthor>0.01 and nameRate>0.6'
	(2)机构在0.5以下,paper在0.2以上的
	(3)
'''
from tool import getCursor,getResult
# Module-level connection/cursor shared by combine() below.
# NOTE(review): opened at import time and never closed in this file.
conn,cur = getCursor()
#import os;os.chdir('e:/Code/Python');import geng_combine;geng_combine.combine()
def combine():
    """Merge the expert pairs selected by `selectSQL`, printing the
    completed fraction after each pair.  Alternative filters the
    author toggled between are kept commented below."""
    #selectSQL = 'select id,xid,yid from name2compare where institu>0.5 and nameRate>0.75'
    #selectSQL = 'select id,xid,yid from name2compare where institu>-1 and institu<0.5 and paper>0.2 and nameRate>0.6'
    #selectSQL = 'select id,xid,yid from name2compare where paper<0.2 and paper>0 and coauthor >0.01 and nameRate>0.6'
    #selectSQL = 'select id,xid,yid from name2compare where coauthor<0.01 and topic>0.06 and nameRate>0.6'
    selectSQL = 'select id,xid,yid from name2compare where institu<0.1 and institu>-1 and paper>=0 and coauthor>0'

    rows = getResult(selectSQL, cur)
    total = len(rows)
    for position, row in enumerate(rows, start=1):
        ok = updateCombine(row['xid'], row['yid'])
        fraction = str(round(position / total, 3))
        if ok:
            print('completed: ' + fraction + ' ||  id:' + str(row['id']))
        else:
            print('somewhere error: ' + fraction + ' ||  id:' + str(row['id']))
def addInfo(infoSet, dl):
    """Persist one scraped expert profile: update the dlurl1 row's link
    columns, insert the expert, insert each institution, and finally
    mark the dlurl1 row as extracted (status=1).

    Parameters:
        infoSet: dict of scraped fields; keys used here are 'collUrl',
            'subUrl', 'csvUrl', 'advUrl', 'name', 'homepage',
            'institution' (iterable).  advUrl == ' ' means "no advisor
            csv" and stores NULL instead.
        dl: the dlurl1 record being processed; keys used: 'id',
            'name', 'url'.
    """
    conn, cur = getCursor()  # database connection and cursor
    # Build the UPDATE by string concatenation; the statement differs
    # depending on whether an advisor-csv url is present.
    if type(infoSet) != dict:
        print('error')
    if infoSet['advUrl'] != ' ':
        updateSQL = 'update dlurl1 set colleage="' + infoSet[
            'collUrl'] + '",subject="' + infoSet[
                'subUrl'] + '",papercsv="' + infoSet[
                    'csvUrl'] + '",advisorcsv="' + infoSet[
                        'advUrl'] + '" where id=' + str(dl['id'])
    else:
        updateSQL = 'update dlurl1 set colleage="' + infoSet[
            'collUrl'] + '",subject="' + infoSet[
                'subUrl'] + '",papercsv="' + infoSet[
                    'csvUrl'] + '",advisorcsv=Null where id=' + str(dl['id'])
    # Keep a copy of the full statement for logging if the retry path runs.
    rowSQL = updateSQL[:]
    # Reconcile the scraped name with the stored one (merged if they differ).
    comName = isTheSame(infoSet['name'], dl['name'])

    # --- store the urls ---
    try:
        cur.execute(updateSQL)
        conn.commit()
    except Exception:
        print('updata error' + 'id: ' + str(dl['id']))
        # Fallback for over-long data: the csv url (the likely-too-long
        # column) is dropped from the row, saved to a separate file, and
        # NOT stored in the database.
        try:
            writeList(SQLerror_path, [updateSQL])  # keep failed SQL on disk
            if infoSet['advUrl'] != ' ':
                updateSQL = 'update dlurl1 set colleage="' + infoSet[
                    'collUrl'] + '",subject="' + infoSet[
                        'subUrl'] + '",papercsv="Too long",advisorcsv="' + infoSet[
                            'advUrl'] + '" where id=' + str(dl['id'])
            else:
                updateSQL = 'update dlurl1 set colleage="' + infoSet[
                    'collUrl'] + '",subject="' + infoSet[
                        'subUrl'] + '",papercsv="Too long",advisorcsv=Null where id=' + str(
                            dl['id'])
            cur.execute(updateSQL)
            conn.commit()
            print(
                'another way of update is successful! Here is the papercsv url:'
            )
            print(rowSQL)
        except Exception:
            print(updateSQL)

    # --- insert the expert ---
    insertSQL = 'insert into expert (name,homepage,dl) values (%s,%s,%s)'
    insertValues = (comName, infoSet['homepage'], dl['url'])

    try:
        cur.execute(insertSQL, insertValues)
        conn.commit()
    except Exception:
        print('insert error ' + 'id: ' + str(dl['id']))

    # --- insert the institutions ---
    insertSQL = ''
    # NOTE(review): recovers the new expert's eid by re-selecting the
    # max — racy if multiple writers insert concurrently; confirm this
    # only ever runs single-threaded.
    result = getResult('select * from expert order by eid desc limit 1', cur)
    eid = result[0]['eid']

    for inst in infoSet['institution']:
        try:
            insertSQL = 'insert into experience (eid,institution) values(' + str(
                eid) + ', "' + inst + '")'
            cur.execute(insertSQL)
            conn.commit()
        except Exception:
            print('error:' + insertSQL)

    cur.execute('update dlurl1 set status=1 where id=' + str(dl['id']))  # mark as extracted
    conn.commit()
    cur.close()
    conn.close()
    print('Competed ' + str(dl['id']))