def prepareInstitution():
    # Build a de-duplicated institution table from experience1 and write the
    # new institution id back into experience1.tem.
    conn, cur = getCursor()
    selectSQL = 'select institution,id from experience1'
    institResult = getResult(selectSQL, cur)
    institList = []
    print('get ready')
    for tr in institResult:
        institList.append(tr['institution'])
    institList = list(set(institList))
    print('begin insert')
    for i in range(len(institList)):
        # do NOT let the id auto-increment -- adjust the database first!
        insertSQL = 'insert into institution (id, instutition) values (' + str(i) + ',"' + institList[i] + '")'
        cur.execute(insertSQL)
        conn.commit()
    print('complete1')
    for ir in institResult:
        if ir['institution'] in institList:
            idx = institList.index(ir['institution'])
            updateSQL = 'update experience1 set tem=' + str(idx) + ' where id=' + str(ir['id'])
            cur.execute(updateSQL)
            conn.commit()
    print('complete2')
    cur.close()
    conn.close()
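# The functions in this file rely on the getCursor()/getResult() helpers imported
# from tool (see the combine module further below). This is only a minimal sketch of
# what they presumably look like, assuming pymysql with a DictCursor (rows are read
# as dicts, e.g. tr['institution']); the connection parameters are placeholders:
import pymysql

def getCursor():
    # open a connection and return it together with a dict-style cursor
    conn = pymysql.connect(host='localhost', user='root', password='***',
                           database='acm', charset='utf8',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn, conn.cursor()

def getResult(selectSQL, cur):
    cur.execute(selectSQL)         # run the query
    return list(cur.fetchall())    # list of dict rows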
def mainFunction():
    http, uag = getHttpUa()
    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)
    for k in range(1):
        aWorker = analysisWorker(htmlQueue, infoQueue)
        aWorker.daemon = True
        aWorker.start()
    print('ok1')
    for i in range(4):
        pWorker = pageWorker(ipQueue, uaQueue, dlQueue, htmlQueue)
        pWorker.daemon = True
        pWorker.start()
    print('ok2')
    conn, cur = getCursor()
    dlList = getResult(sltDLNotCom, cur)  # returns a 2-D list of url records
    for dl in dlList:
        dlQueue.put(dl)
    cur.close()
    conn.close()
    print('ok3')
    for j in range(1):
        mWorker = mysqlWorker(infoQueue)
        mWorker.daemon = True
        mWorker.start()
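# The worker threads above share module-level queues. A minimal sketch of how those
# are presumably declared (standard-library queue.Queue); the trailing join() calls
# are an assumption -- mysqlWorker.run below calls task_done(), which only matters
# if the producer eventually joins the queue:
import queue

ipQueue = queue.Queue()    # proxy ips for pageWorker
uaQueue = queue.Queue()    # user-agent strings for pageWorker
dlQueue = queue.Queue()    # url records waiting to be fetched
htmlQueue = queue.Queue()  # fetched html handed to analysisWorker
infoQueue = queue.Queue()  # parsed records handed to mysqlWorker

#dlQueue.join()    # would block until every url record has been processed
#infoQueue.join()  # would block until every parsed record has been stored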
def findPage():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        # this is a test!!!! read from a txt instead of the web
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page - colleagues.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot() == True:
            editeProxies(http)
            editeHeader(ua)
            time.sleep(random.randint(1, 12))
        html = str(getPage(dl['colleage']))  # fetch the colleagues url
        if html != ' ':
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn, cur, nl)
                #print(nl)
        print('Now is ' + str(dl['id']))
        #break  # only run one time
    cur.close()
    conn.close()
def mainFunction():
    http, uag = getHttpUa()
    for ua in uag:
        uaQueue.put(ua)
    for ip in http:
        ipQueue.put(ip)
    for i in range(5):
        pWorker = pageWorker(uaQueue, ipQueue, dlQueue)
        pWorker.daemon = True
        pWorker.start()
    conn, cur = getCursor()
    dlList = getResult(selectSQL, cur)
    '''
    with open(expertList_path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dlList.append(row)
    '''
    for dl in dlList:
        dlQueue.put(dl)
def linkP2D():
    conn, cur = getCursor()
    resuID = getResult(selectSQL, cur)
    for re in resuID:
        updateSQL = 'update dlurl1 set tem=3 where id=' + str(re['eid'])
        cur.execute(updateSQL)
        conn.commit()
        print('complete: ' + str(re['eid']))
def checkSame(url, userID):
    conn, cur = getCursor()
    checkSQL = 'select id,url from dlurl1 where userid=' + str(userID)
    number = cur.execute(checkSQL)
    if number > 0:  # the result set has at least one record
        for c in cur.fetchall():  # check every record
            if c['url'] == url:  # the url is already stored
                return True
    return False
def mainFunction():
    conn, cur = getCursor()
    filePathList = readFiles(files_path)
    for fileP in filePathList:
        insertPaperSQL(fileP)
        #break  # run only once
    cur.close()
    conn.close()
def cleanPaper():  # this one is already finished
    conn, cur = getCursor()
    selectSQL = 'select id from dlurl1'
    eidList = getResultList(selectSQL, 'id', cur)
    for eid in eidList:
        selectSQL = 'select id from paper where eid=' + str(eid)
        idPidList = getResult(selectSQL, cur)  # get the paper row ids for this expert
        pidList = []
        for ip in idPidList:
            pidList.append([ip['id'], False])
        print('start ' + str(eid))
        #sameEid = []
        for i in range(len(pidList)):  # assign a paperid
            updateSQL = 'update paper set paperid=' + str(i) + ' where id=' + str(pidList[i][0])
            cur.execute(updateSQL)
            conn.commit()
            if pidList[i][1] == True:
                continue
            for j in range((i + 1), len(pidList)):
                if pidList[j][1] == True:
                    continue
                iR = getResult('select doi,time,title from paper where id=' + str(pidList[i][0]), cur)
                jR = getResult('select doi,time,title from paper where id=' + str(pidList[j][0]), cur)
                if iR[0]['doi'] == jR[0]['doi']:  # same paper
                    pidList[j][1] = True
                    updateSQL = 'update paper set paperid=' + str(i) + ' where id=' + str(pidList[j][0])
                    cur.execute(updateSQL)
                    conn.commit()
                    continue
                elif (iR[0]['time'] == jR[0]['time']) and (iR[0]['title'] == jR[0]['title']):  # same paper
                    pidList[j][1] = True
                    updateSQL = 'update paper set paperid=' + str(i) + ' where id=' + str(pidList[j][0])
                    cur.execute(updateSQL)
                    conn.commit()
                    continue
                else:  # different papers
                    pass
            pidList[i][1] = True
    cur.close()
    conn.close()
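# getResultList() (used above and in cleanTopic below) is not defined in this file.
# A minimal sketch of the assumed helper: run a query and flatten one named column
# into a plain list, using the same dict-style rows as getResult():
def getResultList(selectSQL, key, cur):
    cur.execute(selectSQL)                       # run the query
    return [row[key] for row in cur.fetchall()]  # keep only the requested column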
def cleanUrlMySQL():
    conn, cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        url, userid = analysisRecord(dl['url'])
        result = addInfo(url, userid, dl['id'], cur, conn)
        if result == 1:
            cur.execute('update dlurl1 set tem=1 where id=' + str(dl['id']))  # mark as done
            conn.commit()
        print('Now is ' + str(dl['id']))
def mainFunction():
    path = 'E:/Code/Data/insertSQL.csv'
    sqlList = readList(path)
    conn, cur = getCursor()  # get the database connection and cursor
    print('total is ' + str(len(sqlList)))
    for i in range(len(sqlList)):
        try:
            cur.execute(sqlList[i])
            conn.commit()
            print('now is ' + str(i))
        except Exception:
            print('error ' + sqlList[i])
            #break
    cur.close()
    conn.close()
def linkP2D():
    conn, cur = getCursor()
    resuID = getResult(selectSQL, cur)
    for re in resuID:
        updateSQL = 'update dlurl1 set status=2 where id=' + str(re['eid'])
        try:
            cur.execute(updateSQL)
            conn.commit()
            print('complete: ' + str(re['eid']))
        except Exception:
            print('error: ' + str(re['eid']))
    cur.close()
    conn.close()
def preparePublication():  # build the publication table
    conn, cur = getCursor()
    selectSQL = 'select pid from publication'
    paperResult = getResult(selectSQL, cur)
    print('read completed')
    for i in range(len(paperResult)):
        if i < -1:  # never true as written; the bound was presumably raised when resuming a run
            continue
        print(str(paperResult[i]['pid']))
        updateSQL = 'update paper set paperid=' + str(i) + ' where pid="' + paperResult[i]['pid'] + '"'
        cur.execute(updateSQL)
        conn.commit()
        print('completed: ' + str(i))
    '''
    paperList = []
    print('get ready')
    for tr in paperResult:
        paperList.append(tr['pid'])
    paperList = list(set(paperList))
    print('begin insert')
    for i in range(len(paperList)):
        # do NOT let the id auto-increment -- adjust the database first!
        insertSQL = 'insert into publication (id, pid) values ('+str(i)+',"'+paperList[i]+'")'
        cur.execute(insertSQL)
        conn.commit()
    print('complete1')
    pidDict = {}
    selectResult = getResult('select id, pid from publication',cur)
    for sr in selectResult:
        pidDict[sr['pid']] = sr['id']
    for ir in paperResult:
        if ir['pid'] in paperList:
            idx = paperList.index(ir['pid'])
            updateSQL = 'update paper set paperid='+str(pidDict[ir['pid']])+' where id='+str(ir['id'])
            cur.execute(updateSQL)
            conn.commit()
    print('complete2')
    '''
    cur.close()
    conn.close()
def addInfo(nl, id):
    conn, cur = getCursor()
    insertSQL = 'insert into dlurl1 (name,url,status,tem,userID) values("' + nl[0] + '","' + nl[1] + '",0,1,' + str(nl[2]) + ');'
    #insertValues = (nl[0],nl[1],nl[2])
    try:
        cur.execute(insertSQL)
        conn.commit()
        print('Complete:' + nl[0] + ' id:' + str(id))
        cur.close()
        conn.close()
        return True
    except Exception:
        print('insert error:')
        print(nl)
        cur.close()
        conn.close()
        return False
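# The insert above builds SQL by string concatenation, which breaks as soon as a
# name or url contains a double quote. A hedged alternative sketch using the
# parameterized form the driver already supports (the same %s style that
# addInfo(infoSet, dl) further below uses); addInfoParam is a hypothetical name,
# not part of the original code:
def addInfoParam(nl, id):
    conn, cur = getCursor()
    insertSQL = 'insert into dlurl1 (name,url,status,tem,userID) values (%s,%s,0,1,%s)'
    try:
        cur.execute(insertSQL, (nl[0], nl[1], nl[2]))  # the driver handles quoting/escaping
        conn.commit()
        print('Complete:' + nl[0] + ' id:' + str(id))
        return True
    except Exception:
        print('insert error:')
        print(nl)
        return False
    finally:
        cur.close()
        conn.close()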
def findCSV():
    httpProxies = commHttpProxies
    headers = commHeaders
    http, ua = getHttpUa()  # get the candidate proxy and User-Agent lists for disguise
    conn, cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # returns a 2-D list of url records
    for dl in dlList:
        if ChangeOrNot() == True:  # triggered at random
            httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
            headers = editeHeader(ua, headers)  # rotate the User-Agent
            time.sleep(random.randint(1, 12))  # random sleep
        url = dl['papercsv']
        if url != None and len(url) > 15:
            try:
                r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
                if r.status_code == 200:
                    csv_path = file_path + str(dl['id']) + '.csv'
                    with open(csv_path, 'wb') as f:
                        f.write(r.content)
                    print('Now is ' + str(dl['id']))
            except requests.RequestException as e:
                print(e)
                httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
                headers = editeHeader(ua, headers)  # rotate the User-Agent
                time.sleep(random.randint(1, 12))  # random sleep
                try:
                    r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
                    if r.status_code == 200:
                        csv_path = file_path + str(dl['id']) + '.csv'
                        with open(csv_path, 'wb') as f:
                            f.write(r.content)
                        print('Now is ' + str(dl['id']))
                except Exception:
                    print('second try also failed! id:' + str(dl['id']))
        #break  # only run one time
    cur.close()
    conn.close()
def checkSame(url, userID):
    # Return True if this url is already stored for the user (or if the check fails).
    conn, cur = getCursor()
    try:
        checkSQL = 'select id,url from dlurl1 where userid=' + str(userID)
        number = cur.execute(checkSQL)
        if number > 0:  # the result set has at least one record
            for c in cur.fetchall():  # check every record
                if c['url'] == url:  # the url is already stored
                    cur.close()
                    conn.close()
                    return True
        cur.close()
        conn.close()
        return False
    except Exception:
        print('check error!:' + url)
        cur.close()
        conn.close()
        return True
def run(self):
    while True:
        nl, id = self.infoQueue.get()
        #ts1 = datetime.datetime.now()
        # process the item
        flag = addInfo(nl, id)  # True when the insert succeeded
        #print('infoset:' + str(infoSet['name']))
        #print('mysql: ' + str(dl['id']))
        #ts2 = datetime.datetime.now()
        #print('mysql id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
        if not flag:
            self.infoQueue.put((nl, id))  # put it back and retry later
        else:
            print('mysql:' + str(id))
            conn, cur = getCursor()
            cur.execute('update dlurl1 set tem=0 where id=' + str(id))
            conn.commit()
            cur.close()
            conn.close()
        self.infoQueue.task_done()
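# run() above is a method of mysqlWorker, which mainFunction() starts as a daemon
# thread with mysqlWorker(infoQueue). A minimal sketch of the surrounding class,
# assuming it is a plain threading.Thread subclass; this __init__ is an assumption,
# not the original code:
import threading

class mysqlWorker(threading.Thread):
    def __init__(self, infoQueue):
        threading.Thread.__init__(self)
        self.infoQueue = infoQueue  # queue of (nl, id) tuples produced upstream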
def findPage():
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    # Fetch the list of urls to visit from the database, then loop over them.
    http, ua = getHttpUa()  # get the candidate proxy and User-Agent lists for disguise
    conn, cur = getCursor()  # get the database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # returns a 2-D list of url records
    #for i in range(15):
    #i = 0
    for dl in dlList:
        # this is a test!!!! read from a txt instead of the web
        #html = readTXT('E:/Code/Test Data/Hsinchun Chen.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot() == True:  # triggered at random
            httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
            headers = editeHeader(ua, headers, dl['name'])  # rotate the User-Agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(2, 12))  # random sleep
        print(str(httpProxies['https']))
        print(str(headers['User-Agent']))
        print(str(headers['Referer']))
        print(str(cookies['CFID']))
        print()
        '''
        html = str(getPage(dl['url'],httpProxies,headers,cookies))  # fetch the url
        if html != ' ':
            infoSet = analysisPage(html,int(dl['id']))  # parse the page
            addInfo(conn,cur,infoSet,dl)  # store into the database
            cur.execute('update dlurl1 set status=1 where id='+str(dl['id']))  # mark as extracted
            conn.commit()
        print('Now is '+str(dl['id']))
        #break  # only run one time
        '''
    cur.close()
    conn.close()
def prepareTopic():
    # Build a distinct topic list, keeping only topics that appear more than once.
    conn, cur = getCursor()
    selectSQL = 'select id,topic,num from topic'
    topicsResult = getResult(selectSQL, cur)
    topicList = []
    print('get ready')
    for tr in topicsResult:
        topicList.append(tr['topic'])
    topicList = list(set(topicList))
    numList = [0 for i in range(len(topicList))]
    print('begin count')
    for tr in topicsResult:
        idx = topicList.index(tr['topic'])
        numList[idx] += int(tr['num'])
    print('begin insert')
    for i in range(len(topicList)):
        # do NOT auto-increment the id here, so topic.tid can be updated in step
        # with it -- adjust the original database first!
        #insertSQL = 'insert into topiclist (id,topic,num) values ('+str(i)+',"'+topicList[i]+'",'+str(numList[i])+')'
        if numList[i] > 1:  # ignore topics that appear only once
            insertSQL = 'insert into topiclist (id,topic,num) values (' + str(i) + ',"' + topicList[i] + '",' + str(numList[i]) + ')'
            cur.execute(insertSQL)
            conn.commit()
    print('complete1')
    for eachTopic in topicsResult:
        idx = -1
        if eachTopic['topic'] in topicList:
            idx = topicList.index(eachTopic['topic'])
            if numList[idx] < 2:
                idx = -1  # ignore topics that appear only once
        updateSQL = 'update topic set other=' + str(idx) + ' where id=' + str(eachTopic['id'])
        cur.execute(updateSQL)
        conn.commit()
    print('complete2')
    cur.close()
    conn.close()
def cleanTopic():
    # Remove duplicate topic rows for each expert.
    conn, cur = getCursor()
    selectSQL = 'select id from dlurl1'
    eidList = getResultList(selectSQL, 'id', cur)
    for eid in eidList:
        selectSQL = 'select id,topic from topic where eid=' + str(eid)
        idPidList = getResult(selectSQL, cur)  # get the topic rows for this expert
        pidList = []
        for ip in idPidList:
            pidList.append(ip['topic'])
        newPidList = list(set(pidList))
        print('start ' + str(eid))
        if not len(newPidList) < len(pidList):  # no duplicates for this expert
            print('not duplicate ' + str(eid))
            continue
        # double loop to find identical topics
        passID = []
        sameEid = []
        for i in range(len(idPidList)):
            if i in passID:
                continue
            for j in range((i + 1), len(idPidList)):
                if idPidList[i]['topic'] == idPidList[j]['topic']:
                    sameEid.append(idPidList[j]['id'])  # schedule this row for deletion
                    passID.append(j)
        for sid in sameEid:
            deleteSQL = 'delete from topic where id=' + str(sid)
            try:
                cur.execute(deleteSQL)
                conn.commit()
                print('complete delete:' + str(sid))
            except Exception:
                print('delete error ' + str(sid))
    cur.close()
    conn.close()
'''
Merge the pairs that can be confirmed.
update <table> a1 set a1.d=(select d from ((select d from <table> where k23=3) as a2)) where a1.k23=1;
(1) every pair with name similarity above 0.8 and institution similarity above 0.5
'select xid,yid from name2compare where institu>-1 and institu<0.5 and paper>0.2 and nameRate>0.6'
'select xid,yid from name2compare where institu>-1 and institu<0.5 and paper<0.2 and coauthor>0.01 and nameRate>0.6'
(2) institution similarity below 0.5 and paper similarity above 0.2
(3)
'''
from tool import getCursor, getResult

conn, cur = getCursor()


#import os;os.chdir('e:/Code/Python');import geng_combine;geng_combine.combine()
def combine():
    #selectSQL = 'select id,xid,yid from name2compare where institu>0.5 and nameRate>0.75'
    #selectSQL = 'select id,xid,yid from name2compare where institu>-1 and institu<0.5 and paper>0.2 and nameRate>0.6'
    #selectSQL = 'select id,xid,yid from name2compare where paper<0.2 and paper>0 and coauthor >0.01 and nameRate>0.6'
    #selectSQL = 'select id,xid,yid from name2compare where coauthor<0.01 and topic>0.06 and nameRate>0.6'
    selectSQL = 'select id,xid,yid from name2compare where institu<0.1 and institu>-1 and paper>=0 and coauthor>0'
    selectResult = getResult(selectSQL, cur)
    i = 0
    total = len(selectResult)
    for sr in selectResult:
        i += 1
        fg = updateCombine(sr['xid'], sr['yid'])
        if fg:
            print('completed: ' + str(round(i / total, 3)) + ' || id:' + str(sr['id']))
        else:
            print('somewhere error: ' + str(round(i / total, 3)) + ' || id:' + str(sr['id']))
        #break
def addInfo(infoSet, dl):
    conn, cur = getCursor()  # get the database connection and cursor
    # Build the update statement for each case by string concatenation.
    if type(infoSet) != dict:
        print('error')
    if infoSet['advUrl'] != ' ':
        updateSQL = 'update dlurl1 set colleage="' + infoSet['collUrl'] \
            + '",subject="' + infoSet['subUrl'] \
            + '",papercsv="' + infoSet['csvUrl'] \
            + '",advisorcsv="' + infoSet['advUrl'] \
            + '" where id=' + str(dl['id'])
    else:
        updateSQL = 'update dlurl1 set colleage="' + infoSet['collUrl'] \
            + '",subject="' + infoSet['subUrl'] \
            + '",papercsv="' + infoSet['csvUrl'] \
            + '",advisorcsv=Null where id=' + str(dl['id'])
    rowSQL = updateSQL[:]
    # Check whether the two names are the same; merge them if they differ.
    comName = isTheSame(infoSet['name'], dl['name'])
    # Store the urls.
    try:
        cur.execute(updateSQL)
        conn.commit()
    except Exception:
        print('update error ' + 'id: ' + str(dl['id']))
        # Fallback for rows whose data is too long to insert: drop the (very likely
        # oversized) papercsv url, save the original statement to a separate file,
        # and do not store that url in the database.
        try:
            writeList(SQLerror_path, [updateSQL])  # save to a separate file
            if infoSet['advUrl'] != ' ':
                updateSQL = 'update dlurl1 set colleage="' + infoSet['collUrl'] \
                    + '",subject="' + infoSet['subUrl'] \
                    + '",papercsv="Too long",advisorcsv="' + infoSet['advUrl'] \
                    + '" where id=' + str(dl['id'])
            else:
                updateSQL = 'update dlurl1 set colleage="' + infoSet['collUrl'] \
                    + '",subject="' + infoSet['subUrl'] \
                    + '",papercsv="Too long",advisorcsv=Null where id=' + str(dl['id'])
            cur.execute(updateSQL)
            conn.commit()
            print('another way of update is successful! Here is the papercsv url:')
            print(rowSQL)
        except Exception:
            print(updateSQL)
    # Store the expert.
    insertSQL = 'insert into expert (name,homepage,dl) values (%s,%s,%s)'
    insertValues = (comName, infoSet['homepage'], dl['url'])
    try:
        cur.execute(insertSQL, insertValues)
        conn.commit()
    except Exception:
        print('insert error ' + 'id: ' + str(dl['id']))
    # Store the institutions.
    insertSQL = ''
    result = getResult('select * from expert order by eid desc limit 1', cur)
    eid = result[0]['eid']
    for inst in infoSet['institution']:
        try:
            insertSQL = 'insert into experience (eid,institution) values(' + str(eid) + ', "' + inst + '")'
            cur.execute(insertSQL)
            conn.commit()
        except Exception:
            print('error:' + insertSQL)
    cur.execute('update dlurl1 set status=1 where id=' + str(dl['id']))  # mark as extracted
    conn.commit()
    cur.close()
    conn.close()
    print('Completed ' + str(dl['id']))