import random
import time

# Main loop: fetch each colleague page and store the extracted names and links.
def findPage():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        # Test only: read the page from a local txt file instead of the network.
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page - colleagues.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot():  # randomly triggered: rotate proxy and user agent
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))  # random sleep
        html = str(getPage(dl['colleage']))  # fetch the URL
        if html != ' ':
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn, cur, nl)
                #print(nl)
        print('Now is ' + str(dl['id']))
        #break  # only run one time
    cur.close()
    conn.close()
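# The ChangeOrNot() helper called above is not shown in this section; the
# comments only describe it as a random trigger. A minimal sketch under that
# assumption (the 50/50 split is a guess, not the project's actual threshold):
import random

def ChangeOrNot():
    # Return True roughly half the time, so the scraper's identity
    # (headers, cookies, proxy) changes at unpredictable intervals.
    return random.random() < 0.5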
    def run(self):
        while True:
            # Get the work from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()
            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Decorate the request parameters
            if ChangeOrNot():  # randomly triggered
                #httpProxies = editeProxies(http, httpProxies)  # change http proxy
                headers = editeHeader(ua, headers, dl['name'])  # change user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep
            # Fetch the html
            html = str(getPage(dl['url'], httpProxies, headers, cookies))
            # Return the proxy and user agent to their queues
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed, requeue the task
                self.dlQueue.put(dl)
            else:
                # Hand the page off to the parser queue
                self.htmlQueue.put((html, dl))
            #print('get: ' + str(dl['id']))
            #ts2 = datetime.datetime.now()
            #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
            self.dlQueue.task_done()
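# getPage() is referenced throughout but not defined in this section. A minimal
# sketch using the requests library, assuming the four-argument signature seen
# above and the ' ' sentinel the callers test for on failure:
import requests

def getPage(url, httpProxies=None, headers=None, cookies=None):
    # Fetch a page; return its text, or ' ' so callers can requeue the task.
    try:
        resp = requests.get(url, proxies=httpProxies, headers=headers,
                            cookies=cookies, timeout=30)
        resp.raise_for_status()
        return resp.text
    except Exception:
        return ' '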
    def run(self):
        while True:
            # Get the task from the queue and expand the tuple
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()
            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            # Decorate the request parameters
            if ChangeOrNot():  # randomly triggered
                headers = editeHeader(ua, headers, dl['name'])  # change user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep
            # Fetch the html and pull the paper-CSV link out of it
            html = str(getPage(dl['url'], httpProxies, headers, cookies))
            papercsv = getCsvUrl(html)
            dl['papercsv'] = papercsv
            # Return the proxy and user agent to their queues
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed, requeue the task
                self.dlQueue.put(dl)
            else:
                print('get: ' + str(dl['id']))
                self.htmlQueue.put(dl)
            self.dlQueue.task_done()
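# getCsvUrl() is not defined in this section. A hedged sketch, assuming it
# simply pulls the first CSV link out of the fetched profile page (the
# selector is an assumption, not the project's actual logic):
from bs4 import BeautifulSoup

def getCsvUrl(html):
    # Return the href of the first link ending in .csv, or '' if none exists.
    soup = BeautifulSoup(html, "lxml")
    for a in soup.find_all('a', href=True):
        if a['href'].lower().endswith('.csv'):
            return a['href']
    return ''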
    def run(self):
        while True:
            httpProxies = commHttpProxies.copy()
            headers = commHeaders.copy()
            cookies = commCookies.copy()
            dl = self.dlQueue.get()
            http = self.ipQueue.get()
            ua = self.uaQueue.get()
            httpProxies['https'] = http
            #ts1 = datetime.datetime.now()
            # Decorate the request parameters
            if ChangeOrNot():  # randomly triggered
                headers = editeHeader(ua, headers, dl['name'])  # change user agent
                cookies = editeCookies(cookies)
            time.sleep(random.randint(3, 14))  # random sleep
            # Optional: strip the CFID/CFTOKEN session parameters from the URL
            dl['colleage'] = cleanURL(dl['colleage'])
            #if '&CFID=716005087&CFTOKEN=29677367' in dl['colleage']:
            #    dl['colleage'] = dl['colleage'].replace('&CFID=716005087&CFTOKEN=29677367', '')
            # Fetch the html
            html = str(getPage(dl['colleage'], httpProxies, headers, cookies))
            # Return the proxy and user agent to their queues
            self.ipQueue.put(http)
            self.uaQueue.put(ua)
            if html == ' ':  # fetch failed, requeue the task
                self.dlQueue.put(dl)
            else:
                print('get HTML:' + str(dl['id']))
                self.htmlQueue.put((html, dl))
            #print('get: ' + str(dl['id']))
            #ts2 = datetime.datetime.now()
            #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
            self.dlQueue.task_done()
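# cleanURL() is not defined in this section, but the commented-out lines above
# show its intent: drop the ColdFusion CFID/CFTOKEN session parameters from the
# URL. A minimal regex-based sketch of that behavior:
import re

def cleanURL(url):
    # Remove any CFID/CFTOKEN query parameters regardless of their values,
    # so the stored URL no longer carries a stale session.
    return re.sub(r'&(?:CFID|CFTOKEN)=[^&]*', '', url)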
# Single-threaded variant: drain the task queue and save each page to a txt file.
httpProxies = commHttpProxies.copy()  # added: referenced below but never copied
headers = commHeaders.copy()
cookies = commCookies.copy()
while not dlQueue.empty():
    dl = dlQueue.get()
    proxy = random.choice(http)  # bug fix: was `http = random.choice(http)`, which overwrote the proxy list itself
    ua = random.choice(uag)
    httpProxies['https'] = proxy
    #ts1 = datetime.datetime.now()
    # Decorate the request parameters
    if ChangeOrNot():  # randomly triggered
        headers = editeHeader(ua, headers, dl['name'])  # change user agent
        cookies = editeCookies(cookies)
    time.sleep(random.randint(5, 20))  # random sleep
    # Fetch the html
    html = str(getPage(dl['url'], httpProxies, headers, cookies))
    if html == ' ':  # fetch failed, requeue the task
        dlQueue.put(dl)
        continue
    # Save the page to a file
    try:
        path = dirPath + str(dl['id']) + '.txt'
        writeTXT(path, html)
        print('complete: ' + str(dl['id']))
    except Exception:
        print('error:' + str(dl['id']))
        dlQueue.put(dl)
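# writeTXT() is referenced but not defined here. A minimal sketch, assuming it
# just writes the page text to the given path (utf-8 is an assumption):
def writeTXT(path, text):
    # Persist the fetched html so it can be parsed offline later.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)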
from bs4 import BeautifulSoup

# Main loop: fetch each author's subject page and store the extracted subject areas.
def findSubject():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    subList = getResult(sltDLNotCom, cur)  # returns a list of rows
    for dl in subList:  # bug fix: was `dlList`, which is undefined here
        # Test only: read the page from a local txt file instead of the network.
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page.txt')
        if ChangeOrNot():  # randomly triggered: rotate proxy and user agent
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 20))  # random sleep
        html = str(getPage(dl['subject']))  # fetch the URL
        if html != ' ':
            subject = extractSubject(html)
            status = addInfo(conn, cur, subject, dl['id'])
            if status == 1:
                print('Now is ' + str(dl['id']))
    cur.close()
    conn.close()

def extractSubject(doc):
    # combineStr = ''
    subject = []
    soup = BeautifulSoup(''.join(doc), "lxml")
    h5 = soup.find(text='Subject Areas')
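# extractSubject() is truncated above right after locating the 'Subject Areas'
# heading text. A hedged sketch of one way the extraction could continue from
# that anchor (the list-based layout is an assumption about the ACM profile
# page, not the project's actual code):
def extractSubjectFrom(h5):
    # From the heading node, take the next <ul> and keep each item's text.
    subject = []
    ul = h5.find_next('ul') if h5 else None
    if ul is not None:
        for li in ul.find_all('li'):
            subject.append(li.get_text(strip=True))
    return subject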