def findPage():
    http, ua = getHttpUa()
    conn, cur = getCursor()
    dlList = getResult(sltCollNotNull, cur)
    for dl in dlList:
        # Test mode: read a saved page from a local txt file instead of fetching.
        #html = readTXT('E:/Code/Test Data/Paul Robert Barford - ACM author profile page - colleagues.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot():  # triggered at random
            editeProxies(http)  # rotate the http proxy
            editeHeader(ua)  # rotate the user agent
            time.sleep(random.randint(1, 12))  # random sleep
        html = str(getPage(dl['colleage']))  # fetch the colleagues page
        if html != ' ':
            nameLink = analysisPage(html)
            for nl in nameLink:
                addInfo(conn, cur, nl)
                #print(nl)
            print('Now is ' + str(dl['id']))
        #break  # only run one time
    cur.close()
    conn.close()
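# --- Hedged sketch: ChangeOrNot() gates every disguise rotation above but is
# --- not defined in this section. A minimal assumed implementation; the
# --- one-in-three odds are a guess, not taken from the source.
import random

def ChangeOrNot():
    # Randomly decide whether to rotate proxy / headers / cookies this round.
    return random.randint(1, 3) == 1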
def run(self):
    while True:
        # Get a task from the queue and unpack it
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        ua = self.uaQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep
        # Fetch the html
        html = str(getPage(dl['url'], httpProxies, headers, cookies))
        papercsv = getCsvUrl(html)
        dl['papercsv'] = papercsv
        # Return the proxy and user agent to their pools
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if html == ' ':  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        else:
            print('get: ' + str(dl['id']))
            self.htmlQueue.put(dl)
        self.dlQueue.task_done()
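# --- Hedged sketch: getCsvUrl(html) is assumed to pull the csv export link
# --- out of an ACM author profile page. The regex is a guess at the link
# --- shape, not the project's actual parser.
import re

def getCsvUrl(html):
    # Return the first href containing '.csv', or None if no export link exists.
    m = re.search(r'href="([^"]*\.csv[^"]*)"', html)
    return m.group(1) if m else None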
def run(self):
    while True:
        # Get a task from the queue and unpack it
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        ua = self.uaQueue.get()
        httpProxies['https'] = http
        #ts1 = datetime.datetime.now()
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            #httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
            headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(5, 20))  # random sleep
        # Fetch the html
        html = str(getPage(dl['url'], httpProxies, headers, cookies))
        # Return the proxy and user agent to their pools
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if html == ' ':  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        else:  # only forward pages that were actually fetched
            self.htmlQueue.put((html, dl))
        #print('get: ' + str(dl['id']))
        #ts2 = datetime.datetime.now()
        #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
        self.dlQueue.task_done()
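# --- Hedged sketch: editeHeader(ua, headers, name) appears to swap in a new
# --- User-Agent, and findPage() below prints headers['Referer'], so a Referer
# --- is also set. Faking a search for the author's name is an assumption
# --- based on dl['name'] being passed in.
import urllib.parse

def editeHeader(ua, headers, name):
    headers = headers.copy()
    headers['User-Agent'] = ua
    # Pretend we arrived from a search for this author (assumed behaviour).
    headers['Referer'] = 'https://www.google.com/search?q=' + urllib.parse.quote(name)
    return headers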
def run(self):
    while True:
        # Get a task from the queue and unpack it
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        ua = self.uaQueue.get()
        dl = self.htmlQueue.get()
        http = self.ipQueue.get()
        httpProxies['https'] = http
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(6, 15))  # random sleep
        # ----------------------------- this is where this worker differs:
        # it downloads the csv instead of fetching a page
        flag = findCSV(dl, httpProxies, headers, cookies)
        self.ipQueue.put(http)  # return the proxy and user agent to their pools
        self.uaQueue.put(ua)
        if not flag:  # download failed, requeue the task
            self.htmlQueue.put(dl)
        self.htmlQueue.task_done()
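# --- Hedged sketch: editeCookies(cookies) is assumed to randomise the
# --- session-identifying cookie values. CFID/CFTOKEN are ColdFusion session
# --- keys; findPage() below prints cookies['CFID'], and a CFID/CFTOKEN pair
# --- appears in a commented-out url, so at least these two keys exist.
import random

def editeCookies(cookies):
    cookies = cookies.copy()
    cookies['CFID'] = str(random.randint(100000000, 999999999))
    cookies['CFTOKEN'] = str(random.randint(10000000, 99999999))
    return cookies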
def run(self):
    while True:
        # Get a task from the queue and unpack it
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        ua = self.uaQueue.get()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        httpProxies['https'] = http
        #ts1 = datetime.datetime.now()
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            #httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
            headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(7, 15))  # random sleep
        # Download the csv
        flag = findCSV(dl, httpProxies, headers, cookies)
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        self.ipQueue.put(http)  # return the proxy and user agent to their pools
        self.uaQueue.put(ua)
        if not flag:  # download failed, requeue the task
            self.dlQueue.put(dl)
        #print('get: ' + str(dl['id']))
        #ts2 = datetime.datetime.now()
        #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
        self.dlQueue.task_done()
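# --- Hedged sketch of how these worker threads might be wired together. The
# --- workerCls constructor signature, thread count, and queue names are all
# --- assumptions inferred from the self.dlQueue / ipQueue / uaQueue /
# --- htmlQueue attributes the run() methods above rely on.
from queue import Queue
from threading import Thread

def startWorkers(dlList, httpList, uaList, workerCls, nThreads=8):
    dlQueue, ipQueue, uaQueue, htmlQueue = Queue(), Queue(), Queue(), Queue()
    for dl in dlList:
        dlQueue.put(dl)
    for http in httpList:
        ipQueue.put(http)
    for ua in uaList:
        uaQueue.put(ua)
    for _ in range(nThreads):
        worker = workerCls(dlQueue, ipQueue, uaQueue, htmlQueue)  # assumed ctor
        worker.daemon = True  # let the main thread exit even if workers block
        worker.start()
    dlQueue.join()  # block until every task has been marked done
    return htmlQueue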
def findCSV():
    httpProxies = commHttpProxies
    headers = commHeaders
    http, ua = getHttpUa()  # candidate proxy and user-agent lists for disguise
    conn, cur = getCursor()  # database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # 2-D array of url entities
    for dl in dlList:
        if ChangeOrNot():  # triggered at random
            httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
            headers = editeHeader(ua, headers)  # rotate the user agent
            time.sleep(random.randint(1, 12))  # random sleep
        url = dl['papercsv']
        if url is not None and len(url) > 15:
            try:
                r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
                if r.status_code == 200:
                    csv_path = file_path + str(dl['id']) + '.csv'
                    with open(csv_path, 'wb') as f:
                        f.write(r.content)
                    print('Now is ' + str(dl['id']))
            except requests.RequestException as e:
                print(e)
                # Retry once with a fresh disguise
                httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
                headers = editeHeader(ua, headers)  # rotate the user agent
                time.sleep(random.randint(1, 12))  # random sleep
                try:
                    r = requests.get(url, proxies=httpProxies, headers=headers, timeout=30)
                    if r.status_code == 200:
                        csv_path = file_path + str(dl['id']) + '.csv'
                        with open(csv_path, 'wb') as f:
                            f.write(r.content)
                        print('Now is ' + str(dl['id']))
                except Exception:
                    print('another try failed! id: ' + str(dl['id']))
        #break  # only run one time
    cur.close()
    conn.close()
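# --- Hedged sketch: getPage() is the fetch primitive used throughout. The
# --- callers treat a single space ' ' as the failure sentinel, so this sketch
# --- returns ' ' on any error; the timeout value matches the requests.get
# --- calls in findCSV() but is otherwise an assumption.
import requests

def getPage(url, httpProxies=None, headers=None, cookies=None):
    try:
        r = requests.get(url, proxies=httpProxies, headers=headers,
                         cookies=cookies, timeout=30)
        if r.status_code == 200:
            return r.text
    except requests.RequestException:
        pass
    return ' '  # the sentinel the callers check for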
def findPage():
    httpProxies = commHttpProxies.copy()
    headers = commHeaders.copy()
    cookies = commCookies.copy()
    # Load the list of urls to visit from the database, then crawl them in turn
    http, ua = getHttpUa()  # candidate proxy and user-agent lists for disguise
    conn, cur = getCursor()  # database connection and cursor
    dlList = getResult(sltDLNotCom, cur)  # 2-D array of url entities
    #for i in range(15):
    #i = 0
    for dl in dlList:
        # Test mode: read a saved page from a local txt file instead of fetching.
        #html = readTXT('E:/Code/Test Data/Hsinchun Chen.txt')
        #html = readTXT('E:/Code/Test Data/Yu Zheng - ACM author profile page.txt')
        #html = readTXT('E:/Code/Test Data/A. Smolic - ACM author profile page.txt')
        if ChangeOrNot():  # triggered at random
            httpProxies = editeProxies(http, httpProxies)  # rotate the http proxy
            headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(2, 12))  # random sleep
        print(str(httpProxies['https']))
        print(str(headers['User-Agent']))
        print(str(headers['Referer']))
        print(str(cookies['CFID']))
        print()
        '''
        html = str(getPage(dl['url'], httpProxies, headers, cookies))  # fetch the page
        if html != ' ':
            infoSet = analysisPage(html, int(dl['id']))  # parse the page
            addInfo(conn, cur, infoSet, dl)  # store the result in the database
            cur.execute('update dlurl1 set status=1 where id=' + str(dl['id']))  # mark as extracted
            conn.commit()
            print('Now is ' + str(dl['id']))
        #break  # only run one time
        '''
    cur.close()
    conn.close()
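# --- Hedged sketch: getCursor() presumably opens the MySQL connection that
# --- the 'update dlurl1 ...' statement above runs against. pymysql and every
# --- connection parameter here are assumptions; DictCursor is inferred from
# --- the dl['id'] / dl['url'] dict-style row access used throughout.
import pymysql

def getCursor():
    conn = pymysql.connect(host='localhost', user='root', password='',
                           db='acm', charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn, conn.cursor()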
def findDL():
    http, ua = getHttpUa(http_path, ua_path)  # needed below by editeProxies/editeHeader
    seedList = readMetrix(seedList_path)
    total = len(seedList)
    print('total is ' + str(total))
    i = 0
    for s in seedList:
        i += 1
        print('now is ' + str(i))
        if ChangeOrNot():  # triggered at random
            editeProxies(http)  # rotate the http proxy
            editeHeader(ua)  # rotate the user agent
            time.sleep(random.randint(1, 10))  # random sleep
        url = findLink(s)
        try:
            writeMetrx(dl_path, [[s[0], url]])
        except Exception:
            print('write error! it is ' + s[0])
            writeMetrx(dl_path, [[s[0], 'error']])
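# --- Hedged sketch: readMetrix/writeMetrx read and write 2-D string arrays.
# --- CSV on disk with utf-8 is an assumption; findDL() above only needs
# --- row[0] on read and append-on-write so repeated calls accumulate rows.
import csv

def readMetrix(path):
    with open(path, 'r', encoding='utf-8', newline='') as f:
        return [row for row in csv.reader(f)]

def writeMetrx(path, rows):
    # Append so each call inside the crawl loop adds its rows to the file.
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)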
def run(self):
    while True:
        httpProxies = commHttpProxies.copy()
        headers = commHeaders.copy()
        cookies = commCookies.copy()
        dl = self.dlQueue.get()
        http = self.ipQueue.get()
        ua = self.uaQueue.get()
        httpProxies['https'] = http
        #ts1 = datetime.datetime.now()
        # Disguise the request parameters
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
            cookies = editeCookies(cookies)
            time.sleep(random.randint(3, 14))  # random sleep
        # Optional: strip the session parameters from the stored link
        dl['colleage'] = cleanURL(dl['colleage'])
        #if '&CFID=716005087&CFTOKEN=29677367' in dl['colleage']:
        #    dl['colleage'] = dl['colleage'].replace('&CFID=716005087&CFTOKEN=29677367', '')
        # Fetch the html
        html = str(getPage(dl['colleage'], httpProxies, headers, cookies))
        # Return the proxy and user agent to their pools
        self.ipQueue.put(http)
        self.uaQueue.put(ua)
        if html == ' ':  # fetch failed, requeue the task
            self.dlQueue.put(dl)
        else:  # only forward pages that were actually fetched
            print('get HTML: ' + str(dl['id']))
            self.htmlQueue.put((html, dl))
        #print('get: ' + str(dl['id']))
        #ts2 = datetime.datetime.now()
        #print('page id:' + str(dl['id']) + ' time:' + str(ts2 - ts1))
        self.dlQueue.task_done()
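# --- Hedged sketch: cleanURL() strips the session parameters from a stored
# --- link, generalising the hard-coded CFID/CFTOKEN replace that is commented
# --- out above so any session values are removed, not just one pair.
import re

def cleanURL(url):
    # Drop CFID / CFTOKEN query parameters, whatever their values.
    return re.sub(r'&(CFID|CFTOKEN)=\d+', '', url)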
for dl in dlList:
    dlQueue.put(dl)
httpProxies = commHttpProxies.copy()
headers = commHeaders.copy()
cookies = commCookies.copy()
while not dlQueue.empty():
    dl = dlQueue.get()
    proxy = random.choice(http)  # pick from the candidate lists without rebinding them
    ua = random.choice(uag)
    httpProxies['https'] = proxy
    #ts1 = datetime.datetime.now()
    # Disguise the request parameters
    if ChangeOrNot():  # triggered at random
        headers = editeHeader(ua, headers, dl['name'])  # rotate the user agent
        cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # random sleep
    # Fetch the html
    html = str(getPage(dl['url'], httpProxies, headers, cookies))
    if html == ' ':  # fetch failed, requeue the task
        dlQueue.put(dl)
        continue
    # Write the html to a file
    try:
        path = dirPath + str(dl['id']) + '.txt'
        writeTXT(path, html)
        print('complete: ' + str(dl['id']))
    except Exception as e:
        print('write error! id: ' + str(dl['id']) + ' ' + str(e))
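# --- Hedged sketch: writeTXT() persists one fetched page per id; utf-8 and
# --- overwrite-on-rewrite are assumptions, mirroring the readTXT test helper
# --- referenced elsewhere in this section.
def writeTXT(path, text):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)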