def getCsvUrl(soup, doc, id):
    url = ' '  # sentinel: ' ' means the link was not found
    try:
        csv = soup.find('csv')
        url = csv.a['href']
    except Exception:
        print('error:' + str(csv))
        path = Dir_path + csvError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return url
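# writeTXT is called throughout this file but defined elsewhere in the
# project. A minimal sketch, assuming it just dumps the raw document to
# disk so failed pages can be inspected later (the utf-8 encoding and the
# str() coercion are assumptions):
def writeTXT(path, content):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(content))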
def getSubject(soup, doc, id):
    url = ' '  # sentinel: ' ' means the link was not found
    stri = soup.find(text='See all subject areas')  # locate the target link text
    if stri != None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + subjError_path + str(id) + '.txt'
            writeTXT(path, doc)
    return url
def getColleagues(soup, doc, id):
    url = ' '  # sentinel: ' ' means the link was not found
    stri = soup.find(text='See all colleagues of this author')  # locate the target link text
    if stri != None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + collError_path + str(id) + '.txt'
            writeTXT(path, doc)
    return url
def getAdvisor(soup, doc, id):
    url = ' '  # sentinel: ' ' means the link was not found
    stri = soup.find(text='Advisor only')  # locate the target link text
    if stri != None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + advError_path + str(id) + '.txt'
            writeTXT(path, doc)
    return url
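# A hypothetical driver for the three extractors above, showing how they
# are meant to be called on one fetched author page. Fetching with
# requests (rather than the project's own getPage) and the name
# collectAuthorLinks are assumptions made for a self-contained example.
def collectAuthorLinks(author_url, author_id):
    import requests
    from bs4 import BeautifulSoup
    doc = requests.get(author_url).text
    soup = BeautifulSoup(doc, 'lxml')
    # each extractor returns the ' ' sentinel when its link is absent
    return {
        'subject': getSubject(soup, doc, author_id),
        'colleagues': getColleagues(soup, doc, author_id),
        'advisor': getAdvisor(soup, doc, author_id),
    }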
def analysisPage(doc, id):
    nameLink = []
    soup = BeautifulSoup(''.join(doc), "lxml")
    a = soup.find('a', {'name': "collab"})
    try:
        divAb = a.parent.parent
        tr = divAb.table.tr
        for td in tr.findAll('td'):
            for div in td.findAll('div'):
                if div.a.string != None:
                    name = cleanName(div.a.string)
                    url = 'http://dl.acm.org/' + div.a['href']
                    url, userID = extractUserID(url)
                    # if checkSame(url, userID) == False:  # only append when not already collected
                    nameLink.append([name, url, userID])
    except Exception:
        print('error:' + str(a))
        path = Dir_path + httpError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return nameLink
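# cleanName and extractUserID are used by analysisPage but defined
# elsewhere in the project. Minimal sketches, assuming cleanName only
# normalizes whitespace and extractUserID reads the numeric id query
# parameter out of an ACM author-page URL; both behaviors are assumptions:
def cleanName(name):
    # collapse runs of whitespace and trim the ends
    return ' '.join(str(name).split())

def extractUserID(url):
    from urllib.parse import urlparse, parse_qs
    # author links look like .../author_page.cfm?id=NNN; return the url
    # unchanged plus the id value, or '' when none is present
    params = parse_qs(urlparse(url).query)
    return url, params.get('id', [''])[0]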
headers = commHeaders.copy()
cookies = commCookies.copy()
while not dlQueue.empty():
    dl = dlQueue.get()
    # pick a proxy into a new name so the proxy pool `http` is not clobbered
    proxy = random.choice(http)
    ua = random.choice(uag)
    httpProxies['https'] = proxy
    # tweak the request parameters
    if ChangeOrNot() == True:  # triggered at random
        headers = editeHeader(ua, headers, dl['name'])  # change the user agent
        cookies = editeCookies(cookies)
    time.sleep(random.randint(5, 20))  # sleep for a random interval
    # fetch the html for this url
    html = str(getPage(dl['url'], httpProxies, headers, cookies))
    if html == ' ':  # fetch failed: re-enqueue and skip the write
        dlQueue.put(dl)
        continue
    # write the page to a file
    try:
        path = dirPath + str(dl['id']) + '.txt'
        writeTXT(path, html)
        print('complete: ' + str(dl['id']))
    except Exception:
        print('error:' + str(dl['id']))
        dlQueue.put(dl)
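# getPage (like ChangeOrNot, editeHeader, and editeCookies above) is not
# shown in this section; in the real project it is defined before the
# download loop runs. A minimal sketch matching the call site, assuming
# it returns the page body on success and the ' ' sentinel the loop
# checks for on any failure; requests is an assumed dependency and the
# actual implementation may differ:
def getPage(url, proxies, headers, cookies):
    import requests
    try:
        resp = requests.get(url, proxies=proxies, headers=headers,
                            cookies=cookies, timeout=30)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        print('getPage error: ' + str(e))
        return ' '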