def linklistappend(self, qualifiedlink):
    """Record *qualifiedlink* in the per-instance and shared link lists.

    Skips links already present in the shared ``glovar.linklist``; new links
    are appended to both lists and persisted via
    ``CrawThread.writetocrawfile``.
    """
    # Use the lock as a context manager so it is released even if append or
    # the file write raises; the original acquire()/release() pair leaked
    # the lock on any exception, stalling every other crawler thread.
    with glovar.linklock:
        if qualifiedlink not in glovar.linklist:
            self.linklist.append(qualifiedlink)
            glovar.linklist.append(qualifiedlink)
            CrawThread.writetocrawfile(qualifiedlink)
def formlistappend(self):
    """Record the form just parsed, keyed by its action link, if unseen.

    Builds ``{current_actionlink: current_forminputlist}``, compares its
    string form (via ``self.transformtostr``) against every form already in
    ``glovar.formlist``, and appends + persists it only when no match is
    found.  Always clears ``self.current_forminputlist`` afterwards.
    """
    formdict = {self.current_actionlink: self.current_forminputlist}
    formstr = self.transformtostr(formdict)
    # Context manager releases glovar.formlock even on error (the original
    # acquire()/release() pair could leak the lock).
    with glovar.formlock:
        # Original used a flag variable and shadowed the builtin `str`;
        # duplicate detection is "same serialized form already recorded".
        is_new = all(self.transformtostr(form) != formstr
                     for form in glovar.formlist)
        if is_new:
            glovar.formlist.append(formdict)
            CrawThread.writetoformfile(formdict)
    # Reset the accumulator for the next form encountered by the parser.
    self.current_forminputlist = []
def hreflistappend(self, attrs):
    """Record the anchor currently being parsed if it is not a duplicate.

    Extracts href data from *attrs*; when non-empty, builds
    ``{current_hreflink: data}`` and appends + persists it unless an entry
    with the same link key and same data keys already exists in
    ``glovar.hreflist``.
    """
    # Original bound the builtin name `dict`; renamed.
    href_data = self.hrefdatadict_from_attrs(attrs)
    if not href_data:
        # Nothing useful on this anchor — original did nothing in this case.
        return
    hrefdict = {self.current_hreflink: href_data}
    # Context manager releases glovar.hreflock even if append/write raises
    # (the original acquire()/release() pair leaked the lock on error).
    with glovar.hreflock:
        is_new = True
        for known in glovar.hreflist:
            # Same link key set, then same set of data keys → duplicate.
            if set(hrefdict) == set(known):
                known_value = known[next(iter(known))]
                href_value = hrefdict[next(iter(hrefdict))]
                if set(known_value) == set(href_value):
                    is_new = False
        if is_new:
            glovar.hreflist.append(hrefdict)
            CrawThread.writetohreffile(hrefdict)
def main(self, link):
    """Fetch *link*, feed its HTML to ``URLParser``, and record the result.

    Publishes the final (possibly redirected) URL through the module-global
    ``linkparse`` and appends it to ``glovar.parsedlist``.  Returns ``None``
    on download/parse failure or when the redirected URL was already parsed.
    """
    global linkparse
    timeout = 20
    self.sleep_download_time = 10
    socket.setdefaulttimeout(timeout)
    url_parser = URLParser(strict=False)
    try:
        time.sleep(self.sleep_download_time)  # throttle before each request
        u = urllib.request.urlopen(link)
        backurl = u.geturl()  # final URL after any redirection
        print("backurl:", backurl)
        # BUG FIX: the original returned from the duplicate branch while
        # still holding glovar.parselock, deadlocking every other thread.
        # Hold the lock only for the check, then return outside it.
        with glovar.parselock:
            already_parsed = backurl in glovar.parsedlist
            if not already_parsed:
                linkparse = backurl
        if already_parsed:
            print("this link has been paresd")
            return None
        the_html = u.read()
        charset = u.info().get_content_charset()
        u.close()
    except socket.timeout as e:
        # BUG FIX: this handler must precede the generic one — socket.timeout
        # is a subclass of Exception, so the original ordering made it
        # unreachable.
        print("socket timout:", link)
        print(e)
        return None
    except Exception as e:
        print(time.strftime("%Y-%m-%d %H:%M:%S"))
        print("fail to access!!!")
        print("urllib.error")
        print(e)
        print("link:" + link + "\n")
        return None
    if not charset:
        # No charset declared by the server: try common encodings in turn.
        try:
            url_parser.feed(the_html.decode('gb2312'))
            print('gb2312')
        except UnicodeDecodeError:
            try:
                url_parser.feed(the_html.decode('utf-8'))
                print('utf-8')
            except UnicodeDecodeError:
                try:
                    url_parser.feed(the_html.decode('GB18030'))
                    print('GB18030')  # e.g. www.sohu.com
                except HTMLParseError as e:
                    print(" HTMLParseError!!!")
                    print(e)
                    print("link:" + linkparse + "\n")
                    return None
                except UnicodeDecodeError as e:
                    print("UnicodeDecodeError")
                    print(e)
                    print("link:" + linkparse + "\n")
                    return None
        except HTMLParseError as e:
            print(" HTMLParseError:")
            print(e)
            print("link:" + linkparse + "\n")
            return None
    else:
        print("charset:" + charset)
        url_parser.feed(the_html.decode(charset))
    if link != linkparse:
        print(link, "redict to: ", linkparse)
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    print("access successfully!!!")
    print(linkparse)
    CrawThread.writetolinkfile(linkparse)
    # Lock released even if append raises (original pair could leak it).
    with glovar.parselock:
        glovar.parsedlist.append(linkparse)
def main(self, link):
    """Fetch *link*, feed its HTML to ``URLParser``, and record the result.

    NOTE(review): this is a byte-for-byte duplicate of the ``main`` defined
    earlier in this file; when both live in the same class, this later
    definition is the one Python binds.  Consider deleting one copy.

    Publishes the final (possibly redirected) URL through the module-global
    ``linkparse`` and appends it to ``glovar.parsedlist``.  Returns ``None``
    on download/parse failure or when the redirected URL was already parsed.
    """
    global linkparse
    timeout = 20
    self.sleep_download_time = 10
    socket.setdefaulttimeout(timeout)
    url_parser = URLParser(strict=False)
    try:
        time.sleep(self.sleep_download_time)  # throttle before each request
        u = urllib.request.urlopen(link)
        backurl = u.geturl()  # final URL after any redirection
        print("backurl:", backurl)
        # BUG FIX: the original returned from the duplicate branch while
        # still holding glovar.parselock, deadlocking every other thread.
        with glovar.parselock:
            already_parsed = backurl in glovar.parsedlist
            if not already_parsed:
                linkparse = backurl
        if already_parsed:
            print("this link has been paresd")
            return None
        the_html = u.read()
        charset = u.info().get_content_charset()
        u.close()
    except socket.timeout as e:
        # BUG FIX: must precede the generic handler — socket.timeout is a
        # subclass of Exception, so the original ordering made this dead code.
        print("socket timout:", link)
        print(e)
        return None
    except Exception as e:
        print(time.strftime("%Y-%m-%d %H:%M:%S"))
        print("fail to access!!!")
        print("urllib.error")
        print(e)
        print("link:" + link + "\n")
        return None
    if not charset:
        # No charset declared by the server: try common encodings in turn.
        try:
            url_parser.feed(the_html.decode('gb2312'))
            print('gb2312')
        except UnicodeDecodeError:
            try:
                url_parser.feed(the_html.decode('utf-8'))
                print('utf-8')
            except UnicodeDecodeError:
                try:
                    url_parser.feed(the_html.decode('GB18030'))
                    print('GB18030')  # e.g. www.sohu.com
                except HTMLParseError as e:
                    print(" HTMLParseError!!!")
                    print(e)
                    print("link:" + linkparse + "\n")
                    return None
                except UnicodeDecodeError as e:
                    print("UnicodeDecodeError")
                    print(e)
                    print("link:" + linkparse + "\n")
                    return None
        except HTMLParseError as e:
            print(" HTMLParseError:")
            print(e)
            print("link:" + linkparse + "\n")
            return None
    else:
        print("charset:" + charset)
        url_parser.feed(the_html.decode(charset))
    if link != linkparse:
        print(link, "redict to: ", linkparse)
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    print("access successfully!!!")
    print(linkparse)
    CrawThread.writetolinkfile(linkparse)
    # Lock released even if append raises (original pair could leak it).
    with glovar.parselock:
        glovar.parsedlist.append(linkparse)