def loop(self, url, next, post=None, cb=None, cc=1, deep=2, debug=0, allow_external=False, link_filter=None, start_now=True, **options):
    """Crawl outward from ``url`` by following the links selected by ``next``.

    The start page is queued at depth 1. Whenever a page finishes loading
    and its depth is below ``deep``, every node returned by ``doc.q(next)``
    is read with ``nodevalue()`` and queued one level deeper, unless it is
    filtered out or has already been queued during this call.

    Args:
        url: Start URL.
        next: Query handed to ``doc.q`` to select the link nodes to follow.
            The name shadows the builtin but is part of the public
            signature, so it is kept.
        post: Optional POST payload; sent with the start request only.
        cb: Optional callback invoked with every loaded document, whether
            or not that document was expanded.
        cc: Value assigned to ``self.downloader.cc`` (presumably the
            number of concurrent connections -- confirm against the
            downloader).
        deep: Documents whose depth is ``>= deep`` are not expanded.
        debug: Unused by this method; kept for backward compatibility.
        allow_external: When true, links on a domain other than the start
            URL's are followed as well. Previously this flag was accepted
            but ignored, so external links were always dropped.
        link_filter: Optional callable invoked as ``link_filter(url=...)``;
            a falsy result skips the link.
        start_now: When true, ``self.downloader.start()`` is called before
            returning.
        **options: Extra keyword arguments forwarded to every ``Request``.
    """
    # A set keeps the "already queued" check O(1); the hashes are only ever
    # used for membership tests inside this call.
    seen = set()
    seen.add(common.md5(url))

    domain = common.get_domain(url).lower()

    def page_loaded(doc):
        depth = doc.req['meta']['deep']
        if depth < deep:
            for node in doc.q(next):
                nexturl = node.nodevalue()

                if not allow_external:
                    # Host names are case-insensitive, so normalise the
                    # link's domain the same way as the start domain.
                    link_domain = (common.get_domain(nexturl) or '').lower()
                    if link_domain != domain:
                        continue

                if link_filter and not link_filter(url=nexturl):
                    continue

                key = common.md5(nexturl)
                if key in seen:
                    continue
                seen.add(key)

                self.downloader.put(Request(
                    url=nexturl,
                    meta=dict(deep=depth + 1),
                    use_cache=True,
                    cb=page_loaded,
                    **options
                ))

        # Let the caller of loop() process every loaded page.
        if cb:
            cb(doc)

    self.downloader.put(Request(
        url=url,
        post=post,
        meta=dict(deep=1),
        use_cache=True,
        cb=page_loaded,
        **options
    ))
    self.downloader.cc = cc
    if start_now:
        self.downloader.start()
def make_key(self, url, post=''):
    """Build the cache key (a file name) for a URL plus optional POST data.

    A truthy ``common.MyDict`` payload is first converted with ``.dict()``;
    a truthy ``dict`` payload is url-encoded with its items sorted, so the
    key does not depend on the order of the dict's entries. Any other
    truthy payload is concatenated to the URL as-is.

    Returns ``common.md5(...)`` of the UTF-8 encoded ``url + payload``
    with ``'.htm'`` appended.
    """
    payload = post
    if payload:
        if isinstance(payload, common.MyDict):
            payload = payload.dict()
        # Truthiness is re-checked because the conversion above may have
        # produced a different (possibly empty) object.
        if payload and isinstance(payload, dict):
            ordered_items = sorted(payload.items())
            payload = urllib.urlencode(ordered_items)

    if not payload:
        payload = ''

    raw = (url + payload).encode('utf8')
    return common.md5(raw) + '.htm'
def make_key(self, url, post=''):
    """Return the cache file name derived from ``url`` and ``post``.

    Normalisation of ``post`` before hashing:
      * a truthy ``common.MyDict`` is turned into its ``.dict()`` form;
      * a truthy ``dict`` is url-encoded from its sorted items, so the
        result does not depend on the order of the dict's entries;
      * a falsy value contributes an empty string.

    The key is ``common.md5`` of the UTF-8 encoded concatenation, followed
    by the ``'.htm'`` extension.
    """
    body = post

    # Normalise the POST payload.
    if body and isinstance(body, common.MyDict):
        body = body.dict()
    if body and isinstance(body, dict):
        body = urllib.urlencode(sorted(body.items()))

    suffix = body if body else ''
    digest = common.md5((url + suffix).encode('utf8'))
    return digest + '.htm'
def page_loaded(doc):
    """Download callback: queue the follow-up links of ``doc``, then notify ``cb``.

    This function relies on names it does not define itself and that are
    expected to come from the enclosing scope: ``self``, ``deep``, ``next``,
    ``domain``, ``link_filter``, ``doneurls``, ``options`` and ``cb``.

    When the document's depth (``doc.req['meta']['deep']``) is below
    ``deep``, each node from ``doc.q(next)`` is read with ``nodevalue()``
    and queued one level deeper, provided that:
      * its domain equals ``domain`` (compared case-insensitively, since
        host names are case-insensitive);
      * ``link_filter`` is unset or returns a truthy value for it;
      * its md5 is not already recorded in ``doneurls``.

    ``cb``, when set, is called with ``doc`` for every loaded page, whether
    or not the page was expanded.
    """
    depth = doc.req['meta']['deep']
    if depth < deep:
        # Normalise once; ``or ''`` guards against a missing domain value.
        wanted_domain = (domain or '').lower()

        for node in doc.q(next):
            nexturl = node.nodevalue()

            link_domain = (common.get_domain(nexturl) or '').lower()
            if link_domain != wanted_domain:
                continue
            if link_filter and not link_filter(url=nexturl):
                continue

            # Hash once instead of once for the test and once for the append.
            key = common.md5(nexturl)
            if key not in doneurls:
                doneurls.append(key)
                req = Request(
                    url=nexturl,
                    meta=dict(deep=depth + 1),
                    use_cache=True,
                    cb=page_loaded,
                    **options
                )
                self.downloader.put(req)

    # Let the loop caller process each loaded page.
    if cb:
        cb(doc)
def append_line(self, filename, line, dedup=False):
    """Append ``line`` followed by ``'\\r\\n'`` to the file named ``filename``.

    The target path is obtained from ``self.join_path(filename)`` and the
    write is delegated to ``common.append_file``.

    Args:
        filename: Name passed to ``self.join_path``.
        line: Text to append; the CRLF terminator is added here.
        dedup: When true, the line is written only if its md5 has not been
            recorded in ``self._data_lines`` yet. That list is created on
            first use and is shared by all file names.

    Writers are serialised through the boolean ``self.writingflag``. The
    flag is now released in a ``finally`` clause: previously any exception
    raised while writing left it set, so every later call spun forever.
    """
    # Busy-wait while another thread is writing.
    # NOTE(review): test-then-set on a plain attribute is not atomic; a
    # threading.Lock created alongside ``writingflag`` would be more robust.
    while self.writingflag:
        pass

    # Hold the flag.
    self.writingflag = True
    try:
        path = self.join_path(filename)

        if dedup:
            if not hasattr(self, '_data_lines'):
                self._data_lines = []
            digest = common.md5(line)
            if digest in self._data_lines:
                return  # already written once; ``finally`` frees the flag
            self._data_lines.append(digest)

        common.append_file(path, line + '\r\n')
    finally:
        # Free the flag even when an error occurred.
        self.writingflag = False
def append_line(self, filename, line, dedup=False):
    """Write ``line`` plus a CRLF terminator to the end of a data file.

    ``self.join_path(filename)`` supplies the path and
    ``common.append_file`` performs the write.

    Args:
        filename: Name resolved through ``self.join_path``.
        line: Text to append (without line terminator).
        dedup: If true, skip the write when the md5 of ``line`` is already
            present in ``self._data_lines``; otherwise record it there.
            The list is created lazily and is not keyed by file name.

    ``self.writingflag`` acts as a simple busy-wait guard between writers.
    It is reset in ``finally`` so that a failing write cannot leave the
    flag set and block all subsequent callers in the wait loop.
    """
    # Wait while another thread is writing.
    # NOTE(review): this flag check is not atomic; consider a real lock
    # owned by the object if strict mutual exclusion is required.
    while self.writingflag:
        pass

    self.writingflag = True  # hold the flag
    try:
        path = self.join_path(filename)
        should_write = True

        if dedup:
            if not hasattr(self, '_data_lines'):
                self._data_lines = []
            line_hash = common.md5(line)
            if line_hash in self._data_lines:
                should_write = False
            else:
                self._data_lines.append(line_hash)

        if should_write:
            common.append_file(path, line + '\r\n')
    finally:
        self.writingflag = False  # free the flag, even on error