def parse(self, response):
    """Build and return one item describing the fetched response.

    The item records the response URL ('inurl'), the lower-cased netloc of
    that URL ('site') and the lower-cased suffix of the URL path with its
    first character removed ('ext'). Whatever ``parsers.get_content``
    returns for the response is then merged into the item via ``update``.
    """
    print("[*] open %s" % response.url)

    parsed = urlparse(response.url)
    suffix = splitext(parsed.path)[1]

    item = AllCrawl2Item()
    item['inurl'] = response.url
    item['site'] = parsed.netloc.lower()
    # suffix[1:] drops the first character of the suffix (presumably the dot).
    item['ext'] = suffix[1:].lower()

    extracted = parsers.get_content(response, item)
    item.update(extracted)
    return item
def parse(self, response):
    """Yield each item ``parsers.get_content`` produces for the response body.

    A template item carrying the response URL ('inurl'), the lower-cased
    netloc ('site') and the lower-cased path suffix without its first
    character ('ext') is handed to ``parsers.get_content`` together with
    ``response.body``; everything that call iterates over is yielded as is.
    """
    banner = "[+] open %s" % (response.url,)
    # Trailing comma: Python 2 print-statement form that keeps the cursor on
    # the current line; the final print below terminates that line.
    print(colorama.Fore.GREEN + banner + colorama.Fore.RESET),

    parsed = urlparse(response.url)
    template = AllCrawl2Item()
    template['inurl'] = response.url
    template['site'] = parsed.netloc.lower()
    template['ext'] = splitext(parsed.path)[1][1:].lower()

    for found in parsers.get_content(response.body, template):
        yield found

    print('')
def parse(self, response):
    """Build and return one item for a response, with the site set to 'local'.

    The item records the response URL ('inurl'), the constant 'local'
    ('site') and the lower-cased suffix of the URL path with its first
    character removed ('ext'). The result of ``parsers.get_content`` for
    ``response.body`` is merged into the item before it is returned.
    """
    banner = "[+] open %s" % (response.url,)
    # Trailing comma: Python 2 print-statement form that keeps the cursor on
    # the current line; the later print('') terminates that line.
    print(colorama.Fore.GREEN + banner + colorama.Fore.RESET),

    url_path = urlparse(response.url).path

    item = AllCrawl2Item()
    item['inurl'] = response.url
    item['site'] = 'local'
    item['ext'] = splitext(url_path)[1][1:].lower()

    extracted = parsers.get_content(response.body, item)
    item.update(extracted)

    print('')
    return item
def parse(self, response):
    """Handle a response as either a JSON listing or a single document.

    An item is prepared with the response URL ('inurl'), the lower-cased
    netloc ('site') and the lower-cased path suffix without its first
    character ('ext').

    When ``split(response.url)[1]`` is empty, the body is decoded with
    ``json.loads`` and iterated: every entry is appended (followed by a
    space) to the item's 'intext' field and requested via ``Request``; the
    item itself is yielded after the last request. Otherwise the result of
    ``parsers.get_content(response, item)`` is yielded.
    """
    print("[*] open %s" % response.url)

    item = AllCrawl2Item()
    item['inurl'] = response.url
    item['site'] = urlparse(response.url).netloc.lower()
    item['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()

    if not split(response.url)[1]:
        # Empty tail: assuming ``split`` is os.path.split, the URL ends with
        # a slash, i.e. this is a listing rather than a single file.
        item["intext"] = ''
        for message in json.loads(response.body):
            # Use the loop variable; the previous code concatenated an
            # undefined name here and raised NameError on the first entry.
            # NOTE(review): entries are presumably plain strings - confirm.
            item["intext"] += message + ' '
            yield Request(message)
        yield item
    else:
        # Non-empty tail: the response is handed to the content parser.
        yield parsers.get_content(response, item)
def parse(self, response):
    """Follow the entries of a JSON listing, or extract content from a file.

    When ``split(response.url)[1]`` is non-empty, an item carrying the
    response URL ('inurl'), the lower-cased netloc ('site') and the
    lower-cased path suffix without its first character ('ext') is passed to
    ``parsers.get_content`` along with the response, and the result is
    yielded.

    Otherwise the body is decoded with ``json.loads`` and each entry is
    inspected: a 'filetype' of 'd' is requested with a trailing slash added
    to its 'filename', a 'filetype' of '-' is requested under its 'filename'
    unchanged, and any other type is only printed. Every request carries
    ``self.ftp_user`` and ``self.ftp_password`` in its meta.
    """
    print("[*] open %s" % response.url)

    if split(response.url)[1]:
        item = AllCrawl2Item()
        item['inurl'] = response.url
        parsed = urlparse(response.url)
        item['site'] = parsed.netloc.lower()
        item['ext'] = splitext(parsed.path)[1][1:].lower()
        yield parsers.get_content(response, item)
        return

    for entry in json.loads(response.body):
        kind = entry['filetype']
        name = entry['filename']
        print('[debug] %s' % (name + '/' if kind == 'd' else name))

        if kind == 'd':
            target = name + '/'
        elif kind == '-':
            target = name
        else:
            # Neither 'd' nor '-': nothing is requested for this entry.
            continue

        yield Request(
            response.urljoin(target),
            meta={
                'ftp_user': self.ftp_user,
                'ftp_password': self.ftp_password,
            },
        )
def parse(self, response):
    """Request every entry of a JSON listing, or yield items parsed from a file.

    A template item with the response URL ('inurl'), the lower-cased netloc
    ('site') and the lower-cased path suffix without its first character
    ('ext') is always built first.

    When ``split(response.url)[1]`` is non-empty, every item that
    ``parsers.get_content(response.body, template)`` iterates over is
    yielded. Otherwise the body is decoded with ``json.loads`` and a
    ``Request`` for ``response.url + entry`` is yielded for each entry.
    """
    banner = "[+] open %s" % (response.url,)
    # Trailing comma: Python 2 print-statement form that keeps the cursor on
    # the current line; the final print below terminates that line.
    print(colorama.Fore.GREEN + banner + colorama.Fore.RESET),

    parsed = urlparse(response.url)
    template = AllCrawl2Item()
    template['inurl'] = response.url
    template['site'] = parsed.netloc.lower()
    template['ext'] = splitext(parsed.path)[1][1:].lower()

    if split(response.url)[1]:
        for found in parsers.get_content(response.body, template):
            yield found
    else:
        for name in json.loads(response.body):
            yield Request(response.url + name)

    # NOTE(review): assumed to run after either branch - confirm placement.
    print('')