def produce(self):
    # the mongodb handle must be created after fork when using multiprocessing
    self.mongo_handle = MongoUtils(db=self.__mongo_db)
    if not self.redis_handle.connected or not self.mongo_handle.connected:
        logger.error('no redis/mongodb connection found! exit.')
        return
    while True:
        try:
            _, req = self.redis_handle.fetch_one_result()
            with self.context['lock']:
                self.context['result_counts'].value -= 1
            logger.debug('got req, %d results left' % self.context['result_counts'].value)
            self.proc_req(req)
        except Exception:
            logger.exception('produce exception!')
            if not self.redis_handle.connected:
                logger.error('redis disconnected! reconnecting...')
                self.redis_handle.connect()
            if not self.mongo_handle.connected:
                logger.error('mongodb disconnected! reconnecting...')
                self.mongo_handle.connect()
            time.sleep(10)
        finally:
            with self.context['lock']:
                # when no pending results, no running spiders and no queued tasks
                # remain, signal the main process that the crawl is finished
                if self.context['result_counts'].value == 0:
                    if self.context['live_spider_counts'].value == 0 \
                            and self.context['task_counts'].value == 0:
                        self.context['task_done'].set()
def consume(self):
    if not self.redis_handle.connected:
        logger.error('no redis connection found in consumer! exit.')
        return
    while True:
        try:
            url = self.redis_handle.fetch_one_task()
            with self.context['lock']:
                self.context['live_spider_counts'].value += 1
                self.context['task_counts'].value -= 1
            logger.info('got task url: %s' % url)
            logger.info('%d tasks left' % self.context['task_counts'].value)
            if not self.redis_handle.is_blocked(URL(url)):
                self.start_spider(url, self.__cookie_file)
        except Exception:
            logger.exception('consumer exception!')
            if not self.redis_handle.connected:
                logger.error('redis disconnected! reconnecting...')
                self.redis_handle.connect()
            time.sleep(10)
        finally:
            with self.context['lock']:
                self.context['live_spider_counts'].value -= 1
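# The shared ``context`` dict used by produce()/consume() above is created
# elsewhere in the project. Below is a minimal sketch of how it could be built
# with multiprocessing primitives: the key names are taken from the code above,
# but the construction itself is an assumption, not the project's actual setup.
from multiprocessing import Lock, Value, Event

tspider_context = {
    'lock': Lock(),                    # guards the shared counters
    'task_counts': Value('i', 0),      # urls waiting in the task queue
    'result_counts': Value('i', 0),    # spider results waiting to be processed
    'live_spider_counts': Value('i', 0),  # casperjs instances currently running
    'task_done': Event(),              # set when the whole crawl has drained
}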
def proc_req(self, req):
    try:
        data = json.loads(req)
    except ValueError:
        logger.exception('json loads req error: %s' % req)
        return
    urlstring = data.get('url', '')
    if not urlstring:
        logger.error('empty url found!')
        return
    url = URL(urlstring)
    method = data.get('method', '')
    if not method:
        logger.error('no method found!')
        return
    # save to mongodb, enriched with the normalized url parts
    data.update({
        'pattern': url.pattern,
        'hostname': url.hostname,
        'domain': url.domain
    })
    target = self.redis_handle.is_target(url)
    if not self.redis_handle.is_url_saved(method, url):
        logger.debug('redis saved pattern not found!')
        self.mongo_handle.save(data, is_target=target)
        self.redis_handle.set_url_saved(method, url)
    else:
        logger.debug('redis saved pattern found!')
    if not target:
        logger.debug('%s is not target' % url.hostname)
        return
    # TODO: support POST requests
    if method == 'POST':
        logger.debug('POST not supported yet')
    elif method == 'GET':
        # new host found, add its index page to the task queue
        if self.redis_handle.get_hostname_reqcount(url.hostname) == 0:
            self.create_task_from_url(URL(url.index_page), add_whitelist=False)
        # url validation is checked inside create_task_from_url
        self.create_task_from_url(url, add_whitelist=False)
    else:
        # neither GET nor POST
        logger.error('HTTP verb %s found!' % method)
        logger.debug(data)
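# For reference, a sketch of the JSON payload proc_req() consumes. The field
# names (method/url/postdata/headers/type) are the ones the spider emits and
# proc_req() reads; the concrete values and the ``producer`` object are purely
# illustrative.
import json

sample_req = json.dumps({
    'method': 'GET',
    'url': 'http://example.com/index.php?id=1',
    'postdata': '',
    'headers': {'Referer': 'http://example.com/'},
    'type': 'xhr'
})
# producer.proc_req(sample_req)  # would persist the request and queue follow-up tasks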
    'mongo_db': args.mongo_db
}

for _ in range(args.consumer):
    worker = Consumer(**kwargs).consume
    proc = Process(name='consumer-%d' % _, target=worker)
    proc.daemon = True
    proc.start()

for _ in range(args.producer):
    worker = Producer(**kwargs).produce
    proc = Process(name='producer-%d' % _, target=worker)
    proc.daemon = True
    proc.start()

if not args.keepon:
    redis_handle.flushdb()
redis_handle.save_startup_params(args)

target = args.url or args.file
producer = Producer(**kwargs)
if isinstance(target, basestring):
    url = URL(target)
    if not url.valid or url.blocked:
        logger.error('not a valid url, exit.')
        sys.exit(-1)
    producer.create_task_from_url(url)
# otherwise target is a file object
else:
    producer.create_task_from_file(target)
redis_handle.close()

tspider_context['task_done'].wait()
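# The ``args`` namespace above comes from the project's argument parser, which
# is not part of this snippet. A hypothetical parser producing the attributes
# used above (url, file, consumer, producer, mongo_db, keepon) might look like
# the following; the flag spellings and defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description='tspider startup options (sketch)')
parser.add_argument('--url', help='start url to seed the task queue')
parser.add_argument('--file', type=argparse.FileType('r'),
                    help='file with one url per line')
parser.add_argument('--consumer', type=int, default=5,
                    help='number of consumer (spider) processes')
parser.add_argument('--producer', type=int, default=1,
                    help='number of producer (result-processing) processes')
parser.add_argument('--mongo-db', dest='mongo_db', default='tspider',
                    help='mongodb database name')
parser.add_argument('--keepon', action='store_true',
                    help='keep existing redis data and resume the previous run')
args = parser.parse_args()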
def spider(self):
    if not self._url:
        logger.info('incorrect url format found!')
        return []
    # fptr, spiderfile = tempfile.mkstemp()
    if not os.path.exists(TMPDIR_PATH):
        os.mkdir(TMPDIR_PATH)
    spiderfile = os.path.join(TMPDIR_PATH, uuid.uuid4().hex)
    crawler_file = os.path.join(SPIDER_PATH, 'casper_crawler.js')
    command = 'casperjs --ignore-ssl-errors=true --ssl-protocol=any ' \
              '{cmd} "{url}" --output="{file}"'.format(cmd=crawler_file, url=self._url, file=spiderfile)
    if self._cookie_file:
        command += ' --cookie={0}'.format(self._cookie_file)
    try:
        proc = subprocess.Popen(command, shell=True)
        start = datetime.now()
        while proc.poll() is None:
            time.sleep(1)
            now = datetime.now()
            if (now - start).seconds > CASPERJS_TIMEOUT:
                os.kill(proc.pid, signal.SIGKILL)
                os.waitpid(-1, os.WNOHANG)
                logger.error('casperjs execution timeout. killed.')
                break
    except Exception:
        logger.exception('casperjs execution failed!')
    if not os.path.exists(spiderfile):
        logger.error('no spider result file found!')
        return []

    fingerprints = []
    # with os.fdopen(fptr) as f:
    with open(spiderfile) as f:
        for line in f:
            line = line.strip()
            try:
                request = json.loads(line)
                method = request['method'].upper()
                url = request['url']
                postdata = request.get('postData', '')
                type_ = request['type']
                headers = {}
                for header in request['headers']:
                    headers[header['name']] = header['value']
                # drop headers that vary per request or will be regenerated on replay
                headers.pop('Content-Length', '')
                headers.pop('User-Agent', '')
                headers.pop('Accept', '')
            except (KeyError, ValueError):
                logger.exception('json.loads failed!')
                continue
            # deduplicate requests by a method|url fingerprint
            fp = '%s|%s' % (method, url)
            if fp in fingerprints:
                continue
            fingerprints.append(fp)
            data = {
                'method': method,
                'url': url,
                'postdata': postdata,
                'headers': headers,
                'type': type_
            }
            # print json.dumps(data)
            self._results.append(json.dumps(data))
    os.unlink(spiderfile)
    if self._outfile:
        with open(self._outfile, 'w') as f:
            for url in self._results:
                f.write(url + '\n')
    return self._results
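# The casperjs crawler writes one JSON object per line to ``spiderfile``. The
# parsing loop above expects roughly the shape below: headers as a list of
# name/value pairs, ``postData`` optional. The concrete values here are
# illustrative, not captured output.
import json

sample_line = json.dumps({
    'method': 'GET',
    'url': 'http://example.com/login',
    'postData': '',
    'type': 'Document',
    'headers': [
        {'name': 'Referer', 'value': 'http://example.com/'},
        {'name': 'Accept', 'value': '*/*'},
    ]
})
request = json.loads(sample_line)
assert request['headers'][0]['name'] == 'Referer'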