Example #1
    def produce(self):
        # MongoDB with multiprocessing must be initialized after fork
        self.mongo_handle = MongoUtils(db=self.__mongo_db)
        if not self.redis_handle.connected or not self.mongo_handle.connected:
            logger.error('no redis/mongodb connection found! exit.')
            return

        while True:
            try:
                _, req = self.redis_handle.fetch_one_result()
                with self.context['lock']:
                    self.context['result_counts'].value -= 1
                logger.debug('got req, %d results left' %
                             self.context['result_counts'].value)
                self.proc_req(req)
            except Exception:
                logger.exception('produce exception!')
                if not self.redis_handle.connected:
                    logger.error('redis disconnected! reconnecting...')
                    self.redis_handle.connect()
                if not self.mongo_handle.connected:
                    logger.error('mongodb disconnected! reconnecting...')
                    self.mongo_handle.connect()
                time.sleep(10)
            finally:
                with self.context['lock']:
                    if (self.context['result_counts'].value == 0
                            and self.context['live_spider_counts'].value == 0
                            and self.context['task_counts'].value == 0):
                        self.context['task_done'].set()
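All of these workers coordinate through a shared self.context dict, whose construction the examples never show. A minimal sketch of how such a context could be built with multiprocessing primitives, using the key names that appear in the code above (the construction itself is an assumption):

import multiprocessing

def build_context():
    # Shared counters, a lock guarding them, and a completion event --
    # matching the keys used by produce()/consume() (hypothetical helper).
    return {
        'lock': multiprocessing.Lock(),
        'result_counts': multiprocessing.Value('i', 0),
        'task_counts': multiprocessing.Value('i', 0),
        'live_spider_counts': multiprocessing.Value('i', 0),
        'task_done': multiprocessing.Event(),
    }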
Example #2
    def consume(self):
        if not self.redis_handle.connected:
            logger.error('no redis connection found in consumer! exit.')
            return
        while True:
            spider_started = False
            try:
                url = self.redis_handle.fetch_one_task()
                with self.context['lock']:
                    self.context['live_spider_counts'].value += 1
                    self.context['task_counts'].value -= 1
                spider_started = True
                logger.info('got task url: %s' % url)
                logger.info('%d tasks left' %
                            self.context['task_counts'].value)
                if not self.redis_handle.is_blocked(URL(url)):
                    self.start_spider(url, self.__cookie_file)
            except Exception:
                logger.exception('consumer exception!')
                if not self.redis_handle.connected:
                    logger.error('redis disconnected! reconnecting...')
                    self.redis_handle.connect()
                time.sleep(10)
            finally:
                # undo the increment only if it happened, so a failed
                # fetch cannot drive the counter negative
                if spider_started:
                    with self.context['lock']:
                        self.context['live_spider_counts'].value -= 1
Example #3
    def proc_req(self, req):
        try:
            data = json.loads(req)
        except (TypeError, ValueError):
            logger.exception('json loads req error: %s' % req)
            return
        urlstring = data.get('url', '')
        if not urlstring:
            logger.error('empty url found!')
            return
        url = URL(urlstring)

        method = data.get('method', '')
        if not method:
            logger.error('no method found!')
            return
        # save to mongodb
        data.update({
            'pattern': url.pattern,
            'hostname': url.hostname,
            'domain': url.domain
        })
        target = self.redis_handle.is_target(url)

        if not self.redis_handle.is_url_saved(method, url):
            logger.debug('redis saved pattern not found!')
            self.mongo_handle.save(data, is_target=target)
            self.redis_handle.set_url_saved(method, url)
        else:
            logger.debug('redis saved pattern found!')

        if not target:
            logger.debug('%s is not target' % url.hostname)
            return

        # TODO: POST requests are not supported yet
        if method == 'POST':
            logger.debug('POST not supported yet')
        elif method == 'GET':
            # new host found, add index page to task queue
            if self.redis_handle.get_hostname_reqcount(url.hostname) == 0:
                self.create_task_from_url(URL(url.index_page),
                                          add_whitelist=False)
            # check url validation inside create_url_task
            self.create_task_from_url(url, add_whitelist=False)
        else:
            # neither GET nor POST
            logger.error('unexpected HTTP verb %s found!' % method)
            logger.debug(data)
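proc_req expects each queued result to be a JSON object carrying at least url and method; the shape matches the records emitted by spider() in Example #5 below. A hypothetical payload for illustration (all field values are made up):

import json

# Hypothetical request record as proc_req() expects it; 'pattern',
# 'hostname' and 'domain' are added later from the URL object.
req = json.dumps({
    'method': 'GET',
    'url': 'http://example.com/index.php?id=1',
    'postdata': '',
    'headers': {'Referer': 'http://example.com/'},
    'type': 'xhr'
})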
Example #4
        'mongo_db': args.mongo_db
    }
    for _ in range(args.consumer):
        worker = Consumer(**kwargs).consume
        proc = Process(name='consumer-%d' % _, target=worker)
        proc.daemon = True
        proc.start()
    for _ in range(args.producer):
        worker = Producer(**kwargs).produce
        proc = Process(name='producer-%d' % _, target=worker)
        proc.daemon = True
        proc.start()

    if not args.keepon:
        redis_handle.flushdb()
        redis_handle.save_startup_params(args)
        target = args.url or args.file
        producer = Producer(**kwargs)
        if isinstance(target, basestring):
            url = URL(target)
            if not url.valid or url.blocked:
                logger.error('invalid url, exit.')
                sys.exit(-1)
            producer.create_task_from_url(url)
        # file object
        else:
            producer.create_task_from_file(target)

    redis_handle.close()
    tspider_context['task_done'].wait()
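The entry point reads several command-line options (args.consumer, args.producer, args.keepon, args.url, args.file, args.mongo_db). A plausible argparse definition reconstructed from those usages; the flag spellings, defaults, and help strings are assumptions:

import argparse

parser = argparse.ArgumentParser(description='tspider entry point (reconstructed)')
parser.add_argument('-u', '--url', help='start url')
parser.add_argument('-f', '--file', type=argparse.FileType('r'),
                    help='file with one start url per line')
parser.add_argument('--consumer', type=int, default=5,
                    help='number of consumer processes')
parser.add_argument('--producer', type=int, default=2,
                    help='number of producer processes')
parser.add_argument('--keepon', action='store_true',
                    help='resume a previous run instead of flushing redis')
parser.add_argument('--mongo-db', dest='mongo_db', default='tspider',
                    help='mongodb database name')
args = parser.parse_args()

Passing --url yields a string target, while --file yields a file object, which is consistent with the isinstance(target, basestring) branch above.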
Example #5
    def spider(self):
        if not self._url:
            logger.info('incorrect url format found!')
            return []

        # write crawler output to a uniquely named file under TMPDIR_PATH
        if not os.path.exists(TMPDIR_PATH):
            os.mkdir(TMPDIR_PATH)
        spiderfile = os.path.join(TMPDIR_PATH, uuid.uuid4().hex)
        crawler_file = os.path.join(SPIDER_PATH, 'casper_crawler.js')
        command = 'casperjs --ignore-ssl-errors=true --ssl-protocol=any ' \
                  '{cmd} "{url}" --output="{file}"'.format(cmd=crawler_file, url=self._url, file=spiderfile)
        if self._cookie_file:
            command += ' --cookie={0}'.format(self._cookie_file)
        try:
            proc = subprocess.Popen(command, shell=True)
            start = datetime.now()
            while proc.poll() is None:
                time.sleep(1)
                now = datetime.now()
                if (now - start).total_seconds() > CASPERJS_TIMEOUT:
                    os.kill(proc.pid, signal.SIGKILL)
                    os.waitpid(-1, os.WNOHANG)
                    logger.error('casperjs execution timeout. killed.')
                    break
        except Exception:
            logger.exception('casperjs execution failed!')

        if not os.path.exists(spiderfile):
            logger.error('no spider result file found!')
            return []

        fingerprints = set()
        with open(spiderfile) as f:
            for line in f:
                line = line.strip()
                try:
                    request = json.loads(line)
                    method = request['method'].upper()
                    url = request['url']
                    postdata = request.get('postData', '')
                    type_ = request['type']
                    headers = {}
                    for header in request['headers']:
                        headers[header['name']] = header['value']
                    headers.pop('Content-Length', '')
                    headers.pop('User-Agent', '')
                    headers.pop('Accept', '')
                except (ValueError, KeyError):
                    logger.exception('failed to parse request line!')
                    continue

                # skip requests already seen (method|url fingerprint)
                fp = '%s|%s' % (method, url)
                if fp in fingerprints:
                    continue
                fingerprints.add(fp)

                data = {
                    'method': method,
                    'url': url,
                    'postdata': postdata,
                    'headers': headers,
                    'type': type_
                }
                self._results.append(json.dumps(data))
        os.unlink(spiderfile)
        if self._outfile:
            with open(self._outfile, 'w') as f:
                for url in self._results:
                    f.write(url + '\n')
        return self._results
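The poll-and-kill loop in spider() is the usual Python 2 workaround for subprocess lacking a timeout argument. A standalone sketch of the same pattern (the helper name is ours):

import os
import signal
import subprocess
import time
from datetime import datetime

def run_with_timeout(command, timeout):
    # Poll the child once a second and SIGKILL it past the deadline,
    # mirroring the loop in spider() (hypothetical helper).
    proc = subprocess.Popen(command, shell=True)
    start = datetime.now()
    while proc.poll() is None:
        time.sleep(1)
        if (datetime.now() - start).total_seconds() > timeout:
            os.kill(proc.pid, signal.SIGKILL)
            os.waitpid(-1, os.WNOHANG)
            return None
    return proc.returncode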