Example #1
def process_24_network(net, port):
    q = JoinableQueue()
    r = JoinableQueue()
    gevent.spawn(prepare_list, q, net)

    tasks = []
    for x in range(0, CONCURRENT_GROUPS):
        #print "spawning %i" % x
        tasks += [gevent.spawn(scan_network, q, r, port)]

    q.join()
    gevent.joinall(tasks)

    if not r.empty():
        with open(str(net.ip) + '_' + str(port) + ".m3u", "w+") as f:
            f.write("#EXTM3U\n")
            while not r.empty():
                try:
                    group = r.get(timeout=10)
                    f.write(
                        '#EXTINF:-1 tvg-logo="" tvg-name="" group-title="",ChannelName'
                        + "\n")
                    f.write('udp://@' + str(group) + ':' + str(port) + "\n")
                    logging.info("Ok ====> %s" % group)
                except gevent.queue.Empty:
                    break
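
A minimal, self-contained sketch of the fan-out pattern Example #1 uses (one producer fills a JoinableQueue, a fixed number of worker greenlets drain it, and q.join() blocks until everything has been processed). The names here are illustrative, not the project's helpers:

import gevent
from gevent.queue import JoinableQueue

def worker(q, results):
    while True:
        item = q.get()                  # blocks until an item is available
        try:
            results.append(item * 2)    # stand-in for the real work (a scan, a download, ...)
        finally:
            q.task_done()               # required, otherwise q.join() never returns

q, results = JoinableQueue(), []
for item in range(20):                  # fill the queue up front
    q.put(item)
workers = [gevent.spawn(worker, q, results) for _ in range(4)]
q.join()                                # returns once every item has been task_done()'d
gevent.killall(workers)                 # the workers loop forever, so stop them explicitly
print(len(results))                     # -> 20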
Example #2
def get_movie_id():
    baidu_tool = MysqlCurd('douban_movie')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('movie_name', [{'version': 0}, ['name']])
    q = JoinableQueue()
    for temp in result:
        if not baidu_tool.query_mysql_condition('name_id', [{'movie_name': temp[0]}, ['movie_id']]):
            q.put(temp[0])
    baidu_tool.close_connect()
    error_q = JoinableQueue()

    def crawl(time):
        while not q.empty():
            tool = MysqlCurd('douban_movie')
            tool.connect_mysql()
            name = q.get()
            try:
                page = super_downloader('https://movie.douban.com/subject_search?', params={'search_text': name},
                                        cookies=True, proxy=True)
            except requests.exceptions.RequestException:
                print('get movie id ' + name + ' download error!')
                return False
            page = etree.HTML(page)
            gevent.sleep(random.uniform(time[0], time[1]))
            try:
                count = 0
                count1 = 0
                for _ in page.xpath('//*[@id="content"]/div/div[1]/div[2]/table[@width="100%"]'):
                    try:
                        mark = _.xpath('tr/td[2]/div')[0]
                        id = mark.xpath('a')[0].get('href')[33:-1]
                        _name = mark.xpath('a')[0].text.split('/')[0].strip()
                        # score = mark.xpath('div/span[2]')[0].text
                        # comment_num = mark.xpath('div/span[3]')[0].text[1:-4]
                        tool.replace_mysql('name_id', {'movie_id': id, 'movie_name': _name})
                        count1 += 1
                        print('get movie id ' + _name + ' completed!!!')
                    except IndexError as e:
                        print('get movie id sub error!!!'+repr(e))
                        continue
                    count += 1
                    if count == 3:
                        break
                if count1>0:
                    # tool.replace_mysql('movie_name', {'version': 1, 'name': name})
                    tool.close_connect()
                print('get movie id ' + name + ' completed!')
            except Exception as e:
                error_q.put(name)
                print('get movie id ' + name + ' error!')
                print(e)
    worker = SleepFunction()
    worker.run(crawl)
    with open('errorlist//movie_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #3
def get_person_info():
    baidu_tool = MysqlCurd('douban_person')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('person_name_id', [{
        'version': 0
    }, ['person_id', 'person_name']])
    print(result)
    print(len(result))
    q = JoinableQueue()
    for _ in result:
        if not baidu_tool.query_mysql_condition('person_info',
                                                [{
                                                    'person_id': _[0]
                                                }, ['person_name']]):
            q.put(_)
    error_q = JoinableQueue()
    baidu_tool.close_connect()

    def temp(param):
        while not q.empty():
            i = q.get()
            p = Person(id=i[0], name=i[1])
            flag = p.analysis_person_info()
            if flag:
                name_id_tool = MysqlCurd('douban_person')
                name_id_tool.connect_mysql()
                name_id_tool.replace_mysql('person_name_id', {
                    'person_id': p.id,
                    'person_name': p.name,
                    'version': 1
                })
                name_id_tool.close_connect()
            else:
                error_q.put((p.id, p.name))

    worker = SleepFunction()
    worker.run(temp)
    with open('errorlist//person_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #4
def get_movie_info(name=None):
    q = JoinableQueue()
    tool = MysqlCurd('douban_movie')
    tool.connect_mysql()
    if name:
        try:
            movie_id = tool.query_mysql_condition('name_id', [{'movie_name': name}, ['movie_id']])[0][0]
            q.put((movie_id, name))
        except IndexError:
            print('no id!')
    else:
        result = tool.query_mysql_condition('name_id', [{'version': 0}, ['movie_id', 'movie_name']])
        for temp in result:
            if not tool.query_mysql_condition('movie_info', [{'movie_id': temp[0]}, ['movie_name']]):
                q.put(temp)
    tool.close_connect()
    error_q = JoinableQueue()

    def temp(time):
        while not q.empty():
            data = q.get()
            m = Movie(data[0], data[1])
            try:
                print('analysis movie info ' + data[1] + ' started')  # show on the console which movie is being processed
                m.analysis_movie_info()
                gevent.sleep(random.uniform(time[0], time[1]))
            except Exception as e:
                print(e)
                print('analysis movie info ' + data[1] + ' error')
                error_q.put(data[1])
            m.tool.close_connect()
            print(len(q), 'remain!')
    worker = SleepFunction()
    worker.run(temp)
    with open('errorlist//movie_info.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #5
File: massget.py Project: beched/hehdirb
class MassGet(FastGet):
    def __init__(self, urls, dic, threads=10, report_db=False, keepalive=None, each_threads=10):
        self.dic = dic
        self.report_db = report_db
        self.table = None
        if report_db:
            self.sql_conn(report_db)
        self.keepalive = keepalive
        self.each_threads = each_threads
        self.queue = JoinableQueue()
        [self.queue.put(x.strip()) for x in urls]
        [spawn(self.worker) for _ in xrange(threads)]
        self.queue.join()

    def worker(self):
        while not self.queue.empty():
            url = self.queue.get()
            try:
                FastGet(url, self.dic, self.each_threads, self.report_db, self.keepalive, self.table)
            except Exception as e:
                logging.error('Worker global exception for %s: %s' % (url, e))
            finally:
                self.queue.task_done()
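
MassGet's workers loop on while not self.queue.empty(), which is safe here only because the queue is fully populated in __init__ before the workers start. A hedged alternative sketch that does not depend on that ordering, using a blocking get with a timeout and the Empty exception (illustrative names, no real scanning):

import gevent
from gevent.queue import JoinableQueue, Empty

def worker(q):
    while True:
        try:
            url = q.get(timeout=1)              # give up after one idle second
        except Empty:
            return                              # queue drained, worker exits cleanly
        try:
            print('would scan %s here' % url)   # placeholder for FastGet(url, ...)
        finally:
            q.task_done()

q = JoinableQueue()
for u in ('http://a.example', 'http://b.example', 'http://c.example'):
    q.put(u)
tasks = [gevent.spawn(worker, q) for _ in range(2)]
q.join()
gevent.joinall(tasks)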
Example #6
def get_person_id():
    baidu_tool = MysqlCurd('douban_person')
    baidu_tool.connect_mysql()
    result = baidu_tool.query_mysql_condition('person_name', [{
        'version': 0
    }, ['name']])
    print(result)
    print(len(result))
    q = JoinableQueue()
    for _ in result:
        if not baidu_tool.query_mysql_condition('person_name_id',
                                                [{
                                                    'person_name': _[0]
                                                }, ['person_id']]):
            q.put(_[0].strip('\n'))
    error_q = JoinableQueue()

    def crawl(param):
        while not q.empty():
            tool = MysqlCurd('douban_person')
            tool.connect_mysql()
            name = q.get()
            try:
                result = super_downloader(
                    'https://movie.douban.com/subject_search?',
                    params={'search_text': name},
                    proxy=True,
                    cookies=True)
                gevent.sleep(random.uniform(2, 6.5))
            except requests.exceptions.RequestException as e:
                print(name + ' download error!')
                continue
            try:
                page = etree.HTML(result)
                basic = page.xpath(
                    '//*[@id="content"]/div/div[@class="article"]/div[1]/'
                    'div[@class="result-item"]/div[@class="content"]/h3/a')[0]
                id = basic.get('href')[35:-1]
                name = basic.text.split()[0]
                tool.replace_mysql('person_name_id', {
                    'person_id': id,
                    'person_name': name,
                })
                baidu_tool = MysqlCurd('douban_person')
                baidu_tool.connect_mysql()
                baidu_tool.replace_mysql('person_name', {
                    'name': name,
                    'version': 1
                })
                baidu_tool.close_connect()
                tool.close_connect()
                print(name + ' completed')
            except IndexError:
                error_q.put(name)
                print(name + ' error!')

    worker = SleepFunction()
    worker.run(crawl)
    with open('errorlist//person_id.txt', 'a', encoding='utf8') as f:
        if not error_q.empty():
            print(get_time(), file=f)
            while not error_q.empty():
                print(error_q.get(), file=f)
Example #7
class FastGet:
    def __init__(self, url, dic, threads=100, report_db=False, keepalive=None, table_name=None):
        self.url = url
        parts = urlparse(url)
        self.scheme, self.host, self.port = parts.scheme, parts.hostname, parts.port
        if not self.port:
            self.port = 443 if self.scheme == 'https' else 80

        self.keepalive = keepalive
        try:
            instance = HehReq(self.host, int(self.port), self.scheme, self.keepalive)
        except Exception as e:
            logging.error('Init exception for %s: %s' % (self.url, e))
            return
        if not keepalive:
            self.keepalive = instance.detect_keepalive()
        if self.keepalive == 0:
            logging.error('Keep-Alive value for %s appears to be 0, check the connection' % url)
            return
        logging.warning('Calculated Keep-Alive for %s: %s' % (url, self.keepalive))

        self.report_db = report_db
        if report_db:
            self.table = table_name
            self.sql_conn(report_db)

        self.queue = JoinableQueue()
        [self.queue.put(dic[i:i + self.keepalive]) for i in xrange(0, len(dic), self.keepalive)]
        [spawn(self.worker) for _ in xrange(threads)]
        self.queue.join()

    def sql_conn(self, report_db):
        self.conn = MySQLdb.connect(report_db['host'], report_db['user'], report_db['passwd'], report_db['db'])
        self.cur = self.conn.cursor()
        if not self.table:
            self.table = 'scan_%s' % datetime.strftime(datetime.now(), '%Y_%m_%d_%H%M%S')
            self.cur.execute(
                'create table %s(scheme varchar(16), host varchar(128), port smallint, uri varchar(128),\
                code smallint, size int, type varchar(128))' % self.table)

    def report(self, result):
        if result[1] not in [302, 404]:
            logging.warning('Path %s://%s:%s/%s, response code %s, content-length %s, content-type %s' % (
                self.scheme, self.host, self.port, result[0], result[1], result[2], result[3]))
        if self.report_db:
            p = [self.scheme, self.host, self.port] + list(result)
            self.cur.execute('insert into %s values(%%s,%%s,%%s,%%s,%%s,%%s,%%s)' % self.table, p)

    def worker(self):
        try:
            instance = HehReq(self.host, int(self.port), self.scheme, self.keepalive)
        except Exception as e:
            logging.error('Worker init exception for %s: %s' % (self.url, e))
            return
        while not self.queue.empty():
            paths = self.queue.get()
            try:
                for x in instance.bulk_get(paths):
                    self.report(x)
            except Exception as e:
                logging.error('Worker loop exception for %s: %s' % (self.url, e))
            finally:
                if self.report_db:
                    self.conn.commit()
                self.queue.task_done()
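
FastGet.__init__ slices the wordlist into keep-alive-sized chunks and enqueues each slice as one work item, so a single get() hands a worker a whole batch for bulk_get(). A small illustrative helper for that chunking (hypothetical names, not the project's API):

from gevent.queue import JoinableQueue

def enqueue_in_chunks(paths, chunk_size):
    """Put paths onto a JoinableQueue in fixed-size slices, one slice per get()."""
    q = JoinableQueue()
    for i in range(0, len(paths), chunk_size):
        q.put(paths[i:i + chunk_size])
    return q

q = enqueue_in_chunks(['admin', 'login', 'backup', 'old', 'test'], 2)
while not q.empty():
    print(q.get())      # -> ['admin', 'login'], then ['backup', 'old'], then ['test']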
Example #8
class BaseCrawler(object):
    def __init__(self,
                 requestHandler=BaseRequestHandler(),
                 parseHandler=BaseParseHandler(),
                 sheduler=BaseScheduler(),
                 pipeline=BasePipeline()):
        self.requestHandler = requestHandler
        self.parseHandler = parseHandler
        self.sheduler = sheduler
        self.pipeline = pipeline
        self.task_queue = JoinableQueue()
        self.response_queue = JoinableQueue()
        self.tasks_cnt = 0
        self.result_queue = JoinableQueue()
        self.jobs_cnt = config.num_threads
        self.start_time = time.time()
        self.stop = False

    def doScheduler(self):
        """Generate tasks, one thread
        """
        logging.info('scheduler started!')
        for task in self.sheduler.init_generator():
            self.task_queue.put(task)
            self.tasks_cnt += 1

        while self.tasks_cnt > 0 and not self.stop:
            gevent.sleep(config.new_task_check_time)

        logging.info('scheduler finished! All task done.')

        for i in xrange(config.num_threads):
            self.task_queue.put(StopIteration)

    def worker(self):
        """Fetch url and parse, config.num_threads threads
        """
        task = self.task_queue.get()
        cnt = config.error_retry_cnt
        while task != StopIteration:
            try:
                #timeout = gevent.Timeout(config.TASK_TIMEOUT)
                #timeout.start()
                response = self.requestHandler.handle(task)
                result, new_tasks = self.parseHandler.handle(response)
                #timeout.cancel()
                #if isinstance(result, collections.Iterable):
                #if isinstance(result, list):
                #    for ret in result:
                #        self.result_queue.put(ret)
                #else:
                if result:
                    self.result_queue.put(result)
                for task in new_tasks:
                    self.task_queue.put(task)
                    self.tasks_cnt += 1
                #self.task_queue.task_done()
                self.tasks_cnt -= 1
                task = self.task_queue.get()
                cnt = config.error_retry_cnt
            except Exception as e:
                try:
                    #timeout.cancel()
                    cnt -= 1
                    logging.exception(e)
                    if cnt <= 0:
                        #self.task_queue.task_done()
                        self.tasks_cnt -= 1
                        task = self.task_queue.get()
                        logging.error(
                            'task failed, try \033[31m%d\033[0m times! will not try'
                            % (config.error_retry_cnt - cnt))
                        cnt = config.error_retry_cnt
                    #logging.exception('task failed!')
                    else:
                        logging.error(
                            'task failed, try \033[31m%d\033[0m times!' %
                            (config.error_retry_cnt - cnt))
                except Exception as e:
                    self.tasks_cnt -= 1
                    #self.jobs_cnt -= 1
                    raise
            finally:
                #timeout.cancel()
                pass
        self.jobs_cnt -= 1

    def doPipeline(self):
        while self.jobs_cnt > 0 or not self.result_queue.empty():
            gevent.sleep(config.pipeline_sleeptime)
            results = []
            try:
                while 1:
                    results.append(self.result_queue.get_nowait())
                    if len(results) > 100:
                        raise gevent.queue.Empty
            except gevent.queue.Empty:
                if results:
                    try:
                        self.pipeline.process(results)
                    except:
                        logging.exception('')
                #logging.exception('')
            except:
                logging.exception('')

    def run(self):
        jobs = [
            gevent.spawn(self.doScheduler),
            gevent.spawn(self.doPipeline),
        ]
        for i in xrange(config.num_threads):
            job = gevent.spawn(self.worker)
            jobs.append(job)
            #thread.start_new_thread(self.worker)
        try:
            timeout = gevent.Timeout(config.CRAWLER_TIMEOUT)
            timeout.start()
            #self.task_queue.join()
            gevent.joinall(jobs)
        except:
            logging.exception('pipeline error!')
        finally:
            timeout.cancel()
            self.end_time = time.time()
            logging.info('run times: %f s' % (self.end_time - self.start_time))
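
BaseCrawler ends the run by pushing one StopIteration per worker onto the task queue, so every worker shuts down once the real tasks are drained. A reduced sketch of that sentinel shutdown, assuming nothing beyond gevent itself:

import gevent
from gevent.queue import JoinableQueue

NUM_WORKERS = 3
STOP = StopIteration        # BaseCrawler pushes the StopIteration class itself as the sentinel

def worker(q, out):
    while True:
        task = q.get()
        try:
            if task is STOP:
                return                # one sentinel stops exactly one worker
            out.append(task.upper())  # stand-in for fetch + parse
        finally:
            q.task_done()

q, out = JoinableQueue(), []
for t in ('a', 'b', 'c', 'd'):
    q.put(t)
for _ in range(NUM_WORKERS):
    q.put(STOP)                       # one sentinel per worker
workers = [gevent.spawn(worker, q, out) for _ in range(NUM_WORKERS)]
q.join()
gevent.joinall(workers)
print(out)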
Example #9
class HttpScanner(object):
    def __init__(self, args):
        """
        Initialise HTTP scanner
        :param args:
        :return:
        """
        self.args = args
        self.output = HttpScannerOutput(args)
        self._init_scan_options()

        # Reading files
        self.output.write_log("Reading files and deduplicating.", logging.INFO)
        self.hosts = self._file_to_list(args.hosts)
        self.urls = self._file_to_list(args.urls)

        #
        self._calc_urls()
        out = 'Loaded %i hosts %i urls' % (self.hosts_count, self.urls_count)
        if self.args.ports is not None:
            out += ' %i ports' % len(self.args.ports)
        self.output.print_and_log(out)

        if self.args.ports is not None and not self.args.syn:
            new_hosts = []
            for host in self.hosts:
                for port in self.args.ports:
                    # print(host, port)
                    new_hosts.append(helper.generate_url(host, port))
            self.hosts = new_hosts

        #
        self._calc_urls()
        self.output.print_and_log('%i full urls to scan' %
                                  self.full_urls_count)

        # Queue and workers
        self.hosts_queue = JoinableQueue()
        self.workers = []

    def _file_to_list(self, filename, dedup=True):
        """
        Get list from file
        :param filename: file to read
        :return: list of lines
        """
        if not path.exists(filename) or not path.isfile(filename):
            self.output.print_and_log('File %s not found!' % filename,
                                      logging.ERROR)
            exit(-1)

        # Preparing lines list
        lines = filter(lambda line: line is not None and len(line) > 0,
                       open(filename).read().split('\n'))
        if len(lines) == 0:
            self.output.print_and_log('File %s is empty!' % filename,
                                      logging.ERROR)
            exit(-1)

        return helper.deduplicate(lines) if dedup else lines

    def _init_scan_options(self):
        # Session
        self.session = session()
        self.session.timeout = self.args.timeout
        self.session.verify = False

        # TODO: debug and check
        # self.session.mount("http://", HTTPAdapter(max_retries=self.args.max_retries))
        # self.session.mount("https://", HTTPAdapter(max_retries=self.args.max_retries))
        # http://stackoverflow.com/questions/15431044/can-i-set-max-retries-for-requests-request
        # Max retries
        adapters.DEFAULT_RETRIES = self.args.max_retries

        # TOR
        if self.args.tor:
            self.output.write_log("TOR usage detected. Making some checks.")
            self.session.proxies = {
                'http': 'socks5://127.0.0.1:9050',
                'https': 'socks5://127.0.0.1:9050'
            }

            url = 'http://ifconfig.me/ip'
            real_ip, tor_ip = None, None

            # Get real IP address
            try:
                real_ip = get(url).text.strip()
            except Exception as exception:
                self.output.print_and_log(
                    "Couldn't get real IP address. Check yout internet connection.",
                    logging.ERROR)
                self.output.write_log(str(exception), logging.ERROR)
                exit(-1)

            # Get TOR IP address
            try:
                tor_ip = self.session.get(url).text.strip()
            except Exception as exception:
                self.output.print_and_log(
                    "TOR socks proxy doesn't seem to be working.",
                    logging.ERROR)
                self.output.write_log(str(exception), logging.ERROR)
                exit(-1)

            # Show IP addresses
            self.output.print_and_log('Real IP: %s TOR IP: %s' %
                                      (real_ip, tor_ip))
            if real_ip == tor_ip:
                self.output.print_and_log(
                    "TOR doesn't work! Stop to be secure.", logging.ERROR)
                exit(-1)

        # Proxy
        if self.args.proxy is not None:
            self.session.proxies = {
                "https": self.args.proxy,
                "http": self.args.proxy
            }

        # Auth
        if self.args.auth is not None:
            items = self.args.auth.split(':')
            self.session.auth = (items[0], items[1])

        # Cookies
        self.cookies = {}
        if self.args.cookies is not None:
            self.cookies = Cookies.from_request(self.args.cookies)

        # Cookies from file
        if self.args.load_cookies is not None:
            if not path.exists(self.args.load_cookies) or not path.isfile(
                    self.args.load_cookies):
                self.output.print_and_log(
                    'Could not find cookie file: %s' % self.args.load_cookies,
                    logging.ERROR)
                exit(-1)

            self.cookies = MozillaCookieJar(self.args.load_cookies)
            self.cookies.load()

        self.session.cookies = self.cookies

        # User-Agent
        self.ua = UserAgent() if self.args.random_agent else None

    def worker(self, worker_id):
        self.output.write_log('Worker %i started.' % worker_id)
        while not self.hosts_queue.empty():
            host = self.hosts_queue.get()
            try:
                self.scan_host(worker_id, host)
            finally:
                self.output.write_log('Worker %i finished.' % worker_id)
                self.hosts_queue.task_done()

    def _head_available(self, host):
        """
        Determine if HEAD requests are allowed
        :param host:
        :return:
        """
        # Trying to use OPTIONS request
        try:
            response = self.session.options(host, headers=self._fill_headers())
            o = response.headers[
                'allow'] if 'allow' in response.headers else None
            if o is not None and o.find('HEAD') != -1:
                return True
        except:
            # TODO: fix
            pass

        try:
            return False if self.session.head(
                host,
                headers=self._fill_headers()).status_code == 405 else True
        except:
            # TODO: fix
            return False

    def scan_host(self, worker_id, host):
        # check if resolvable
        ip = helper.url_to_ip(host)
        if ip is None:
            self.output.write_log('Could not resolve %s  Skipping...' % host,
                                  logging.WARNING)
            self.output.urls_scanned += len(self.urls)
            return

        # Check for HEAD
        host_url = helper.host_to_url(host)
        head_available = False
        if self.args.head:
            head_available = self._head_available(host)
            if head_available:
                self.output.write_log('HEAD is supported for %s' % host)

        errors_count, urls_scanned = 0, 0
        for url in self.urls:
            full_url = urljoin(host_url, url)
            r = self.scan_url(full_url, head_available)
            urls_scanned += 1
            self.output.urls_scanned += 1

            # Output
            r['worker'] = worker_id
            self.output.write(**r)
            if r['exception'] is not None:
                errors_count += 1

            # Skip host on errors
            if self.args.skip is not None and errors_count == self.args.skip:
                self.output.write_log(
                    'Errors limit reached on %s Skipping other urls.' % host,
                    logging.WARNING)
                self.output.urls_scanned += len(self.urls) - urls_scanned
                break

        # cookies bugfix?
        self.session.cookies.clear()

    def _fill_headers(self):
        # Fill UserAgent in headers
        headers = {}
        if self.args.user_agent is not None:
            headers['User-agent'] = self.args.user_agent
        elif self.args.random_agent:
            headers['User-agent'] = self.ua.random

        # Fill Referer in headers
        if self.args.referer is not None:
            headers['Referer'] = self.args.referer

        return headers

    def _parse_response(self, url, response, exception):
        res = {'url': url, 'response': response, 'exception': exception}

        if response is None or exception is not None:
            res.update({
                'status': -1,
                'length': -1,
            })
            return res

        try:
            length = int(response.headers['content-length']
                         ) if 'content-length' in response.headers else len(
                             response.text)
        except Exception as exception:
            self.output.write_log(
                "Exception while getting content length for URL: %s Exception: %s"
                % (url, str(exception)), logging.ERROR)
            length = 0

        res.update({
            'status': response.status_code,
            'length': length,
        })
        return res

    def scan_url(self, url, use_head=False):
        self.output.write_log('Scanning %s' % url, logging.DEBUG)

        # Query URL and handle exceptions
        response, exception = None, None
        method = 'HEAD' if use_head else 'GET'
        try:
            # TODO: add support for user:password in URL
            response = self.session.request(
                method,
                url,
                headers=self._fill_headers(),
                allow_redirects=self.args.allow_redirects)
        except ConnectionError as ex:
            self.output.write_log('Connection error while querying %s' % url,
                                  logging.ERROR)
            exception = ex
        except HTTPError as ex:
            self.output.write_log('HTTP error while querying %s' % url,
                                  logging.ERROR)
            exception = ex
        except Timeout as ex:
            self.output.write_log('Timeout while querying %s' % url,
                                  logging.ERROR)
            exception = ex
        except TooManyRedirects as ex:
            self.output.write_log('Too many redirects while querying %s' % url,
                                  logging.ERROR)
            exception = ex
        except Exception as ex:
            self.output.write_log('Unknown exception while querying %s' % url,
                                  logging.ERROR)
            exception = ex

        # print('cookies: %s' % self.cookies)
        print('session.cookies: %s' % self.session.cookies)
        # self.session.cookies = self.cookies

        return self._parse_response(url, response, exception)

    def signal_handler(self):
        """
        Signal handler
        :return:
        """
        # TODO: add saving status via pickle
        self.output.print_and_log('Signal caught. Stopping...',
                                  logging.WARNING)
        self.stop()
        exit(signal.SIGINT)

    def _calc_urls(self):
        # Calculations
        self.urls_count = len(self.urls)
        self.hosts_count = len(self.hosts)
        self.full_urls_count = len(self.urls) * len(self.hosts)
        self.output.args.urls_count = self.full_urls_count

    def start(self):
        """
        Start multithreaded scan
        :return:
        """
        # Set signal handler
        gevent.signal(signal.SIGTERM, self.signal_handler)
        gevent.signal(signal.SIGINT, self.signal_handler)
        gevent.signal(signal.SIGQUIT, self.signal_handler)

        # ICMP scan
        if self.args.icmp:
            if geteuid() != 0:
                self.output.print_and_log(
                    'To use ICMP scan option you must run as root. Skipping ICMP scan',
                    logging.WARNING)
            else:
                self.output.print_and_log('Starting ICMP scan.')
                self.hosts = helper.icmp_scan(self.hosts, self.args.timeout)
                self._calc_urls()
                self.output.print_and_log(
                    'After ICMP scan %i hosts %i urls loaded, %i urls to scan'
                    %
                    (self.hosts_count, self.urls_count, self.full_urls_count))

        # SYN scan
        if self.args.syn:
            if self.args.tor or self.args.proxy is not None:
                self.output.print_and_log(
                    'SYN scan via tor or proxy is impossible!',
                    logging.WARNING)
                self.output.print_and_log(
                    'Stopping to prevent deanonymization!', logging.WARNING)
                exit(-1)

            if geteuid() != 0:
                self.output.print_and_log(
                    'To use SYN scan option you must run as root. Skipping SYN scan',
                    logging.WARNING)
            else:
                self.output.print_and_log('Starting SYN scan.')
                self.hosts = helper.syn_scan(self.hosts, self.args.ports,
                                             self.args.timeout)
                self._calc_urls()
                self.output.print_and_log(
                    'After SYN scan %i hosts %i urls loaded, %i urls to scan' %
                    (self.hosts_count, self.urls_count, self.full_urls_count))

        # Check threads count vs hosts count
        if self.args.threads > self.hosts_count:
            self.output.write_log(
                'Too many threads! Fixing threads count to %i' %
                self.hosts_count, logging.WARNING)
            threads_count = self.hosts_count
        else:
            threads_count = self.args.threads

        # Output urls count
        self.output.args.urls_count = self.full_urls_count

        # Start workers
        self.workers = [spawn(self.worker, i) for i in range(threads_count)]

        # Fill and join queue
        [self.hosts_queue.put(host) for host in self.hosts]
        self.hosts_queue.join()

    def stop(self):
        """
        Stop scan
        :return:
        """
        # TODO: stop correctly
        gevent.killall(self.workers)
Example #10
class RequestBase(object):
    def __init__(self,url,parameter,HTTPClients,ClientConnectionPool,task=None):

        if task is not None:
            self.celeryTask = task
            self.celeryTaskId = task.request.id
        else:
            self.celeryTask = None

        self.parameter = parameter
        self.url = url
        self.numberHTTPClients = HTTPClients
        self.numberClientConnectionPool = ClientConnectionPool

        self.http = HTTPClient.from_url(URL(url),concurrency=self.numberClientConnectionPool)
        self.clientPool = gevent.pool.Pool(self.numberHTTPClients)
        self.workQueue = JoinableQueue()

        self.resultList = {}
        self.workQueueMax = 0
        self.workQueueDone = 0
        self.countRequests = 0
        self.status_codes = {}
        self.status_codes_count = {}
        self.meta = {}

        self.greenletList = {}
        self.initAdditionalStructures()
        self.progressMeta = None

        self.exitFlag = False
        self.pauseRequests = False


    def destroy(self):
        self.http.close()

    def initAdditionalStructures(self):
        pass

    def destroyAdditionstrucutres(self):
        pass

    def getProgress(self):
        return self.meta

    def updateProgress(self,state="PROGRESS"):
        '''Updates the status'''
        self.meta = {
            'state': state,
            'workQueueDone': self.workQueueDone,
            'workQueueMax': self.workQueueMax,
            'current': len(self.resultList),
            'workQueue': self.workQueue.qsize(),
            'requests': self.countRequests,
        }

        # Iterate over the status_codes dict and record each queue's size;
        # possibly not the best solution performance-wise.
        for code, queue in self.status_codes.items():
            self.status_codes_count[code] = queue.qsize()
        self.meta['status_codes'] = self.status_codes_count
        if self.celeryTask is not None:
            self.celeryTask.update_state(task_id=self.celeryTaskId,state=state,meta=self.meta)

    def worker(self, http, clientId):
        # Drain the work queue until it is empty or stop() has been requested.
        while not self.workQueue.empty() and not self.exitFlag:
            try:
                code = self.makeRequest(http, self.getWorkQueueItem())
            finally:
                self.workQueue.task_done()

    def stop(self):
        self.exitFlag = True

    def buildRequestURL(self,workQueueItem):
        '''Function used to build the request URL from a workingQueue item'''
        pass

    def handleRequestSuccess(self,workQueueItem, result):
        '''Required function, called after every successful request'''
        pass

    def handleRequestFailure(self,result):
        '''Function called after a failed request. For example error code 404'''
        pass

    def makeRequest(self,http,workQueueItem):
        '''Makes a single request for the given work-queue item and records its status code.'''
        url_string = self.buildRequestURL(workQueueItem)

        self.countRequests += 1
        try:
            
            response = http.get(URL(url_string).request_uri)
            statusCode = response.status_code

            #create a new queue if the response status_code did not exist and adds the item to the queue
            if str(statusCode) not in self.status_codes:
                self.status_codes[str(statusCode)] = JoinableQueue()
            self.status_codes[str(statusCode)].put(workQueueItem)

            try:
                self.handleRequestSuccess(workQueueItem,response)
            except SSLError as e:
                print(e)

            return statusCode
        except Exception as e:
            self.putWorkQueueItem(workQueueItem)
Example #11
class GeventConsumer(object):

    def __init__(
        self,
        consumer_config=None,
        topic=None,
        parse_func=None,
        num=8,
        auto_commit_offset=False,
        is_debug=False,
    ):
        if not parse_func:
            raise Exception("not parse func, system exit")

        self.parse = parse_func
        self.queue = Queue(100)
        self.stop_flag = Event()
        self.num = num
        self.debug = is_debug
        if not self.debug:
            self.auto_commit_offset = auto_commit_offset
            if isinstance(consumer_config, dict):
                consumer_config.update({'enable.auto.commit':self.auto_commit_offset})
            self.consumer = Consumer(consumer_config)
            self.topic = topic
            self.consumer.subscribe(self.topic)

    def sign_handler(self, sig, frame):
        print(" >>> Termination_signal:[{}] to stop".format(sig))
        self.stop_flag.set()

    def kafka_to_queue(self):
        logger.info("Start Producer thread")
        m = 0
        time_diff = 0
        start_time = time.time()
        while not self.stop_flag.is_set():
            msg = self.consumer.poll(1)
            if msg is None:
                time.sleep(0.001)
                continue  # poll timed out; keep waiting instead of exiting the producer loop
            err = msg.error()
            if err:
                if err.code() == KafkaError._PARTITION_EOF:
                    logger.debug(
                        '%s [%s] reached end at offset %s',
                        msg.topic(), msg.partition(), msg.offset()
                    )
                else:
                    logger.error('kafka failed, system exit')
                    self.stop_flag.set()

            self.queue.put(msg)

            # consumption-rate statistics
            m += 1
            current_time = time.time()
            time_diff = current_time - start_time
            if time_diff > 10:
                rate = m / time_diff
                start_time = current_time
                m = 0
                logger.info('consumer_rate:[%.2f]p/s, queue_size:[%d]' % (rate, self.queue.qsize()))
        logger.info("Producer thread has stopped")

    def consume(self):
        logger.info('Start Thread To Consumer')
        data = dict()
        stop = False
        while True:
            stop = self.stop_flag.is_set()
            if stop and self.queue.empty():
                break
            msg = self.queue.get()
            try:
                data = self.parse(msg.value())
                if data:
                    self.handle_data(data, stop)
            finally:
                self.queue.task_done()
                if not stop and not self.auto_commit_offset:
                    self.consumer.commit(msg)
        logger.info('Thread Consumer has stopped')

    def handle_data(self, data, stop):
        raise NotImplementedError

    def consume_forever(self):
        """
        start consume forever
        """
        signal(SIGTERM, self.sign_handler)
        signal(SIGINT, self.sign_handler)

        if self.debug:
            consume_func = self.mock_consume
            produce_func = self.mock_kafka
        else:
            consume_func = self.consume
            produce_func = self.kafka_to_queue

        task_list = []
        for _ in range(self.num):
            task_list.append(gevent.spawn(consume_func))

        produce_func()
        self.queue.join()
        if not self.debug:
            logger.info("closing kafka...")
            self.consumer.close()
        gevent.joinall(task_list, timeout=5)
        logger.info('Exiting with qsize:%d' % self.queue.qsize())

    # ===========mock kafka and consumer=======================
    def mock_kafka(self):
        logger.info("Start Producer thread")
        m = 0
        time_diff = 0
        start_time = time.time()
        # jing5 msg
        msg = "23230254455354325631393046433232323232320101008e14080b0e0c38426e0101008422551354455354325631393046433232323232323131313131313131313131313131313131313131313131313131313131313131313130010000000002803365818a91eb00010002fffe050018fffe2eeb596f50830005e91efd02649c6b7eb1ac0d80000043c497fd0022f90a3d057b2403032581373635343332310082e99f008a06".decode('hex')
        while not self.stop_flag.is_set():
            self.queue.put(msg)
            m += 1

            # consumption-rate statistics
            current_time = time.time()
            time_diff = current_time - start_time
            if time_diff > 5:
                rate = m / time_diff
                start_time = current_time
                m = 0
                logger.info('consumer_rate:[%.2f]p/s, queue_size:[%d]' % (rate, self.queue.qsize()))
        logger.info("closing produce...")
        logger.info("Producer thread has stopped")

    def mock_consume(self):
        logger.info('Start Thread To Consumer')
        data = dict()
        stop = False
        while True:
            stop = self.stop_flag.is_set()
            if stop and self.queue.empty():
                break
            msg = self.queue.get()
            try:
                data = self.parse(msg)
                self.handle_data(data, stop)
            except Exception as err:
                logger.error("consumer:{}".format(getcurrent()))
            finally:
                self.queue.task_done()
        logger.info('Thread Consumer has stopped')
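
GeventConsumer pairs a bounded queue with an Event used as a stop flag: the signal handler sets the flag, the producer loop exits, and each consumer drains whatever is left before returning. A reduced sketch of that shutdown handshake (illustrative names, no Kafka involved):

import gevent
from gevent.event import Event
from gevent.queue import JoinableQueue, Empty

stop_flag = Event()
q = JoinableQueue(maxsize=100)        # bounded, so a slow consumer back-pressures the producer

def consumer(n):
    while not (stop_flag.is_set() and q.empty()):
        try:
            msg = q.get(timeout=1)    # don't block forever once the flag is set
        except Empty:
            continue
        try:
            print('consumer %d got %r' % (n, msg))
        finally:
            q.task_done()

workers = [gevent.spawn(consumer, i) for i in range(2)]
for i in range(10):                   # the producer stays in the main greenlet
    q.put(i)
stop_flag.set()                       # signal that no more work is coming
q.join()
gevent.joinall(workers)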
Example #13
class AsynSpiderWithGevent(MySpider):
    def __init__(self, out=BasicAnalysis(), **kwargs):
        super(AsynSpiderWithGevent, self).__init__(out, **kwargs)
        self.q = JoinableQueue()
        self.fetching, self.fetched = set(), set()

    def assign_jobs(self, jobs):
        for job in jobs:
            self.q.put(job)

    def run(self):
        if self.q.empty():
            url = LIST_URL + urllib.urlencode(self.list_query)
            self.q.put(url)
        for _ in range(CONCURRENCY):
            gevent.spawn(self.worker)
        self.q.join()
        assert self.fetching == self.fetched
        self._out.finish()

    def worker(self):
        while True:
            self.fetch_url()

    def fetch_url(self):
        current_url = self.q.get()
        try:
            if current_url in self.fetching:
                return
            self.fetching.add(current_url)
            resp = requests.get(current_url, headers=HEADERS)
            self.fetched.add(current_url)
            xml = etree.fromstring(resp.content)
            has_total_count = xml.xpath("//totalcount/text()")
            if has_total_count:  # non-empty means a list page, otherwise a detail page
                total_count = int(has_total_count[0])
                if total_count == 0:
                    return  # page number out of range
                if self.list_query["pageno"] == 1:
                    pageno = 2
                    # while pageno < 10:
                    while pageno <= total_count / PAGE_SIZE:
                        self.list_query["pageno"] = pageno
                        next_list_url = LIST_URL + urllib.urlencode(
                            self.list_query)
                        self.q.put(next_list_url)
                        # logging.info(next_list_url)
                        pageno += 1
                job_ids = xml.xpath("//jobid/text()")
                job_detail_urls = []
                for ID in job_ids:
                    new_detail_query = DETAIL_QUERY.copy()
                    new_detail_query["jobid"] = ID
                    job_detail_urls.append(DETAIL_URL +
                                           urllib.urlencode(new_detail_query))
                for detail_url in job_detail_urls:
                    self.q.put(detail_url)
                    # logging.info(detail_url)

            else:
                self._out.collect(xml)
        finally:
            self.q.task_done()
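
AsynSpiderWithGevent is a self-feeding queue: workers parse a page, push the detail URLs they discover back onto the same JoinableQueue, and the fetching/fetched sets guard against refetching, so q.join() still terminates. A stripped-down sketch of that loop with a toy link graph instead of real requests:

import gevent
from gevent.queue import JoinableQueue

links = {'a': ['b', 'c'], 'b': ['c'], 'c': []}   # toy link graph standing in for HTTP responses
q, seen = JoinableQueue(), set()

def worker():
    while True:
        page = q.get()
        try:
            if page in seen:
                continue              # already fetched, skip it
            seen.add(page)
            for nxt in links[page]:   # "parse" the page and enqueue what it links to
                q.put(nxt)
        finally:
            q.task_done()

q.put('a')
workers = [gevent.spawn(worker) for _ in range(2)]
q.join()                              # terminates because every put() is matched by a task_done()
gevent.killall(workers)
print(sorted(seen))                   # -> ['a', 'b', 'c']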
Example #14
class DriverPool(object):
    """ Create a pool of available Selenium containers for processing.

    Args:
        size (int): maximum concurrent tasks. Must be at least ``2``.
        driver_cls (WebDriver):
        driver_cls_args (tuple):
        driver_cls_kw (dict):
        use_proxy (bool):
        factory (:obj:`~selenium_docker.base.ContainerFactory`):
        name (str):
        logger (:obj:`logging.Logger`):

    Example::

        pool = DriverPool(size=2)

        urls = [
            'https://google.com',
            'https://reddit.com',
            'https://yahoo.com',
            'http://ksl.com',
            'http://cnn.com'
        ]

        def get_title(driver, url):
            driver.get(url)
            return driver.title

        for result in pool.execute(get_title, urls):
            print(result)
    """

    INNER_THREAD_SLEEP = 0.5
    """float: essentially our polling interval between tasks and checking
    when tasks have completed.
    """

    PROXY_CLS = SquidProxy
    """:obj:`~selenium_docker.proxy.AbstractProxy`: created for the pool
    when ``use_proxy=True`` during pool instantiation.
    """

    def __init__(self, size, driver_cls=ChromeDriver, driver_cls_args=None,
                 driver_cls_kw=None, use_proxy=True, factory=None, name=None,
                 logger=None):
        self.size = max(2, size)
        self.name = name or gen_uuid(6)
        self.factory = factory or ContainerFactory.get_default_factory()
        self.logger = logger or getLogger(
            '%s.DriverPool.%s' % (__name__, self.name))

        self._driver_cls = driver_cls
        self._driver_cls_args = driver_cls_args or tuple()
        self._driver_cls_kw = driver_cls_kw or dict()
        self._drivers = Queue(maxsize=self.size)

        # post init inspections
        if not hasattr(self._driver_cls, 'CONTAINER'):
            raise DriverPoolValueError('driver_cls must extend DockerDriver')

        if not isiterable(self._driver_cls_args):
            raise DriverPoolValueError(
                '%s is not iterable' % self._driver_cls_args)

        if not isinstance(self._driver_cls_kw, Mapping):
            raise DriverPoolValueError(
                '%s is not a valid mapping' % self._driver_cls_kw)

        # determine proxy usage
        self.proxy = None
        self._use_proxy = use_proxy  # type: bool

        # deferred instantiation
        self._pool = None  # type: Pool
        self._results = None  # type: Queue
        self._tasks = None  # type: JoinableQueue
        self._processing = False  # type: bool
        self.__feeder_green = None  # type: gevent.Greenlet

    def __repr__(self):
        return '<DriverPool-%s(size=%d,driver=%s,proxy=%s,async=%s)>' % (
            self.name, self.size, self._driver_cls.BROWSER,
            self._use_proxy, self.is_async)

    def __iter__(self):
        return self.results(block=self.is_async)

    def __del__(self):
        try:
            self.close()
        except Exception as e:
            if hasattr(self, 'logger'):
                self.logger.exception(e, exc_info=False)

    @property
    def is_processing(self):
        """bool: whether or not we're currently processing tasks. """
        return self._processing

    @property
    def is_async(self):
        """bool: returns True when asynchronous processing is happening. """
        return self.__feeder_green is not None

    def __bootstrap(self):
        """ Prepare this driver pool instance to batch execute task items. """
        if self.is_processing:
            # cannot run two executions simultaneously
            raise DriverPoolRuntimeException(
                'cannot bootstrap pool, already running')
        if self._results and self._results.qsize():  # pragma: no cover
            self.logger.debug('pending results being discarded')
        if self._tasks and self._tasks.qsize():  # pragma: no cover
            self.logger.debug('pending tasks being discarded')
        if self._pool:  # pragma: no cover
            self.logger.debug('killing processing pool')
            self._pool.join(timeout=10.0)
            self._pool.kill()
            self._pool = None
        if self._use_proxy and not self.proxy:
            # defer proxy instantiation -- since spinning up a squid proxy
            #  docker container is surprisingly time consuming.
            self.logger.debug('bootstrapping squid proxy')
            self.proxy = self.PROXY_CLS(factory=self.factory)
        self.logger.debug('bootstrapping pool processing')
        self._processing = True
        self._results = Queue()
        self._tasks = JoinableQueue()
        self._load_drivers()
        # create our processing pool with headroom over the number of drivers
        #  requested for this processing pool.
        self._pool = Pool(size=self.size + int(math.ceil(self.size * 0.25)))

    def __cleanup(self, force=False):
        """ Stop and remove the web drivers and their containers. This function
        should not remove pending tasks or results. It should be possible to
        cleanup all the external resources of a driver pool and still extract
        the results of the work that was completed.

        Raises:
            DriverPoolRuntimeException: when attempting to cleanup an
                environment while processing is still happening, and forcing
                the cleanup is set to ``False``.

            SeleniumDockerException: when a driver instance or container
                cannot be closed properly.

        Returns:
            None
        """
        if self.is_processing and not force:  # pragma: no cover
            raise DriverPoolRuntimeException(
                'cannot cleanup driver pool while executing')
        self._processing = False
        squid = None  # type: gevent.Greenlet
        error = None  # type: SeleniumDockerException
        if self.proxy:
            self.logger.debug('closing squid proxy')
            squid = gevent.spawn(self.proxy.quit)
        if self._pool:  # pragma: no cover
            self.logger.debug('emptying task pool')
            if not force:
                self._pool.join(timeout=10.0)
            self._pool.kill(block=False,
                            timeout=10.0)
            self._pool = None
        self.logger.debug('closing all driver containers')
        while not self._drivers.empty():
            d = self._drivers.get(block=True)
            try:
                d.quit()
            except SeleniumDockerException as e:  # pragma: no cover
                self.logger.exception(e, exc_info=True)
                if not force:
                    error = e
        if self.proxy:
            squid.join()
            self.proxy = None
        if error:  # pragma: no cover
            raise error

    def _load_driver(self, and_add=True):
        """ Load a single web driver instance and container. """
        args = self._driver_cls_args
        kw = dict(self._driver_cls_kw)
        kw.update({
            'proxy': self.proxy,
            'factory': self.factory,
        })
        driver = self._driver_cls(*args, **kw)
        if and_add:
            self._drivers.put(driver)
        return driver

    def _load_drivers(self):
        """ Load the web driver instances and containers.

        Raises:
            DriverPoolRuntimeException: when the requested number of drivers
                for the given pool size cannot be created for some reason.

        Returns:
            None
        """
        if not self._drivers.empty():  # pragma: no cover
            return
        threads = []
        for o in range(self.size):
            self.logger.debug('creating driver %d of %d', o + 1, self.size)
            thread = gevent.spawn(self._load_driver)
            threads.append(thread)
        for t in reversed(threads):
            t.join()
        if not self._drivers.full():
            raise DriverPoolRuntimeException(
                'unable to fulfill required concurrent drivers, %d of %d' % (
                    self._drivers.qsize(), self.size))

    def _recycle_driver(self, driver):
        if not driver:
            return
        try:
            driver.quit()
        except Exception as e:
            self.logger.exception(e, exc_info=True)
        # do NOT add the replacement driver to the drivers queue here; the
        #  recycle logic that requested it is responsible for putting this
        #  fresh instance back in place of the one it received.
        self.logger.debug('driver recycled')
        return self._load_driver(and_add=False)

    def add_async(self, *items):
        """ Add additional items to the asynchronous processing queue.

        Args:
            items (list(Any)): list of items that need processing. Each item is
                applied one at a time to an available driver from the pool.

        Raises:
            DriverPoolValueError: when no items are supplied.
        """
        if len(items) == 1 and isinstance(items[0], list):
            items = items[0]
        if not items:
            raise DriverPoolValueError(
                'cannot add items with value: %s' % str(items))
        item_count = len(items)
        self.logger.debug('adding %d additional items to tasks', item_count)
        for o in items:
            self._tasks.put(o)

    def close(self):
        """ Force close all the drivers and cleanup their containers.

        Returns:
            None
        """
        self.__cleanup(force=True)

    def execute(self, fn, items, preserve_order=False, auto_clean=True,
                no_wait=False):
        """ Execute a fixed function, blocking for results.

        Args:
            fn (Callable): function that takes two parameters, ``driver`` and
                ``task``.
            items (list(Any)): list of items that need processing. Each item is
                applied one at a time to an available driver from the pool.
            preserve_order (bool): should the results be returned in the order
                they were supplied via ``items``. It's more performant to
                allow results to return in any order.
            auto_clean (bool): cleanup docker containers after executing. If
                multiple processing tasks are going to be used, it's more
                performant to leave the containers running and reuse them.
            no_wait (bool): forgo a small sleep interval between finishing
                a task and putting the driver back in the available drivers
                pool.

        Yields:
            results: the result for each item as they're finished.
        """

        def worker(o):
            job_num, item = o
            self.logger.debug('doing work on item %d' % job_num)
            driver = self._drivers.get(block=True)
            ret_val = fn(driver, item)
            if not no_wait:
                gevent.sleep(self.INNER_THREAD_SLEEP)
            self._drivers.put(driver)
            return ret_val

        if self.__feeder_green:
            raise DriverPoolRuntimeException(
                'cannot perform a blocking execute while async processing')

        self.__bootstrap()
        self.logger.debug('starting sync processing')

        if preserve_order:
            ittr = self._pool.imap
        else:
            ittr = self._pool.imap_unordered

        self.logger.debug('yielding processed results')
        for o in ittr(worker, enumerate(items)):
            self._results.put(o)
        self._results.put(StopIteration)
        self.logger.debug('stopping sync processing')
        if auto_clean:
            self.logger.debug('auto cleanup pool environment')
            self.__cleanup(force=True)
        return self.results(block=False)

    def execute_async(self, fn, items=None, callback=None,
                      catch=(WebDriverException,), requeue_task=False):
        """ Execute a fixed function in the background, streaming results.

        Args:
            fn (Callable): function that takes two parameters, ``driver`` and
                ``task``.
            items (list(Any)): list of items that need processing. Each item is
                applied one at a time to an available driver from the pool.
            callback (Callable): function that takes a single parameter, the
                return value of ``fn`` when its finished processing and has
                returned the driver to the queue.
            catch (tuple[Exception]): tuple of Exception classes to catch
                during task execution. If one of these Exception classes
                is caught during ``fn`` execution the driver that crashed will
                attempt to be recycled.
            requeue_task (bool): in the event of an Exception being caught
                should the task/item that was being worked on be re-added to
                the queue of items being processed.

        Raises:
            DriverPoolValueError: if ``callback`` is not ``None``
                or ``callable``.

        Returns:
            None
        """

        def worker(fn, task):
            ret_val = None
            async_task_id = gen_uuid(12)
            self.logger.debug('starting async task %s', async_task_id)
            driver = self._drivers.get(block=True)
            if isinstance(driver, Exception):
                raise driver
            try:
                ret_val = fn(driver, task)
            except catch as e:
                self.logger.exception('async task %s failed: %s', async_task_id, e)
                if self.is_processing:
                    driver = self._recycle_driver(driver)
                    if requeue_task:
                        self._tasks.put(task)
            finally:
                self._results.put(ret_val)
                self._drivers.put(driver)
                gevent.sleep(self.INNER_THREAD_SLEEP)
                return ret_val

        def feeder():
            self.logger.debug('starting async feeder thread')
            while True:
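                # poll the task queue and hand each item to the greenlet
                #  pool; the loop exits once the pool has been torn down
                #  and processing has stopped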
                while not self._tasks.empty():
                    task = self._tasks.get()
                    if self._pool is None:
                        break
                    self._pool.apply_async(
                        worker,
                        args=(fn, task,),
                        callback=greenlet_callback)
                gevent.sleep(self.INNER_THREAD_SLEEP)
                if self._pool is None and not self.is_processing:
                    break
            return

        if callback is None:
            def logger(value):
                self.logger.debug('%s', value)
            callback = logger

        def real_callback(cb, value):
            if isinstance(value, gevent.GreenletExit):
                raise value
            else:
                cb(value)

        greenlet_callback = partial(real_callback, callback)

        for f in [fn, callback]:
            if not callable(f):
                raise DriverPoolValueError(
                    'cannot use %s, is not callable' % f)

        self.logger.debug('starting async processing')
        self.__bootstrap()
        if not self.__feeder_green:
            self.__feeder_green = gevent.spawn(feeder)
        if items:
            self.add_async(*items)

    def quit(self):
        """ Alias for :func:`~DriverPool.close()`. Included for consistency
        with driver instances that generally call ``quit`` when they're no
        longer needed.

        Returns:
            None
        """
        if self.__feeder_green:
            return self.stop_async()
        return self.close()

    def results(self, block=True):
        """ Iterate over available results from processed tasks.

        Args:
            block (bool): when ``True``, block this call until all tasks have
                been processed and all results have been returned. Otherwise
                this will continue indefinitely while tasks are dynamically
                added to the async processing queue.

        Yields:
            results: one result at a time as they're finished.

        Raises:
            StopIteration: when the processing is finished.
        """
        est_size = self._results.qsize()
        self.logger.debug('there are an estimated %d results', est_size)
        if block:
            self.logger.debug('blocking for results to finish processing')
            while self.is_processing:
                while not self._results.empty():
                    yield self._results.get()
                gevent.sleep(self.INNER_THREAD_SLEEP)
                if self._tasks.empty() and self._results.empty():
                    break
            return  # ends the generator (raising StopIteration here breaks on Python 3.7+)
        else:
            if est_size > 0:
                self.logger.debug('returning as many results as have finished')
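            # gevent queues treat StopIteration as a sentinel during
            #  iteration, so this cleanly terminates the for-loop below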
            self._results.put(StopIteration)
            for result in self._results:
                yield result

    def stop_async(self, timeout=None, auto_clean=True):
        """ Stop all the async worker processing from executing.

        Args:
            timeout (float): number of seconds to wait for pool to finish
                processing before killing and closing out the execution.
            auto_clean (bool): cleanup docker containers after executing. If
                multiple processing tasks are going to be used, it's more
                performant to leave the containers running and reuse them.

        Returns:
            None
        """
        self.logger.debug('stopping async processing')
        if self.__feeder_green:
            self.logger.debug('killing async feeder thread')
            gevent.kill(self.__feeder_green)
            self.__feeder_green = None
        if self._pool:
            self.logger.debug('joining async pool before kill')
            self._pool.join(timeout=timeout or 1.0)
            self._pool.kill(block=False)
        tasks_count = self._tasks.qsize()
        self.logger.info('%d tasks remained unprocessed', tasks_count)
        if auto_clean:
            self.logger.debug('auto cleanup pool environment')
            self.__cleanup(force=True)
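
# Hedged usage sketch for the async API (not part of the original example).
#  It assumes the docker images behind ChromeDriver are available locally;
#  `fetch_title` is a hypothetical task function used only for illustration.
def fetch_title(driver, url):
    driver.get(url)
    return driver.title

pool = DriverPool(size=2, use_proxy=False)
pool.execute_async(fetch_title, ['https://google.com', 'https://reddit.com'])
pool.add_async('https://yahoo.com')       # more work can be queued at any time
for title in pool.results(block=True):    # streams results as tasks finish
    print(title)
pool.stop_async()                         # kills the feeder and cleans up containers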
예제 #15
0
class BaseCrawler(object):
    def __init__(self, requestHandler=None, parseHandler=None,
                 sheduler=None, pipeline=None):
        # default handlers are created per instance to avoid sharing
        #  mutable defaults between crawlers
        self.requestHandler = requestHandler or BaseRequestHandler()
        self.parseHandler = parseHandler or BaseParseHandler()
        self.sheduler = sheduler or BaseScheduler()
        self.pipeline = pipeline or BasePipeline()
        self.task_queue = JoinableQueue()
        self.response_queue = JoinableQueue()
        self.tasks_cnt = 0
        self.result_queue = JoinableQueue()
        self.jobs_cnt = config.num_threads
        self.start_time = time.time()
        self.stop = False
    
    def doScheduler(self):
        """Generate tasks, one thread
        """
        logging.info('scheduler started!')
        for task in self.sheduler.init_generator():
            self.task_queue.put(task)
            self.tasks_cnt += 1

        while self.tasks_cnt > 0 and not self.stop:
            gevent.sleep(config.new_task_check_time)

        logging.info('scheduler finished! All task done.')

        for i in xrange(config.num_threads):
            self.task_queue.put(StopIteration)

    def worker(self):
        """Fetch url and parse, config.num_threads threads
        """
        task = self.task_queue.get()
        cnt = config.error_retry_cnt
        while task != StopIteration:
            try:
                #timeout = gevent.Timeout(config.TASK_TIMEOUT)
                #timeout.start()
                response = self.requestHandler.handle(task)
                result, new_tasks = self.parseHandler.handle(response)
                #timeout.cancel()
                #if isinstance(result, collections.Iterable):
                #if isinstance(result, list):
                #    for ret in result:
                #        self.result_queue.put(ret)
                #else:
                if result:
                    self.result_queue.put(result)
                for task in new_tasks:
                    self.task_queue.put(task)
                    self.tasks_cnt += 1
                #self.task_queue.task_done()
                self.tasks_cnt -= 1
                task = self.task_queue.get()
                cnt = config.error_retry_cnt
            except Exception as e:
                try:
                    #timeout.cancel()
                    cnt -= 1
                    logging.exception(e)
                    if cnt <= 0:
                        #self.task_queue.task_done()
                        self.tasks_cnt -= 1
                        task = self.task_queue.get()
                        logging.error('task failed after \033[31m%d\033[0m attempts, giving up' % (config.error_retry_cnt - cnt))
                        cnt = config.error_retry_cnt
                    #logging.exception('task failed!')
                    else:
                        logging.error('task failed on attempt \033[31m%d\033[0m, retrying' % (config.error_retry_cnt - cnt))
                except Exception as e:
                    self.tasks_cnt -= 1
                    #self.jobs_cnt -= 1
                    raise
            finally:
                #timeout.cancel()
                pass
        self.jobs_cnt -= 1

    def doPipeline(self):
        while self.jobs_cnt > 0 or not self.result_queue.empty():
            gevent.sleep(config.pipeline_sleeptime)
            results = []
            try:
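                # drain the result queue in batches of at most 100 items;
                #  Empty is raised when the queue runs dry and re-raised
                #  deliberately when the batch is full, and both cases are
                #  flushed to the pipeline below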
                while 1:
                    results.append(self.result_queue.get_nowait())
                    if len(results) > 100:
                        raise gevent.queue.Empty
            except gevent.queue.Empty:
                if results:
                    try:
                        self.pipeline.process(results)
                    except:
                        logging.exception('')
                #logging.exception('')
            except:
                logging.exception('')

    def run(self):
        jobs = [
                gevent.spawn(self.doScheduler),
                gevent.spawn(self.doPipeline),
                ]
        for i in xrange(config.num_threads):
            job = gevent.spawn(self.worker)
            jobs.append(job)
            #thread.start_new_thread(self.worker)
        try:
            timeout = gevent.Timeout(config.CRAWLER_TIMEOUT)
            timeout.start()
            #self.task_queue.join()
            gevent.joinall(jobs)
        except:
            logging.exception('pipeline error!')
        finally:
            timeout.cancel()
            self.end_time = time.time()
            logging.info('run times: %f s' % (self.end_time - self.start_time))
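
# Hedged usage sketch (not part of the original example): wires the crawler up
#  with the default handlers. In practice BaseScheduler/BaseParseHandler would
#  be subclassed to supply seed tasks and real parsing; this only shows the
#  call pattern.
if __name__ == '__main__':
    crawler = BaseCrawler(requestHandler=BaseRequestHandler(),
                          parseHandler=BaseParseHandler(),
                          sheduler=BaseScheduler(),
                          pipeline=BasePipeline())
    crawler.run()  # spawns the scheduler, the pipeline and config.num_threads workers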