class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)

            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

    def clear_article(self):
        '''
        @summary: Clear the article data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            article_list.append(article)
            if len(article_list) > 100:
                log.debug('Adding articles to the database')
                self._db.zadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('Adding articles to the database')
            self._db.zadd(self._table_article, article_list)
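
A minimal usage sketch for this manager, assuming the project's RedisDB, Singleton, log and tools helpers are importable; the article dicts passed to put_articles are placeholders:

# Hedged usage sketch: ArticleManager runs as a background thread that flushes the deque to redis.
# RedisDB, Singleton, log and tools come from the surrounding project and are assumed to be importable.
manager = ArticleManager(table_article='articles')
manager.start()  # starts run(), which drains the deque roughly once per second

for i in range(5):
    # placeholder article payload; real articles carry whatever fields the spider extracts
    manager.put_articles({'title': 'demo %s' % i, 'url': 'http://example.com/%s' % i})

tools.delay_time(3)  # give the worker thread a moment to flush
manager.stop()       # sets _thread_stop so run() exits after the current pass
manager.join()
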
Example #2
class TaskManager():
    IS_IN_TIME_RANGE = 1  # within the time range
    NOT_REACH_TIME_RANGE = 2  # the time range has not been reached yet
    OVER_MIN_TIME_RANGE = 3  # past the time range

    def __init__(self):

        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        self._task_root_key = config.get('spider').get('redis_task_cache_root_key')

        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
        self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~')

    def __get_task_from_redis(self, key):
        task = self._redis.zget(key, is_pop=True)
        if task:
            task = eval(task[0])
            return task

    def __random_int(self, min, max):
        # helper around random.randint; get_task below calls random.randint directly, so this is effectively unused
        return random.randint(min, max)

    def get_account_task(self):
        """
        Get an official account task
        :return:
            {'__biz': 'Mjc1NjM3MjY2MA==', 'last_publish_time': None}
            or
            None
        """
        task = self.__get_task_from_redis(self._account_task_key)
        if not task:
            publish_time_condition = "AND last_publish_time < '{today}'".format(today=tools.get_current_date(date_format='%Y-%m-%d' + ' 00:00:00')) if self._ignore_haved_crawl_today_article_account else ''
            sql = '''
                SELECT
                    __biz,
                    last_publish_time
                FROM
                    wechat_account_task
                WHERE
                    `is_zombie` != 1
                AND (
                    (
                        (
                            UNIX_TIMESTAMP(CURRENT_TIMESTAMP) - UNIX_TIMESTAMP(last_spider_time)
                        ) > {monitor_interval}
                        {publish_time_condition}
                    )
                    OR (last_spider_time IS NULL)
                )
                '''.format(monitor_interval=self._monitor_interval, publish_time_condition=publish_time_condition)

            tasks = self._mysqldb.find(sql, to_json=True)
            if tasks:
                self._redis.zadd(self._account_task_key, tasks)
                task = self.__get_task_from_redis(self._account_task_key)

        return task

    def get_article_task(self):
        """
        Get an article task
        :return:
            {'article_url': 'http://mp.weixin.qq.com/s?__biz=MzIxNzg1ODQ0MQ==&mid=2247485501&idx=1&sn=92721338ddbf7d907eaf03a70a0715bd&chksm=97f220dba085a9cd2b9a922fb174c767603203d6dbd2a7d3a6dc41b3400a0c477a8d62b96396&scene=27#wechat_redirect'}
            or
            None
        """
        task = self.__get_task_from_redis(self._article_task_key)
        if not task:
            sql = 'select id, article_url from wechat_article_task where state = 0 limit 5000'
            tasks = self._mysqldb.find(sql)
            if tasks:
                # Update task state; replace(',)', ')') strips the trailing comma of a single-element tuple so the SQL IN clause stays valid
                task_ids = str(tuple([task[0] for task in tasks])).replace(',)', ')')
                sql = 'update wechat_article_task set state = 2 where id in %s' % (task_ids)
                self._mysqldb.update(sql)

            else:
                sql = 'select id, article_url from wechat_article_task where state = 2 limit 5000'
                tasks = self._mysqldb.find(sql)

            if tasks:
                task_json = [
                    {
                        'article_url': article_url
                    }
                    for id, article_url in tasks
                ]
                self._redis.zadd(self._article_task_key, task_json)
                task = self.__get_task_from_redis(self._article_task_key)

        return task

    def update_article_task_state(self, sn, state=1):
        sql = 'update wechat_article_task set state = %s where sn = "%s"' % (state, sn)
        self._mysqldb.update(sql)

    def record_last_article_publish_time(self, __biz, last_publish_time):
        self._redis.hset(self._last_article_publish_time, __biz, last_publish_time or '')

    def is_reach_last_article_publish_time(self, __biz, publish_time):
        last_publish_time = self._redis.hget(self._last_article_publish_time, __biz)
        if not last_publish_time:
            # Check whether mysql has this task
            sql = "select last_publish_time from wechat_account_task where __biz = '%s'" % __biz
            data = self._mysqldb.find(sql)
            if data:  # [(None,)] / []
                last_publish_time = str(data[0][0] or '')
                self.record_last_article_publish_time(__biz, last_publish_time)

        if last_publish_time is None:
            return

        if publish_time < last_publish_time:
            return True

        return False

    def is_in_crawl_time_range(self, publish_time):
        """
        是否在时间范围
        :param publish_time:
        :return: 是否达时间范围
        """
        if not publish_time or (not self._crawl_time_range[0] and not self._crawl_time_range[1]):
            return TaskManager.IS_IN_TIME_RANGE

        if self._crawl_time_range[0]:  # upper bound of the time range (the later date)
            if publish_time > self._crawl_time_range[0]:
                return TaskManager.NOT_REACH_TIME_RANGE

            if publish_time <= self._crawl_time_range[0] and publish_time >= self._crawl_time_range[1]:
                return TaskManager.IS_IN_TIME_RANGE

        if publish_time < self._crawl_time_range[1]:  # lower bound (the earlier date)
            return TaskManager.OVER_MIN_TIME_RANGE

        return TaskManager.IS_IN_TIME_RANGE

    def record_new_last_article_publish_time(self, __biz, new_last_publish_time):
        self._redis.hset(self._new_last_article_publish_time, __biz, new_last_publish_time)

    def get_new_last_article_publish_time(self, __biz):
        return self._redis.hget(self._new_last_article_publish_time, __biz)

    def update_account_last_publish_time(self, __biz, last_publish_time):
        sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}" where __biz="{}"'.format(
            last_publish_time, tools.get_current_date(), __biz
        )
        self._mysqldb.update(sql)

    def is_zombie_account(self, last_publish_timestamp):
        if tools.get_current_timestamp() - last_publish_timestamp > self._zombie_account_not_publish_article_days * 86400:
            return True
        return False

    def sign_account_is_zombie(self, __biz, last_publish_time=None):
        if last_publish_time:
            sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                last_publish_time, tools.get_current_date(), __biz
            )
        else:
            sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                tools.get_current_date(), __biz
            )

        self._mysqldb.update(sql)

    def get_task(self, url=None, tip=''):
        """
        Get a task
        :param url: when a url is given, return that url wrapped as a task; otherwise take an official account task first, falling back to an article task. If neither is available, sleep for a while before the next attempt
        :return:
        """

        sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                __biz = account_task.get('__biz')
                last_publish_time = account_task.get('last_publish_time')
                self.record_last_article_publish_time(__biz, last_publish_time)
                tip = 'Crawling the article list'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz)
            else:
                article_task = self.get_article_task()
                if article_task:
                    tip = 'Crawling article details'
                    url = article_task.get('article_url')
                else:
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('No tasks available, sleeping {}s'.format(sleep_time))
                    tip = 'No tasks '

        if url:
            next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), url=url, sleep_time_msec=sleep_time * 1000
            )
        else:
            next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), sleep_time_msec=sleep_time * 1000
            )

        return next_page

    def reset_task(self):
        # Clear the redis cache
        keys = self._task_root_key + "*"
        keys = self._redis.getkeys(keys)
        if keys:
            for key in keys:
                self._redis.clear(key)

            # Reset the tasks
            sql = "update wechat_article_task set state = 0 where state = 2"
            self._mysqldb.update(sql)
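
The '~'-separated crawl_time_range is ordered with the upper bound (the later date) first and the lower bound (the earlier date) second. The standalone sketch below reproduces the classification logic of is_in_crawl_time_range with plain strings, only to show how the three constants are chosen; it is not the project's configuration loader:

# Standalone sketch of the time-range classification; dates in 'YYYY-MM-DD HH:MM:SS' form compare lexicographically.
IS_IN_TIME_RANGE, NOT_REACH_TIME_RANGE, OVER_MIN_TIME_RANGE = 1, 2, 3

def classify(publish_time, crawl_time_range='2019-06-01 00:00:00~2019-01-01 00:00:00'):
    upper, lower = crawl_time_range.split('~')  # upper = later date, lower = earlier date
    if not publish_time or (not upper and not lower):
        return IS_IN_TIME_RANGE
    if upper and publish_time > upper:
        return NOT_REACH_TIME_RANGE  # newer than the range; keep crawling towards it
    if lower and publish_time < lower:
        return OVER_MIN_TIME_RANGE   # older than the range; stop paging further back
    return IS_IN_TIME_RANGE

print(classify('2019-07-01 12:00:00'))  # 2 - not reached yet
print(classify('2019-03-01 12:00:00'))  # 1 - inside the range
print(classify('2018-12-01 12:00:00'))  # 3 - beyond the lower bound
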
Example #3
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url='urls'):
        if not hasattr(self, '_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False

            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)

            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: Clear the url data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []

        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)

            max_depth = url.get('remark', {}).get('spider_depth', 0)
            # To obtain per-depth counts, url fingerprints are temporarily kept in a zset, checking the fingerprint first and the last-depth url second; when per-depth counts are not needed, a set is recommended and the last-depth url should be checked first
            if depth == max_depth - 1:  # last-depth urls are stored separately and do not need to be cleared later
                if self._db.zadd(self._table_url_dupefilter, url_id, depth) and self._db.sadd(self._table_url_end_depth_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)

            elif self._db.zadd(self._table_url_dupefilter, url_id, depth):
                url_list.append(url)
                prioritys.append(depth)

            if len(url_list) > 100:
                log.debug('Adding urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('Adding urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
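
Because the fingerprints above are kept in a zset scored by depth, per-depth totals can be read back with a score-range count. A small sketch, assuming the RedisDB.zget_count(key, priority_min=..., priority_max=...) helper that appears in the TaskManager example further below:

# Hedged sketch: read per-depth url counts back from the zset dupefilter.
# RedisDB and the zget_count keyword arguments are assumed from the TaskManager example below;
# the key mirrors UrlManager's default '_dupefilter' suffix.
db = RedisDB()
dupefilter_key = 'urls_dupefilter'

for depth in range(5):
    count = db.zget_count(dupefilter_key, priority_min=depth, priority_max=depth)
    print('depth %s: %s urls' % (depth, count))

print('total: %s urls' % db.zget_count(dupefilter_key))
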
Example #4
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
            self._table_article_bloomfilter = table_article + '_bloomfilter'

            self._bloomfilter = BloomFilter(redis_obj=self._db, key=self._table_article_bloomfilter)

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)

            log.debug('Number of articles in cache: %s' % len(self._articles_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

        if self.get_articles_count() > MAX_ARTICLE_COUNT:  # cache exceeds the maximum size, flush to the db proactively
            self.__add_article_to_db()

    def get_articles_count(self):
        return len(self._articles_deque)

    def clear_article(self):
        '''
        @summary: Clear the article data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            # Skip articles whose uuid is already in the bloomfilter; otherwise record the uuid and queue the article for storage
            if not self._bloomfilter.is_contains(article.get('uuid')):
                self._bloomfilter.insert(article.get('uuid'))
                article_list.append(article)

            if len(article_list) > 100:
                log.debug('Adding articles to the database')
                self._db.sadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('Adding articles to the database')
            self._db.sadd(self._table_article, article_list)
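
The check-then-insert dedup performed in __add_article_to_db is easiest to see with an in-memory set standing in for the redis-backed BloomFilter; this is a sketch of the semantics only, since the real filter is probabilistic and shared across processes:

# Sketch of the dedup flow, with a plain set in place of the BloomFilter.
seen = set()

def dedup(articles):
    fresh = []
    for article in articles:
        uuid = article.get('uuid')
        if uuid not in seen:       # is_contains() on the real filter
            seen.add(uuid)         # insert() on the real filter
            fresh.append(article)  # only previously unseen articles are written to redis
    return fresh

batch = [{'uuid': 'a', 'title': 't1'}, {'uuid': 'a', 'title': 'dup'}, {'uuid': 'b', 'title': 't2'}]
print(dedup(batch))  # the second 'a' is dropped
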
class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def is_have_task(self):
        '''
        @summary: Whether there are pending urls in redis
        ---------
        ---------
        @result:
        '''

        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: Get the url count of each depth
        ---------
        @param total_depth: not inclusive; the number of layers counted from the client's perspective
        ---------
        @result:
        '''

        depth_count_info = {}
        for depth in range(total_depth):
            key = 'url count at depth %s' % (depth + 1)
            depth_count_info[key] = self._redisdb.zget_count(self._news_urls_dupefilter, priority_min=depth, priority_max=depth)

        depth_count_info['total url count'] = self._redisdb.zget_count(self._news_urls_dupefilter)
        return depth_count_info

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # Fetch a page of tasks
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results: break

            # Assemble the urls into json-style task dicts
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {'website_name': website_name, 'website_position': website_position, 'website_url': website_url, 'website_domain': website_domain, 'spider_depth': spider_depth}
                url_dict = {'site_id': 1, 'url': website_url, 'depth': 0, 'remark': remark, 'retry_times': 0}

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.zadd(self._news_urls_dupefilter, url_id, prioritys=0):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)

    def clear_task(self):
        # Clear the url dupefilter table
        self._redisdb.clear('news:news_urls_dupefilter')
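
A brief usage sketch showing how these methods fit together, assuming OracleDB, RedisDB, tools and ONE_PAGE_SIZE are provided by the surrounding project:

# Hedged usage sketch: seed redis from oracle once, then let spiders poll redis for work.
task_manager = TaskManager()

if not task_manager.is_have_task():              # nothing queued in redis yet
    tasks = task_manager.get_task_from_oracle()  # ROWNUM-paged read of TAB_IOPM_SITE
    task_manager.add_task_to_redis(tasks)        # only urls missing from the dupefilter are queued

print(task_manager.get_ever_depth_count(total_depth=5))
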
Example #6
File: url_manager.py  Project: lxshen/try
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url='urls'):
        if not hasattr(self, '_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False

            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)

            log.debug('Number of urls in cache: %s' % len(self._urls_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

        if self.get_urls_count() > MAX_URL_COUNT:  # cache exceeds the maximum size, flush to the db proactively
            self.__add_url_to_db()

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: Clear the url data stored in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def print_url(self, i):
        while self._urls_deque:
            url = self._urls_deque.popleft()
            print(i, '-->', url)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []

        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)

            max_depth = url.get('remark', {}).get('spider_depth', 0)
            if depth == max_depth - 1:  # last-depth urls are stored separately and do not need to be cleared later
                if self._db.sadd(self._table_url_end_depth_dupefilter,
                                 url_id) and self._db.sadd(
                                     self._table_url_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)
                    # Count urls per depth by adding url_id to a per-depth set; comment this out if per-depth stats are not needed
                    self._db.sadd(self._table_url_dupefilter + str(depth),
                                  url_id)

            elif self._db.sadd(self._table_url_dupefilter, url_id):
                url_list.append(url)
                prioritys.append(depth)
                # Count urls per depth by adding url_id to a per-depth set; comment this out if per-depth stats are not needed
                self._db.sadd(self._table_url_dupefilter + str(depth), url_id)

            if len(url_list) > 100:
                log.debug('Adding urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('Adding urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
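
Finally, the url payloads this manager expects are plain dicts; the sketch below shows the shape implied by __add_url_to_db (a url, a depth, and a remark carrying spider_depth), with placeholder values and the same project helpers assumed as above:

# Hedged usage sketch for UrlManager; field names follow what __add_url_to_db reads, values are placeholders.
url_manager = UrlManager(table_url='urls')
url_manager.start()

seed = {
    'url': 'http://example.com/',
    'depth': 0,
    'retry_times': 0,
    'remark': {'spider_depth': 3},  # max crawl depth; depth == spider_depth - 1 is treated as the last layer
}
url_manager.put_urls(seed)          # accepts a single dict or a list of dicts

tools.delay_time(2)                 # let the background thread flush the deque to redis
print(url_manager.get_urls_count()) # urls still waiting in the in-memory deque
url_manager.stop()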