Exemplo n.º 1
0
    def __init__(self, tab_urls, depth, process_num=None):
        '''
        @summary: Prepare collector state, the Redis client and config-driven limits.
        ---------
        @param tab_urls: kept on the instance and passed to UrlManager
        @param depth: stored as given (the config fallback is disabled)
        @param process_num: process number; appended to the worker mark when truthy
        ---------
        @result:
        '''

        def collector_conf(option):
            # Integer option read from the "collector" section of config.conf.
            return int(tools.get_conf_value('config.conf', "collector", option))

        super(Collector, self).__init__()

        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth

        self._interval = collector_conf("sleep_time")
        self._allowed_null_times = collector_conf('allowed_null_times')
        self._url_count = collector_conf("url_count")

        self._url_manager = UrlManager(tab_urls)
        self._finished_callback = None
        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        # A falsy process_num (None, 0, '') leaves the mark as the bare host IP.
        mark_suffix = '_%s' % process_num if process_num else ''
        self._worker_mark = LOCAL_HOST_IP + mark_suffix
Exemplo n.º 2
0
    def __init__(self):
        """Create the DB clients and cache Redis key names and spider settings.

        Everything is read from the module-level ``config`` mapping: the
        'mysqldb' and 'redisdb' sections are expanded into the client
        constructors, and the 'spider' section supplies the remaining options.
        """
        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        # All remaining options live under 'spider'; look the section up once.
        spider_conf = config.get('spider')

        self._task_root_key = spider_conf.get('redis_task_cache_root_key')

        # Redis key names derived from the task root key.
        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = (
            self._task_root_key + ':h_last_article_publish_time')
        self._new_last_article_publish_time = (
            self._task_root_key + ':h_new_last_article_publish_time')

        self._ignore_haved_crawl_today_article_account = spider_conf.get(
            'ignore_haved_crawl_today_article_account')
        self._monitor_interval = spider_conf.get('monitor_interval')
        self._zombie_account_not_publish_article_days = spider_conf.get(
            'zombie_account_not_publish_article_days')

        # The original assigned _spider_interval_max twice; once is enough.
        interval_conf = spider_conf.get('spider_interval')
        self._spider_interval_min = interval_conf.get('min_sleep_time')
        self._spider_interval_max = interval_conf.get('max_sleep_time')

        # "start~end" string split into [start, end]; a missing or empty
        # value becomes ['', ''].
        self._crawl_time_range = (spider_conf.get("crawl_time_range")
                                  or "~").split('~')
    def __init__(self, table_article='articles'):
        """Set up the article queue, Redis client and table name.

        When ``_table_article`` already exists on ``self`` the call does
        nothing, so running ``__init__`` again leaves existing state intact.

        @param table_article: name stored as ``_table_article``
        """
        if hasattr(self, '_table_article'):
            return

        super(ArticleManager, self).__init__()

        self._thread_stop = False
        self._articles_deque = collections.deque()
        self._db = RedisDB()
        self._table_article = table_article
Exemplo n.º 4
0
def monitor_proxies():
    """Log how many entries remain under the two Redis keys named in config.conf.

    The key names are read from the 'redis' section ('redis_key' and
    'redis_key2') of the config.conf located one directory above this file.
    """
    redis_0 = RedisDB()

    # Join the components properly: the original concatenated '/../config.conf'
    # onto dirname(__file__), which turns into the root-anchored path
    # '/../config.conf' whenever dirname(__file__) is the empty string.
    config = os.path.join(os.path.dirname(__file__), '..', 'config.conf')

    redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
    redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')

    # Named *_count rather than `sum` to avoid shadowing the builtin.
    douban_count = redis_0.count(redis_key)
    weibo_count = redis_0.count(redis_key2)

    log.debug("douban当前redis库中剩余ip总数:%d" % douban_count)
    log.debug("weibo当前redis库中剩余ip总数:%d" % weibo_count)
Exemplo n.º 5
0
    def __init__(self, table_url='urls'):
        """Set up the URL queue, Redis client and the derived dupefilter table names.

        When ``_table_url`` already exists on ``self`` the call does nothing,
        so running ``__init__`` again leaves existing state intact.

        @param table_url: base table name; both dupefilter names are built from it
        """
        if hasattr(self, '_table_url'):
            return

        super(UrlManager, self).__init__()

        self._thread_stop = False
        self._urls_deque = collections.deque()
        self._db = RedisDB()

        self._table_url = table_url
        self._table_url_dupefilter = '%s_dupefilter' % self._table_url
        self._table_url_end_depth_dupefilter = (
            '%s_end_depth_dupefilter' % self._table_url)
Exemplo n.º 6
0
    def __init__(self, table_article='articles'):
        """Set up the article queue, Redis client and the bloom filter.

        When ``_table_article`` already exists on ``self`` the call does
        nothing, so running ``__init__`` again leaves existing state intact.

        @param table_article: base table name; the bloom filter key is
                              ``table_article + '_bloomfilter'``
        """
        if hasattr(self, '_table_article'):
            return

        super(ArticleManager, self).__init__()

        self._thread_stop = False
        self._articles_deque = collections.deque()
        self._db = RedisDB()

        self._table_article = table_article
        self._table_article_bloomfilter = table_article + '_bloomfilter'

        # The filter is handed this instance's Redis client and its own key.
        self._bloomfilter = BloomFilter(
            redis_obj=self._db,
            key=self._table_article_bloomfilter,
        )
Exemplo n.º 7
0
def random_proxy():
    """Return a requests-style proxies dict built from one Redis pool entry.

    One entry is fetched via ``sget`` from the table named by the
    module-level ``redis_key`` and used for both "http" and "https".
    Any exception along the way is printed and an empty dict is returned.
    """
    try:
        candidates = RedisDB().sget(table=redis_key, count=1)
        chosen = random.choice(candidates)
    except Exception as e:
        print(e)
        return {}

    return {"http": chosen, "https": chosen}
    def __init__(self, tab_urls, depth):
        """Prepare collector state, the Redis client and config-driven limits.

        @param tab_urls: kept on the instance and passed to UrlManager
        @param depth: stored as given (the config fallback is disabled)
        """

        def collector_conf(option):
            # Integer option read from the "collector" section of config.conf.
            return int(tools.get_conf_value('config.conf', "collector", option))

        super(Collector, self).__init__()

        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth

        self._interval = collector_conf("sleep_time")
        self._allowed_null_times = collector_conf('allowed_null_times')
        self._url_count = collector_conf("url_count")

        self._url_manager = UrlManager(tab_urls)
        self._finished_callback = None
        self._is_show_wait = False
Exemplo n.º 9
0
    def __init__(self):
        """Initialise the base class, then create the ES and Redis clients."""
        super(SyncArtice, self).__init__()

        self._es = ES()
        self._redis = RedisDB()
        # Starts at zero; presumably counts synced records - confirm against
        # the methods that update it.
        self._sync_count = 0
Exemplo n.º 10
0
        print(f"{bcolors.OKGREEN}Client 2:{bcolors.ENDC} "
              f"{bcolors.OKCYAN}Updating resource '" + RESOURCE_NAME +
              f"' {bcolors.ENDC}")

        # Update the resource
        redis.update(name=RESOURCE_NAME, key="client", value="client_2")
        redis.update(name=RESOURCE_NAME, key="random", value=random.random())
    else:
        print(
            f"{bcolors.FAIL}Client 2: Error acquiring the lock on resource '" +
            RESOURCE_NAME + f"' {bcolors.ENDC}")


# Module-level Redis client (project RedisDB wrapper), constructed with its
# default arguments.
redis = RedisDB()

# Redlock distributed-lock manager configured with a single Redis server:
# localhost:6379, database 0.
dlm = Redlock([
    {
        "host": "localhost",
        "port": 6379,
        "db": 0
    },
])

# Banner announcing which test scenario this script runs.
print(f"{bcolors.OKBLUE}## EXECUTING TEST 6 ##{bcolors.ENDC}")
print(
    f"{bcolors.OKBLUE} Several clients, Several locks, one resource, client blocked {bcolors.ENDC}"
)
Exemplo n.º 11
0
 def __init__(self):
     """Create the Redis client and store the Douban URL as ``test_url``."""
     self.redis = RedisDB()
     # presumably the URL requested when checking a proxy - confirm with callers
     self.test_url = "https://movie.douban.com/"
Exemplo n.º 12
0
 def __init__(self):
     """Create the Redis client and store the Weibo mobile URL as ``test_url``."""
     self.redis = RedisDB()
     # presumably the URL requested when checking a proxy - confirm with callers
     self.test_url = 'https://m.weibo.cn/'
Exemplo n.º 13
0
 def __init__(self):
     """Create the Oracle, Redis and WechatSogou helpers used by this object."""
     self._oracledb = OracleDB()
     self._redisdb = RedisDB()
     self._wechat_sogo = WechatSogou()
Exemplo n.º 14
0
class WechatService():
    """Class-level service for choosing WeChat accounts to crawl and storing results.

    All state (clients, todo queue, availability flags) is stored on the
    class, so every instance shares it. The clients below are created when
    the class body executes.

    NOTE: parameters spelled ``__biz`` are name-mangled by Python inside a
    class body, so callers outside the class must pass them positionally.
    """

    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # one full round has been completed
    _is_all_done = False  # every account's articles published today are crawled

    # wechat_sogou availability flag and the timestamp of its last block/retry
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # wechat_public_platform availability flag and the timestamp of its last
    # block/retry
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        """Fetch a batch from the 'wechat:account' Redis key into the todo queue."""
        accounts = WechatService._redisdb.sget('wechat:account', count=1)

        # SECURITY(review): eval() executes whatever text Redis returns. If
        # the stored values are plain literals, ast.literal_eval would be a
        # safer parser - confirm the stored format before switching.
        WechatService._todo_accounts.extend(
            eval(account) for account in accounts)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: Check whether the account has newly published articles.
                  Sogou WeChat is asked first; the WeChat public platform is
                  the fallback when Sogou gave no answer or asked for a
                  verification code.
        ---------
        @param account_id: passed to the Sogou check as account_id
        @param account_name: passed to the Sogou check as account
        @param __biz: passed to the public-platform check
        ---------
        @result: the status returned by the last source consulted, or ''
                 when neither source was queried
        '''

        result = ''
        sogou = WechatService._wechat_sogou
        platform = WechatService._wechat_public_platform

        if WechatService._wechat_sogou_enable:  # Sogou is considered usable
            result = sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.VERIFICATION_CODE:
                # Blocked: disable Sogou and remember when it happened.
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = (
                    tools.get_current_timestamp())

        # Sogou is disabled, but more than TIME_INTERVAL has passed since the
        # recorded time (24 hours according to the original comment), so try
        # it once more.
        elif (tools.get_current_timestamp() -
              WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL):
            result = sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            # NOTE(review): only UPDATE re-enables Sogou; a NOT_UPDATE answer
            # leaves it disabled - confirm that this is intended.
            if result == constance.UPDATE:
                WechatService._wechat_sogou_enable = True

            # Refresh the timestamp so the next retry waits a full interval.
            WechatService._wechat_sogou_last_unenable_time = (
                tools.get_current_timestamp())

        # Sogou gave nothing usable: fall back to the WeChat public platform.
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:
                result = platform.is_have_new_article(__biz)
                if result == constance.ERROR:
                    # Request failed: disable the platform and record the time.
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = (
                        tools.get_current_timestamp())

            elif (tools.get_current_timestamp() -
                  WechatService._wechat_public_platform_last_unenable_time >
                  TIME_INTERVAL):
                # Disabled, but the wait interval has elapsed: retry once.
                result = platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    WechatService._wechat_public_platform_enable = True

                # Refresh the timestamp so the next retry waits a full interval.
                WechatService._wechat_public_platform_last_unenable_time = (
                    tools.get_current_timestamp())

        return result

    def get_next_account(self):
        '''
        @summary: Pop the next account from the todo queue (reloading the
                  queue from Redis when it is empty) and set its
                  spider_status to 602 in TAB_IOPM_SITE.
        ---------
        ---------
        @result: (account_id, biz), or None when no account is available
        '''

        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            return None

        # Each queued record has five fields; only account_id and biz are used.
        _, account_id, _, _, biz = WechatService._todo_accounts.popleft()

        # NOTE(review): biz is interpolated straight into the SQL text; switch
        # to bind parameters if the DB wrapper supports them.
        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            biz)
        WechatService._db.update(sql)

        return account_id, biz

    def update_account_article_num(self, __biz):
        """Count the account's articles in ES and write the totals to TAB_IOPM_SITE.

        Two searches run against 'wechat_article': one restricted to records
        whose record_time falls on the current date, one unrestricted. The
        update also sets spider_status to 603. total_msg is only written when
        the unrestricted count is non-zero.

        @param __biz: account identifier matched against the "__biz" field
        """
        # Evaluate the date once so both range bounds refer to the same day
        # even if the call straddles midnight.
        today = tools.get_current_date('%Y-%m-%d')
        biz_match = {'match': {"__biz": __biz}}

        # NOTE(review): the "filtered" query belongs to older Elasticsearch
        # versions - confirm the cluster version before changing it.
        # Today's count
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": today + ' 00:00:00',
                                "lte": today + ' 23:59:59'
                            }
                        }
                    },
                    "query": biz_match
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # All-time count
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": biz_match
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        """Return True when ES yields a truthy document for data_id in table.

        The table name is used both as the index and as the doc_type.
        """
        return bool(
            WechatService._es.get(table, data_id=data_id, doc_type=table))

    def add_article_info(self, article_info):
        '''
        @summary: Log the article's key fields and store it in the
                  'wechat_article' index under its article_id.
        ---------
        @param article_info: dict; must contain title, release_time, author,
                             account and url (read with []), article_id is
                             read with .get()
        ---------
        @result:
        '''

        log.debug('''
            -----文章信息-----
            标题     %s
            发布时间 %s
            作者     %s
            公众号   %s
            url      %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        """Log the account as JSON and store it in 'wechat_account' keyed by its '__biz'."""
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
Exemplo n.º 15
0
 def __init__(self):
     """Create the Redis client used by this object."""
     self.redis = RedisDB()
Exemplo n.º 16
0
import socket
import os, signal
import time
from storage_module.dection_ping_proxy import check_ip
from retrying import retry
# def write(content_info):
#     f = open('D:\start_get_ip\pid.txt','a',encoding="utf-8")
#     f.write(str(content_info)+"\n")
#     f.close()
# pid=os.getpid()
# print(pid)
# write(pid)
# while True:
#     print(1)
#     time.sleep(34)
# Shared Redis client for this module.
redis_0 = RedisDB()
MAX_POOL = 400

# Raw string with the same value as before. The original literal relied on
# the invalid escape sequence '\p' and wrapped an already-complete path in a
# single-argument os.path.join(), which returns its argument unchanged.
config = r'D:\proxy\config.conf'

# Redis key names read from the 'redis' section of the config file.
redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')

# IP address that this machine's own hostname resolves to.
bj_ip = socket.gethostbyname(socket.gethostname())


def retry(attempt):
    def decorator(func):
        def wrapper(*args, **kw):
            att = 0
            while att < attempt:
                try:
                    return func(*args, **kw)
Exemplo n.º 17
0
class TaskService():
    """Pages site tasks out of TAB_IOPM_SITE into a ring buffer and hands them out.

    All state is stored on the class, so every instance shares the buffer,
    the paging offset and the lock. The ring buffer and both DB clients are
    created when the class body executes.
    """

    _task_ring_buff = RingBuff(TASK_BUFFER_SIZE)
    _offset = 1  # rownum of the first row of the next page (1 = new round)
    _lock = threading.RLock()
    _spider_start_timestamp = 0
    _spider_end_timestamp = 0
    _total_task_size = 0
    _db = OracleDB()
    _redisdb = RedisDB()

    def __init__(self):
        pass

    def load_task(self):
        """Load the next page of tasks from the database into the ring buffer.

        At the start of a round (offset == 1) the round statistics are reset
        and the two Redis URL tables are cleared. When a page comes back
        empty the round is finished: the summary is logged, the offset is
        reset and, if the round found any tasks, the first page of the next
        round is loaded immediately.
        """
        if TaskService._offset == 1:
            log.info('开始新的一轮抓取')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # Clear the URL tables
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')

        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size=TaskService._offset + TASK_BUFFER_SIZE,
                   offset=TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)
        TaskService._total_task_size += len(tasks)

        if not tasks:
            TaskService._spider_end_timestamp = tools.get_current_timestamp()
            log.info('已做完一轮,共处理网站%s个 耗时%s' % (
                TaskService._total_task_size,
                tools.seconds_to_h_m_s(TaskService._spider_end_timestamp -
                                       TaskService._spider_start_timestamp)))
            TaskService._offset = 1

            # Only start the next round right away if this one found tasks.
            # Restarting after a completely empty round would recurse until
            # RecursionError; the next get_task() call retries instead.
            if TaskService._total_task_size:
                self.load_task()

        TaskService._task_ring_buff.put_data(tasks)

    def get_task(self, count=TASK_COUNT):
        """Return up to ``count`` tasks, reloading the buffer once if it is empty.

        @param count: number of tasks requested from the ring buffer
        @result: dict with 'tasks' and 'thread_count' (THREAD_COUNT)
        """
        # The context manager releases the lock even when loading raises, so
        # a failed DB query cannot leave the lock held.
        with TaskService._lock:
            tasks = TaskService._task_ring_buff.get_data(count)
            if not tasks:
                self.load_task()
                tasks = TaskService._task_ring_buff.get_data(count)

        return {'tasks': tasks, 'thread_count': THREAD_COUNT}

    def update_task_status(self, tasks, status):
        """Write the current time and ``status`` to each task's row in tab_iopm_site.

        @param tasks: iterable of task rows; element 0 of each row is the site id
        @param status: value written to spider_status
        """
        # NOTE(review): the Oracle format 'yyyy-mm-dd :hh24:mi:ss' has a colon
        # before hh24 - confirm it matches what tools.get_current_date() returns.
        with TaskService._lock:
            for task in tasks:
                website_id = task[0]

                sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd :hh24:mi:ss'), t.spider_status = %s where id = %s" % (
                    tools.get_current_date(), status, website_id)

                TaskService._db.update(sql)
Exemplo n.º 18
0
 def __init__(self):
     """Create the Oracle and Redis clients and set the two Redis table names."""
     self._oracledb = OracleDB()
     self._redisdb = RedisDB()
     # Redis table names for news URLs and their duplicate filter.
     self._news_url_table = 'news:news_urls'
     self._news_urls_dupefilter = 'news:news_urls_dupefilter'