Example #1
    def __init__(self, queue, p_action, name, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_retry_times):
        """
        Producer thread initialization parameters
        :param queue:            the work queue
        :param p_action:         the producer action instance
        :param name:             thread name
        :param p_sleep_time:     interval between producer runs
        :param c_max_num:        maximum number of consumer threads
        :param c_max_sleep_time: maximum sleep time between consumer runs
        :param c_retry_times:    maximum retry count for a consumer action
        """
        super(self.__class__, self).__init__()
        self.queue = queue
        self.p_action = p_action
        self.name = name
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_retry_times = c_retry_times

        # verify that p_action is a ProducerAction subclass; raise if not
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not a ProducerAction instance" %
                            self.p_action.__class__.__name__)
        # initialize the logger
        self.logger = LogUtil().get_logger("producer_%s" % self.name,
                                           "producer_%s" % self.name)
Example #2
    def __init__(self, url, param, queue_id, pro_flag, queue_name):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.logger = LogUtil().get_logger('consumer', 'consumer' + queue_name)
Example #3
    def __init__(self, id, act, params, max_fail_times):
        super(self.__class__, self).__init__()
        self.id = id
        self.act = act
        self.params = params
        self.max_fail_times = max_fail_times

        self.logger = LogUtil().get_logger("HainiuConsumerAction",
                                           "HainiuConsumerAction")
Example #4
    def __init__(self, queue, name, max_sleep_time, retry_times):
        super(self.__class__, self).__init__()
        self.queue = queue
        self.name = name
        self.max_sleep_time = max_sleep_time
        self.retry_times = retry_times
        Consumer._MAX_RETRY_TIMES = retry_times
        # initialize the logger
        self.logger = LogUtil().get_logger("consumer_%s" % self.name,
                                           "consumer_%s" % self.name)
Example #5
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finished, %s items remaining' %
                (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # always read from offset 0: each processed batch is flipped to
            # status=1 below, so the remaining status=0 rows shift forward
            sql = select_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for l in rows:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''

                id = l[2]

                param = '%s##%s' % (str(id), param1)
                values.append((url, param))

                id_values.append(str(id))
            if len(id_values) != 0:
                d.executemany_no_commit(
                    insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
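Note that the select above always reads from offset 0: each batch is immediately flipped to status=1, so the remaining status=0 rows shift forward and paging by i * page_size would skip rows. The same drain pattern is shown below against an in-memory SQLite table as a self-contained illustration; the table and column names are invented.

# The same "select from offset 0, then mark processed" drain pattern,
# shown against an in-memory SQLite table (Python 3, names invented).
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table seed (id integer primary key, url text, status integer)")
conn.executemany("insert into seed (url, status) values (?, 0)",
                 [("http://example.com/%d" % i,) for i in range(5)])

page_size = 2
while True:
    # always offset 0: processed rows leave the status=0 set
    rows = conn.execute("select id, url from seed where status=0 limit ?",
                        (page_size,)).fetchall()
    if not rows:
        break
    ids = ",".join(str(r[0]) for r in rows)
    conn.execute("update seed set status=1 where id in (%s)" % ids)
    conn.commit()
    print("processed batch: %s" % (rows,))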
Example #6
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finished, %s items remaining' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for l in rows:
                url = l[0]
                publisher = get_fld(url)
                # keep only the registrable name, e.g. 'sina.com.cn' -> 'sina'
                publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))

            if len(values) != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
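get_fld here presumably comes from the tld package (pip install tld) and returns the registrable domain; the slice then keeps only the leading label. A quick illustration under that assumption:

# How the publisher name is derived above, assuming get_fld comes from
# the tld package (pip install tld).
from tld import get_fld

url = "http://news.example.co.uk/story/1"
publisher = get_fld(url)  # "example.co.uk"
publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
print(publisher)          # "example"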
Example #7
    def __init__(self, max_fail_times, limit_num):
        super(self.__class__, self).__init__()
        self.max_fail_times = max_fail_times
        self.limit_num = limit_num
        self.logger = LogUtil().get_logger('HainiuProducerAction',
                                           'HainiuProducerAction')
Example #8
def push_queue_items():
    # number of matching seed rows already sitting in the queue
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # insert queue rows, condition: type=3
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    # logger
    rl = LogUtil().get_base_logger()

    redisdb = RedisUtill()
    try:

        # start time
        starttime = time.time()

        redis_data_status = True
        # thread lock key
        lock_key = 'get_news_seed_internally_data'
        sql = ""
        total_all = 0

        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")
        # if the previous queue rows are not processed yet, do not push new rows
        sql = count_news_seed_queue_sql

        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finished, %s items remaining' %
                (queue_total))
            # return

        while redis_data_status:

            is_lock = redisdb.get_conn().exists(lock_key)

            if not is_lock:
                # take the lock; it expires after 10 seconds
                locked = redisdb.get_lock(lock_key, 10)
                if not locked:
                    rl.info('cannot acquire the thread lock, exiting the download queue thread')
                    continue

                ips = config._REDIS_CLUSTER_CONFIG['IPS']
                port = config._REDIS_CLUSTER_CONFIG['PORT']

                def scan_limit_to_queue_table(host, port, cursor, match,
                                              count):
                    total_num = 0
                    r = redis.Redis(host, port)
                    rs = r.scan(cursor, match, count)
                    next_num = rs[0]
                    key_list = []
                    value_list = []
                    for k in rs[1]:
                        key_list.append(k)
                        total_num += 1

                    print total_num
                    values = redisdb.get_values_batch_keys(key_list)

                    for v in values:
                        value_list.append((v, ''))
                    print value_list

                    sql = insert_news_seed_internally_queue_items_sql
                    d.executemany(sql, value_list)

                    redisdb.delete_batch(rs[1])

                    if next_num == 0:
                        return total_num
                    return total_num + scan_limit_to_queue_table(
                        host, port, next_num, match, count)

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(
                        ip, port, 0, 'down:*', 10)
                    print '======'
                print total_num
                total_all = total_num  # record the pushed total for the summary log

                if total_num > 0:
                    break

                redisdb.release(lock_key)
            else:
                rl.info('another thread is processing, please wait')
                time.sleep(0.3)
        endtime = time.time()
        # total elapsed time
        worksec = int(round((endtime - starttime)))
        # log the result

        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total_all, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        redisdb.release(lock_key)
        d.close()
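scan_limit_to_queue_table walks one Redis node with SCAN, pushes the matched keys' values into hainiu_queue, deletes the drained keys, and recurses on the returned cursor until it comes back as 0. The same cursor walk can be written iteratively with plain redis-py; this is a sketch, with the MySQL insert step elided to a comment.

# Iterative equivalent of the recursive SCAN walk above (redis-py);
# the MySQL insert step is elided to a comment.
import redis


def scan_keys_into_queue(host, port, match, count=10):
    r = redis.Redis(host=host, port=port)
    cursor, total = 0, 0
    while True:
        cursor, keys = r.scan(cursor, match=match, count=count)
        if keys:
            values = r.mget(keys)  # batch-read the matched keys
            # ...executemany the (value, '') rows into hainiu_queue here...
            r.delete(*keys)        # drop the drained keys
            total += len(keys)
        if cursor == 0:            # cursor 0 means the scan is complete
            return total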
Example #9
    def __init__(self, url, param, queue_id):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.rl = LogUtil().get_logger('NewsFindConsumer', 'NewsFindConsumer')
Example #10
    def __init__(self, limit, fail_times):
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('NewsFindProducer', 'NewsFindProducer')
Example #11
    def __init__(self, name):
        super(self.__class__, self).__init__()
        self.name = name

        self.logger = LogUtil().get_logger("DemoConsumerAction",
                                           'DemoConsumerAction')
Example #12
    def __init__(self, limit, pro_flag, fail_times, queue_name):
        self.limit = limit
        self.fail_times = fail_times
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)