Пример #1
0
def push_queue_items():
    """Move pending internal news seeds into ``hainiu_queue`` as type-3 rows.

    If type-3 queue rows with ``fail_times=0`` still exist, only a log line
    is written and the function returns.  Otherwise pages of
    ``hainiu_web_seed_internally`` rows with ``status=0`` are read; for each
    page one queue row per seed is inserted (``action`` = seed url,
    ``params`` = ``"<id>##<param>"``) and those seeds are set to
    ``status=1``.

    Errors are logged and not re-raised; on failure the connection is rolled
    back, and it is closed in every case.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()

    # Pre-bind the names used by except/finally so an early failure (e.g. the
    # connection cannot be opened) is reported instead of being masked by a
    # NameError.
    d = None
    sql = ''
    try:
        # One connection is used for the whole run; opening a second one
        # would leave the first unclosed.
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            return

        # Wall-clock time: the log line below reports elapsed seconds.
        starttime = time.time()
        sql = count_news_seed_internally_sql
        total = int(d.read_one(sql)[0])
        page_size = 2
        page = total // page_size
        for _ in range(0, page + 1):
            # The offset stays 0 on purpose: rows handled in the previous
            # pass were switched to status=1 and no longer match the filter.
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                seed_param = row[1] if row[1] is not None else ''
                seed_id = str(row[2])
                values.append((url, '%s##%s' % (seed_id, seed_param)))
                id_values.append(seed_id)
            if id_values:
                d.executemany_no_commit(
                    insert_news_seed_internally_queue_items_sql, values)
                sql = update_news_seed_internally_sql % (','.join(id_values))
                # NOTE(review): presumably execute() commits the pending
                # inserts together with this update - confirm in DBUtil.
                d.execute(sql)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total, worksec))
    except Exception:
        # NOTE(review): exception() is called without a message; confirm the
        # logger returned by get_base_logger() accepts that.
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
Пример #2
0
def push_queue_items():
    """Copy every active news seed into ``hainiu_queue`` as a type-1 row.

    If type-1 queue rows with ``fail_times=0`` still exist, only a log line
    is written and the function returns.  Otherwise ``hainiu_web_seed`` rows
    with ``status=0`` are read in pages of 1000; each seed becomes a queue
    row whose ``action`` is the seed url and whose ``params`` is a JSON
    object with ``category`` and ``publisher`` (the part of ``get_fld(url)``
    before its first dot).  Each page is shuffled before it is inserted.

    Errors are logged and not re-raised; on failure the connection is rolled
    back, and it is closed in every case.
    """
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()

    # Pre-bind the names used by except/finally so an early failure is
    # reported instead of being masked by a NameError.
    d = None
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        # Wall-clock time: the log line below reports elapsed seconds.
        starttime = time.time()
        sql = count_news_seed_sql
        total = int(d.read_one(sql)[0])
        page_size = 1000
        page = total // page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                # Keep only the label before the first dot of the domain.
                publisher = get_fld(url).split('.', 1)[0]
                param = json.dumps(
                    {'category': row[1], 'publisher': publisher},
                    ensure_ascii=False)
                values.append((url, param))

            if values:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except Exception:
        # NOTE(review): exception() is called without a message; confirm the
        # logger returned by get_base_logger() accepts that.
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
Пример #3
0
class DemoConsumerAction(ConsumerAction):
    """Demo consumer action that logs its name and reports success."""

    def __init__(self, name):
        """Store ``name`` and create the logger used by :meth:`action`.

        :param name: label written to the log and echoed by the callbacks
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(DemoConsumerAction, self).__init__()
        self.name = name

        self.logger = LogUtil().get_logger("DemoConsumerAction",
                                           'DemoConsumerAction')

    def action(self):
        """Log the consumption and return ``self.result(True, [self.name])``."""
        self.logger.info('consume %s' % self.name)

        flag = True

        return self.result(flag, [self.name])

    def success_action(self):
        """Print a success marker for this action's name."""
        print('success_op() ==> %s' % self.name)

    def fail_action(self):
        """Print a failure marker for this action's name."""
        print('fail_op() ==> %s' % self.name)
Пример #4
0
class Consumer(threading.Thread):
    """Worker thread that takes actions off a shared queue and runs them.

    Each action's ``action()`` result is logged; a failed action (falsy
    ``result[0]``) is put back on the queue until its
    ``current_retry_times`` reaches ``retry_times``.
    """

    # Overwritten with ``retry_times`` by every instance that is constructed.
    _MAX_RETRY_TIMES = 0

    def __init__(self, queue, name, max_sleep_time, retry_times):
        """
        :param queue:          queue the actions are taken from
        :param name:           thread name, also used in the logger name
        :param max_sleep_time: sleep while the queue is empty, and upper
                               bound of the random pause after each action
        :param retry_times:    how many times a failed action is re-queued
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Consumer, self).__init__()
        self.queue = queue
        self.name = name
        self.max_sleep_time = max_sleep_time
        self.retry_times = retry_times
        Consumer._MAX_RETRY_TIMES = retry_times
        # Initialise the logger.
        self.logger = LogUtil().get_logger("comsumer_%s" % self.name,
                                           "comsumer_%s" % self.name)

    def run(self):
        """Consume actions forever; exceptions are logged, never raised."""
        while True:
            try:
                # Nothing to do: sleep for a while, then check again.
                if self.queue.empty():
                    time.sleep(self.max_sleep_time)
                    continue

                start_time = time.time()

                # Take the next action off the queue.
                action = self.queue.get()
                try:
                    action.consumer_thread_name = self.name

                    # Run the action.
                    result = action.action()

                    rs = 'SUCCESS' if result[0] else 'FAIL'

                    end_time = time.time()

                    # Random pause applied after this action.
                    random_sleep_time = round(
                        random.uniform(0.2, self.max_sleep_time), 2)

                    run_time = end_time - start_time

                    self.logger.info(
                        "queue.name=【comsumer_%s】, run_time=%d, sleep_time=%d, retry_times=%d, "
                        " result=%s, detail=%s" %
                        (self.name, run_time, random_sleep_time,
                         action.current_retry_times, rs, result[1:]))

                    # A failed action is re-queued while it still has
                    # retries left.  The put happens before task_done() so
                    # the unfinished count never drops to zero in between.
                    if (not result[0]
                            and action.current_retry_times < self.retry_times):
                        action.current_retry_times += 1
                        self.queue.put(action)
                finally:
                    # Every get() must be matched by exactly one
                    # task_done(), even when the action raised; otherwise
                    # the queue's unfinished count grows forever.
                    self.queue.task_done()

                # Random pause before the next action.
                time.sleep(random_sleep_time)
            except Exception as message:
                self.logger.exception(message)
def push_queue_items():
    """Drain ``down:*`` keys from the Redis nodes into ``hainiu_queue``.

    Under the Redis lock ``get_news_seed_internally_data``, every host in
    ``config._REDIS_CLUSTER_CONFIG['IPS']`` is scanned for ``down:*`` keys.
    The values of the found keys are inserted as type-3 queue rows
    (``action`` = value, ``params`` = ``''``) and the keys are deleted.  The
    loop repeats until a pass moves at least one item.

    Errors are logged and not re-raised; on failure the connection is rolled
    back.  The connection is closed in every case, and the lock is released
    if this call holds it.
    """
    # Counts queue rows of this type that have not been processed yet.
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # Inserts one type-3 queue row.
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    rl = LogUtil().get_base_logger()

    redisdb = RedisUtill()
    lock_key = 'get_news_seed_internally_data'

    # Pre-bind everything except/finally touch so an early failure is
    # reported instead of being masked by a NameError.
    d = None
    sql = ""
    lock_held = False

    def scan_limit_to_queue_table(host, port, cursor, match, count):
        """Move all keys matching ``match`` on one Redis node to the queue.

        Follows the SCAN cursor until it comes back as 0 and returns the
        number of keys handled.  Iterative, so a large keyspace cannot
        exhaust the recursion limit.
        """
        r = redis.Redis(host, port)
        moved = 0
        while True:
            rs = r.scan(cursor, match, count)
            cursor = rs[0]
            key_list = list(rs[1])

            print(len(key_list))
            values = redisdb.get_values_batch_keys(key_list)

            value_list = [(v, '') for v in values]
            print(value_list)

            d.executemany(insert_news_seed_internally_queue_items_sql,
                          value_list)

            redisdb.delete_batch(rs[1])

            moved += len(key_list)
            if cursor == 0:
                return moved

    try:
        # Wall-clock start; the end stamp below uses the same clock.
        starttime = time.time()
        total_all = 0

        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")

        # Unprocessed rows from the previous run are only logged; the early
        # return that used to follow is intentionally left disabled.
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))

        ips = config._REDIS_CLUSTER_CONFIG['IPS']
        port = config._REDIS_CLUSTER_CONFIG['PORT']
        # From here on a database failure concerns the insert statement.
        sql = insert_news_seed_internally_queue_items_sql

        # NOTE(review): when a pass moves nothing the loop retries at once
        # without sleeping - confirm this polling rate is intended.
        while True:
            is_lock = redisdb.get_conn().exists(lock_key)

            if is_lock == False:
                # Take the lock; the second argument is presumably a
                # 10-second expiry - confirm in RedisUtill.get_lock.
                lockd = redisdb.get_lock(lock_key, 10)
                if lockd == False:
                    rl.info('无法获取线程锁,退出采集下载queue线程 ')
                    continue
                lock_held = True

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(
                        ip, port, 0, 'down:*', 10)
                    print('======')
                print(total_num)

                if total_num > 0:
                    # The lock is still held here; finally releases it.
                    total_all += total_num
                    break

                redisdb.release(lock_key)
                lock_held = False
            else:
                rl.info('其他线程正在处理,请等待 ')
                time.sleep(0.3)
        endtime = time.time()
        # Total elapsed seconds.
        worksec = int(round((endtime - starttime)))

        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s'
            % (total_all, worksec))
    except Exception:
        # NOTE(review): exception() is called without a message; confirm the
        # logger returned by get_base_logger() accepts that.
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        try:
            # Only release a lock this call actually took, so another
            # worker's lock is never dropped.
            if lock_held:
                redisdb.release(lock_key)
        finally:
            if d is not None:
                d.close()
Пример #6
0
class Producer(threading.Thread):
    """
    Producer thread: asks its producer action for consumer actions and feeds
    them into the shared queue.
    """
    def __init__(self, queue, p_action, name, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_retry_times):
        """
        Initialise the producer thread.

        :param queue:            queue the actions are put on
        :param p_action:         ProducerAction instance that creates actions
        :param name:             thread name, also used in the logger name
        :param p_sleep_time:     pause between two production rounds
        :param c_max_num:        maximum number of consumer threads; used as
                                 the limit of unfinished queue tasks
        :param c_max_sleep_time: maximum pause of a consumer between actions
                                 (stored only; not used by this class)
        :param c_retry_times:    maximum retries of a consumer action
                                 (stored only; not used by this class)
        :raises Exception: if ``p_action`` is not a ProducerAction instance
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Producer, self).__init__()
        self.queue = queue
        self.p_action = p_action
        self.name = name
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_retry_times = c_retry_times

        # Reject anything that is not a ProducerAction instance.  Ordinary
        # instances have no __name__, so fall back to repr() rather than
        # failing with an AttributeError while building the message.
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance" %
                            getattr(self.p_action, '__name__',
                                    repr(self.p_action)))
        # Initialise the logger.
        self.logger = LogUtil().get_logger("producer_%s" % self.name,
                                           "producer_%s" % self.name)

    def run(self):
        """Produce and enqueue actions forever; exceptions are only logged."""
        pending = []
        while True:
            try:
                start_time = time.time()

                # Ask for a new batch only when the previous one has been
                # handed over completely (a batch can be left over when an
                # exception interrupted the hand-over).
                if len(pending) == 0:
                    pending = self.p_action.queue_items()

                # Size of the batch handled in this round.
                total_num = len(pending)

                self.logger.info(
                    "queue.name=【producer_%s】, current time produce %d "
                    "actions" % (self.name, total_num))

                # Hand actions over, last one first, but only while the
                # queue's unfinished count is within the consumer limit.
                # NOTE(review): this spins without sleeping while the queue
                # is above the limit.
                while len(pending) != 0:
                    if self.queue.unfinished_tasks <= self.c_max_num:
                        self.queue.put(pending.pop())

                end_time = time.time()

                run_time = end_time - start_time

                # Actions produced per minute.
                if run_time == 0:
                    rate = total_num
                else:
                    rate = round(float(total_num * 60) / run_time, 2)

                self.logger.info(
                    "queue.name=【producer_%s】, total_num=%d,"
                    " produce %d actions/min, sleep_time=%d" %
                    (self.name, total_num, rate, self.p_sleep_time))

                # Pause until the next round.
                time.sleep(self.p_sleep_time)

            except Exception as message:
                self.logger.exception(message)