Exemplos de LogUtil.exception em Python, exemplos de commons.util.log_util.LogUtil.exception em Python

Exemplo n.º 1

0

Exibir arquivo

def push_queue_items():
    """Push pending internal seed links into ``hainiu_queue`` as type-3 items.

    Nothing is pushed while type-3 items with ``fail_times=0`` are still
    queued; that case is only logged.  Otherwise rows of
    ``hainiu_web_seed_internally`` with ``status=0`` are read in batches of
    1000, inserted into the queue in shuffled order, and the rows of the
    batch are marked ``status=1``.

    Any error is logged (together with the last SQL statement, if one was
    built) and ``rollback`` is called.  The connection is always closed.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers below never hit a NameError when
    # the failure happens before the connection or the first statement exists.
    d = None
    sql = None
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        # Reuse the connection opened above; opening a second one here would
        # leave the first one unclosed.
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total // page_size
        for _ in range(0, page + 1):
            # Always read from offset 0: the rows of the previous batch were
            # just flipped to status=1 and no longer match the filter.
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                param = row[1] if row[1] is not None else ''
                values.append((url, param))
                id_values.append(str(row[2]))
            if id_values:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                sql = update_news_seed_internally_sql % (','.join(id_values))
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total,worksec))
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: news_find_queue.py Projeto: fuxiaofengfu/demo_crawler

def push_queue_items():
    """Push every active news seed into ``hainiu_queue`` as a type-1 item.

    Nothing is pushed while type-1 items with ``fail_times=0`` are still
    queued; that case is only logged.  Otherwise ``hainiu_web_seed`` rows
    with ``status=0`` are read page by page (1000 rows per page).  Each row
    becomes a queue item whose ``params`` is a JSON object holding the seed's
    category and a publisher name derived from the URL's domain.

    Any error is logged (together with the last SQL statement, if one was
    built) and ``rollback`` is called.  The connection is always closed.
    """
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers below never hit a NameError when
    # the failure happens before the connection or the first statement exists.
    d = None
    sql = None
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total // page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                publisher = get_tld(url)
                # Keep only the part of the domain before its first dot.
                if '.' in publisher:
                    publisher = publisher[0:publisher.index('.')]
                param = json.dumps(
                    {'category': row[1], 'publisher': publisher},
                    ensure_ascii=False)
                values.append((url, param))

            if values:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()

Exemplo n.º 3

0

Exibir arquivo

 def send_sms(self, content, phone=config._ALERT_PHONE):
     """Send an alert SMS carrying ``content`` to ``phone``.

     ``content`` is decoded as UTF-8 and re-encoded as GBK before being
     URL-encoded into the request.  Returns True only when the gateway
     answers exactly ``0-OK``; any other answer, or any exception, is
     logged and yields False.
     """
     logger = LogUtil().get_base_logger()
     try:
         base_url = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % (
             phone)
         query = urllib.urlencode(
             {'content': content.decode('utf-8').encode('gbk')})
         reply = urllib2.urlopen(base_url + query).read()
         if reply == '0-OK':
             return True
         logger.error("短信发送失败,短信服务器返回状态为:%s,手机号:%s,内容:%s" %
                      (reply, phone, content))
         return False
     except:
         logger.exception()
         return False

Exemplo n.º 4

0

Exibir arquivo

Arquivo: hainiu_queue.py Projeto: fuxiaofengfu/demo_crawler

def push_queue_items():
    inert_sql = """
    insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
    select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
    select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = inert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page

        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id

    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()

Exemplo n.º 5

0

Exibir arquivo

class NewsFindProducer(ProducerAction):
    """Producer action that turns type-1 ``hainiu_queue`` rows into consumers."""

    def __init__(self, limit, fail_times):
        """Store the fetch parameters and create the logger.

        :param limit:       maximum number of queue rows fetched per call
        :param fail_times:  only rows with ``fail_times`` <= this are fetched
        """
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)


    def queue_items(self):
        """Fetch queue rows, wrap them as consumers and mark them taken.

        Selects up to ``self.limit`` type-1 rows (``for UPDATE``), builds one
        ``NewsFindConsumer`` per row and sets ``type=0`` on the selected rows.

        :return: list of ``NewsFindConsumer`` objects; on error, whatever
                 had been built before the failure
        """
        select_queue_sql = """
            select id,action,params from hainiu_queue where type=1 and fail_times <= %s limit 0,%s for UPDATE;
        """
        update_queue_sql = """
            update hainiu_queue set type=0 where id in (%s);
        """
        actions = []
        # Bound before the try so the handlers below never hit a NameError
        # when DBUtil(...) itself fails.
        d = None
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = select_queue_sql % (self.fail_times,self.limit)
            rows = d.read_tuple(sql)
            if len(rows) == 0:
                return actions
            queue_ids = []
            for row in rows:
                queue_id = row[0]
                url = row[1]
                param = '' if row[2] is None else row[2]
                queue_ids.append(str(queue_id))
                actions.append(NewsFindConsumer(url, param, queue_id))
            d.execute(update_queue_sql % (','.join(queue_ids)))
        except Exception:
            self.rl.exception()
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()
        return actions

Exemplo n.º 6

0

Exibir arquivo

Arquivo: new_seed.py Projeto: zz-big/python

def create_seed():
    """Insert the hard-coded Sina news seed URL into ``web_seed``.

    The row carries the URL, its MD5, domain, host and category, with
    ``status=0``.  Any error is logged and ``rollback`` is called; the
    connection is always closed.
    """
    # NOTE(review): the statement is built with string formatting.  That is
    # only acceptable here because every value is a constant defined below.
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers below never hit a NameError when
    # DBUtil(...) itself fails.
    d = None
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()

Exemplo n.º 7

0

Exibir arquivo

def create_seed():
    """Insert the hard-coded Autohome seed URL into ``hainiu_web_seed``.

    The row carries the URL, its MD5, domain, host and category, with
    ``status=0``.  Any error is logged and ``rollback`` is called; the
    connection is always closed.
    """
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    # NOTE(review): the statement is built with string formatting.  That is
    # only acceptable here because every value is a constant defined here.
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers below never hit a NameError when
    # DBUtil(...) itself fails.
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()

Exemplo n.º 8

0

Exibir arquivo

Arquivo: kafka_util.py Projeto: zz-big/python

class KafkaUtil:
    """Pushes messages to a Kafka topic through a cached, shared producer."""

    # Producers shared by all instances, keyed by "<host list>_<topic>".
    __kafka_connect_cache = {}

    # Guards the producer cache and serialises produce() calls.
    __lock = threading.Lock()

    def __init__(self,kafka_conf):
        """Read the broker list and topic from ``kafka_conf``.

        :param kafka_conf: mapping with a comma-separated ``'HOST'`` string
                           and a ``'TOPIC'`` name
        """
        host_list = kafka_conf['HOST'].split(',')
        # The broker order is shuffled per instance.  The shuffled order is
        # part of the cache key, so instances that ended up with different
        # orders do not share a producer.
        random.shuffle(host_list)
        host_str = ','.join(host_list)
        self.cache_key = '_'.join((host_str,kafka_conf['TOPIC']))
        self.host = host_str
        self.topic = kafka_conf['TOPIC']
        self.rl = LogUtil().get_logger('consumer', 'consumer_kafka')


    def push_message(self,message):
        """Send ``message`` to the topic, creating the producer on first use.

        :return: True when ``produce`` succeeded.  False when it raised; the
                 cached producer is then dropped so the next call builds a
                 fresh one.  Errors raised while connecting still propagate.
        """
        # ``with`` releases the lock on every exit path.  A failure while
        # connecting must not leave it held, or every later call would block
        # forever.
        with self.__lock:
            u = Util()
            producer = u.get_dict_value(self.__kafka_connect_cache,self.cache_key)
            if producer is None:
                client = KafkaClient(hosts=self.host)
                topic = client.topics[self.topic]
                producer = topic.get_producer()
                self.__kafka_connect_cache[self.cache_key] = producer

            try:
                producer.produce(message)
            except Exception:
                self.__kafka_connect_cache.pop(self.cache_key, None)
                self.rl.error('kafka push error cacheKey is %s' % (self.cache_key))
                self.rl.exception()
                return False
            return True

Exemplo n.º 9

0

Exibir arquivo

Arquivo: hainiu_action.py Projeto: fuxiaofengfu/demo_crawler

class HainiuProducer(ProducerAction):
    """Producer action that turns type-1 ``hainiu_queue`` rows into consumers."""

    def __init__(self, limit, fail_times):
        '''
        Initialise the queue producer.

        :param limit:           number of records taken from the queue per call
        :param fail_times:      only records with fail_times <= this are taken
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(HainiuProducer, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        '''
        Take the messages to process from the queue, wrap each one in a
        consumer action, then update the queue state (type=0).

        :return:            list of wrapped consumer actions; on error,
                            whatever had been built before the failure
        '''

        # Alternative query that stops this machine from fetching records it
        # already failed on, filtered by the machine's IP:
        # select_queue_sql = """
        # select id,action,params from hainiu_queue where type=1 and fail_ip <>'%s' and fail_times<=%s
        # limit 0,%s for update;
        # """

        select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times<=%s
        limit 0,%s for update;
        """

        update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
        """
        return_list = []
        # Bound before the try so the handlers below never hit a NameError
        # when DBUtil(...) itself fails.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            # u = Util()
            # ip = u.get_local_ip()
            # sql = select_queue_sql % (self.fail_times,ip,self.limit)
            sql = select_queue_sql % (self.fail_times, self.limit)
            select_dict = d.read_dict(sql)
            if len(select_dict) == 0:
                return return_list

            query_ids = []
            for record in select_dict:
                record_id = record["id"]
                query_ids.append(str(record_id))
                return_list.append(
                    HainiuConsumer(record_id, record["action"], record["params"]))

            sql = update_queue_sql % ",".join(query_ids)
            d.execute(sql)
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()
        return return_list

Exemplo n.º 10

0

Exibir arquivo

Arquivo: consumer.py Projeto: MrZhaii/PythonCrawler

class Consumer(threading.Thread):
    '''
    Consumer thread: repeatedly takes a ConsumerAction from the queue and
    runs it.
    '''
    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''
        Initialise the thread.

        :param queue:          Queue object the actions are taken from
        :param thread_name:    thread name, used in the log output
        :param max_sleep_time: upper bound of the random pause after each
                               consumption
        :param max_retry_num:  maximum number of attempts for a
                               ConsumerAction that keeps failing
        '''

        # 1) Call the parent __init__ explicitly.  The class is named because
        #    super(self.__class__, self) recurses forever once subclassed.
        super(Consumer, self).__init__()

        # 2) Store the parameters.
        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num

        # 3) Create the logger.
        self.logger = LogUtil().get_logger(thread_name, thread_name)

    def run(self):
        '''
        Thread body: take an action, run it, log the outcome, re-queue it
        when it failed and attempts remain, then sleep a random time.
        Exceptions are logged and the loop continues.
        '''
        self.logger.info('%s thread running ...' % self.thread_name)
        while True:
            try:
                # Random pause for this round.
                random_sleep_time = round(
                    random.uniform(0.5, self.max_sleep_time), 2)

                # 1) Take an action from the queue.  task_done() is called
                #    right away, so unfinished_tasks only counts items that
                #    are still waiting in the queue.
                c_action = self.queue.get()
                self.queue.task_done()

                # Validate the action.
                if not isinstance(c_action, ConsumerAction):
                    raise Exception("%s is not ConsumerAction instance!" %
                                    c_action)
                c_action.consumer_thread_name = self.thread_name

                start_time = time.time()

                # 2) Run the action; the first element of the result is the
                #    success flag.
                result_list = c_action.action()

                end_time = time.time()
                run_time = end_time - start_time

                is_success = result_list[0]

                self.logger.info(
                    'thread.name=【%s】, run_time=%.2f s, sleep_time=%.2f s, retry_times=%d, '
                    'result=%s, detail=%s' %
                    (self.thread_name, run_time, random_sleep_time,
                     c_action.current_retry_num + 1,
                     'SUCCESS' if is_success else 'FAIL',
                     result_list[1:] if len(result_list) > 1 else "null"))

                # 3) Retry on failure while attempts remain.  The action has
                #    already run once before this check, hence the bound of
                #    max_retry_num - 1.
                if not is_success and c_action.current_retry_num < self.max_retry_num - 1:
                    c_action.current_retry_num += 1
                    # Put the action back on the queue.
                    self.queue.put(c_action)

                # 4) Random pause.
                time.sleep(random_sleep_time)

            except Exception as e:
                self.logger.exception(e)

Exemplo n.º 11

0

Exibir arquivo

def redis2Hdfs():
    """Dump the xpath statistics held in Redis, plus DB rules, to a local file.

    Scans the Redis keys matching ``total_z:*``.  For every host found it
    collects tab-separated rows ``host, xpath, flag, source``:

    * source ``0``: the top "true" xpath when it covers at least 80% of the
      host's total; otherwise each of the top two xpaths that was seen at
      least 100 times;
    * source ``1``: every member of the host's ``fxpath_z`` set;
    * source ``2`` / ``3``: rules from ``stream_extract_xpath_rule`` with
      type 0 / 1.

    The rows of each scan batch are appended to a timestamped local file.
    Any error is logged and ``rollback`` is called; the DB connection is
    always closed.
    """

    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers below never hit a NameError when
    # DBUtil(...) itself fails.
    d = None
    try:
        d = DBUtil(_ZZ_DB)

        start = 0
        has_more = True
        host_set = set()

        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        #local_xpath_file_path = '/user/zengqingyong17/spark/xpath_cache_file' + time_str
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str

        starttime = time.clock()
        r = redis.Redis('nn1.hadoop', '6379', db=6)
        row_format = "%s\t%s\t%s\t%s"
        while has_more:
            values = set()
            rs = r.scan(start, "total_z:*", 10)
            # New cursor; 0 means the scan is complete.
            start = rs[0]
            if start == 0:
                has_more = False
            for total_key in rs[1]:
                host = total_key.split(":")[1]
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)

                # Top two xpaths by score, highest first.
                txpath = r.zrevrange(txpath_key, 0, 1)

                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    # Reset for every host.  Otherwise a host with a single
                    # xpath would reuse the previous host's runner-up count,
                    # or raise NameError on the very first host.
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        score_1 = r.zscore(txpath_key, txpath[1])
                        # Test for None before int(), not after it.
                        txpath_num_1 = int(score_1) if score_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                # Every member of the host's fxpath set.
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]
                    if rule_type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")

        # Upload to the XPATH config directory on HDFS (currently disabled):
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' % (host_set.__len__(), worksec))
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()

Exemplo n.º 12

0

Exibir arquivo

class NewsFindConsumer(ConsumerAction):
    """Consumer action that crawls one seed page and stores the links found."""

    def __init__(self, url, param ,queue_id):
        """Store the seed URL (without a trailing '/'), its params and queue id."""
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        """Fetch the seed page, extract its links and store them.

        Every distinct, normalised ``<a>`` link with a non-empty title is
        stored in ``hainiu_web_seed_internally`` when its host contains the
        seed's domain, otherwise in ``hainiu_web_seed_externally``.

        :return: the value of ``ConsumerAction.result`` for the success flag
                 and ``[main_md5, url, last a_href, internal count,
                 external count, queue_id]``
        """
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = self.get_format_url(a,host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                # Skip links already seen on this page.
                if a_href in a_set:
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                # a_title is known to be non-empty here (see the guard above).
                a_param['title'] = a_title
                out_json_srt = json.dumps(a_param,ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)

                if domain in a_host:
                    in_values.append(insert_values)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=update_time;
            """
            # Bound before the try so the handlers below never hit a
            # NameError when the failure happens before they are assigned.
            d = None
            sql = None
            try:
                d = DBUtil(config._HAINIU_DB)
                d.execute_no_commit("set NAMES utf8mb4;")
                if len(in_values) != 0:
                    sql = insert_sql.replace('<table>',in_table)
                    d.executemany_no_commit(sql,in_values)
                if len(ex_values) != 0:
                    sql = insert_sql.replace('<table>',ex_table)
                    d.executemany_no_commit(sql,ex_values)
                d.commit()
            except Exception:
                is_success = False
                self.rl.exception()
                if sql is not None:
                    self.rl.error(sql)
                if d is not None:
                    d.rollback()
            finally:
                if d is not None:
                    d.close()

        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        return super(NewsFindConsumer, self).result(is_success, [main_md5,self.url,a_href,len(in_values),len(ex_values),self.queue_id])


    def success_action(self, values):
        """Delete the finished queue row and record the crawl on the seed.

        :param values: the list built by ``action``; index 0 is the seed md5,
                       3 / 4 the internal / external link counts, 5 the queue id
        """
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
        # Bound before the try so the handlers below never hit a NameError.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[5]
            sql = delete_sql % queue_id
            d.execute_no_commit(sql)
            sql = update_hainiu_news_seed_sql % (values[3],values[4],values[0])
            d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()


    def fail_action(self, values):
        """Record a failed attempt on the queue row and on the seed.

        Increments ``fail_times`` and stores this machine's IP on both rows.
        When ``self.try_num`` equals ``Consumer._WORK_TRY_NUM`` the queue row
        is set back to ``type=1``.

        :param values: the list built by ``action``; index 0 is the seed md5,
                       5 the queue id
        """
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s;
        """
        update_hainiu_news_seed_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
        """
        # Bound before the try so the handlers below never hit a NameError.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[5]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, queue_id)
            d.execute_no_commit(sql)
            main_md5 = values[0]
            sql = update_hainiu_news_seed_sql % (ip, main_md5)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer._WORK_TRY_NUM):
                sql = update_sql_1 % (queue_id)
                d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()


    def get_format_url(self, a_doc, host):
        """Normalise the ``href`` of an ``<a>`` element into an absolute URL.

        Drops the fragment, resolves protocol-relative, root-relative and
        relative links against the seed URL / ``host``, strips a trailing
        '/' and sorts the query parameters.

        :return: the normalised URL, or '' when the link is missing, cannot
                 be processed, or does not start with ``http``
        """
        a_href = a_doc.get('href')
        try:
            if a_href is not None and len(a_href) > 0:
                a_href = str(a_href).strip()
                a_href = a_href[:a_href.index('#')] if '#' in a_href else a_href
                # a_href = a_href.encode('utf8')
                # a_href = urllib.quote(a_href,safe='.:/?&=')
                if a_href.startswith('//'):
                    url = 'https:' + a_href if self.url.startswith('https:') else 'http:' + a_href
                    url = mx.URL.URL(str(url))
                    a_href = url.url
                elif a_href.startswith('/'):
                    url = 'https://' + host + a_href if self.url.startswith('https:') else 'http://' + host + a_href
                    url = mx.URL.URL(str(url))
                    a_href = url.url
                elif a_href.startswith('./') or a_href.startswith('../'):
                    url = mx.URL.URL(str(self.url) + '/' + a_href)
                    a_href = url.url
                elif not a_href.startswith('javascript') and not a_href.startswith('mailto') and not a_href.startswith('http') and a_href != '':
                    url = 'https://' + host + '/' + a_href if self.url.startswith('https:') else 'http://' + host + '/' + a_href
                    url = mx.URL.URL(str(url))
                    a_href = url.url
                a_href = a_href[:-1] if a_href.endswith('/') else a_href
                # a_href = a_href.lower()
            # Called only for validation: any exception it raises rejects the link.
            get_tld(a_href)
        except Exception:
            return ''

        if not a_href.startswith('http'):
            return ''

        if '?' in a_href:
            # Sort the query parameters so equal links compare equal.
            a_params_str = a_href[a_href.index('?') + 1:]
            a_params = a_params_str.split('&')
            a_params.sort()
            a_params_str = '&'.join(a_params)
            a_href = a_href[:a_href.index('?') + 1] + a_params_str

        return a_href

Exemplo n.º 13

0

Exibir arquivo

class Producer(threading.Thread):
    '''
    Producer thread: repeatedly asks a ProducerAction for consumer actions
    and feeds them into the queue.
    '''
    def __init__(self, queue, q_name, p_action, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_max_retry_num):
        '''

        :param queue:             queue object
        :param q_name:            queue name
        :param p_action:          producer action object
        :param p_sleep_time:      sleep time after each production round
        :param c_max_num:         maximum number of consumer threads
        :param c_max_sleep_time:  maximum sleep time after each consumer run
        :param c_max_retry_num:   maximum number of retries after a failure
        :raises Exception:        when p_action is not a ProducerAction
        '''

        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Producer, self).__init__()
        self.queue = queue
        self.q_name = q_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_retry_num = c_max_retry_num

        # p_action must be an instance of a ProducerAction subclass.
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance" %
                            self.p_action)

        # Create the logger.
        self.logger = LogUtil().get_logger('producer_%s' % self.q_name,
                                           'producer_%s' % self.q_name)

    def run(self):
        '''
        Thread body: produce actions, feed them into the queue, log the
        throughput and sleep.  Exceptions are printed, logged and the loop
        continues.
        '''

        actions = []

        while True:
            try:
                # Start time of this round.
                start_time = time.time()

                # Produce a new batch only when the previous one is used up.
                if len(actions) == 0:
                    actions = self.p_action.queue_items()

                # Number of actions in this round.
                total_num = len(actions)

                self.logger.info(
                    'queue.name = [producer_%s], current time produce %d actions'
                    % (self.q_name, total_num))

                # Put the actions into the queue one by one, but only while
                # fewer than c_max_num tasks are unfinished.
                # NOTE(review): this busy-waits while the queue is full.
                while True:
                    if len(actions) == 0:
                        break
                    if self.queue.unfinished_tasks < self.c_max_num:
                        c_action = actions.pop()
                        self.queue.put(c_action)

                # End time of this round.
                end_time = time.time()
                # Seconds from producing until everything was queued.
                run_time = end_time - start_time
                # An empty round can finish within the clock resolution.  A
                # division by zero here would skip the sleep below and make
                # the loop spin.
                if run_time > 0:
                    rate = int(float(total_num) * 60 / run_time)
                else:
                    rate = 0

                self.logger.info(
                    "queue.name=[producer_%s], total_num=%d, "
                    "producer %d actions/min, sleep_time=%d" %
                    (self.q_name, total_num, rate, self.p_sleep_time))

                # Sleep before the next round.
                time.sleep(self.p_sleep_time)

            except Exception as err:
                # print_exc() takes a traceback depth limit as its first
                # argument, not the exception.
                traceback.print_exc()
                self.logger.exception(err)

Exemplo n.º 14

0

Exibir arquivo

Arquivo: queue_consumer.py Projeto: fuxiaofengfu/demo_crawler

class Consumer(threading.Thread):
    '''
    Consumer thread; its main job is to run the consumer actions it takes
    from the queue.
    '''

    # Number of retries allowed after a consumer action fails; readable from
    # outside the class.
    _WORK_TRY_NUM = 0

    def __init__(self, queue, name, sleep_time, work_try_num):
        '''
        Initialise the consumer thread.

        :param queue:           queue to consume from
        :param name:            thread name, also identifies the consumer
        :param sleep_time:      upper bound of the pause before each action
        :param work_try_num:    number of failures allowed per action
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(Consumer, self).__init__()
        self.queue = queue
        self.name = name
        self.sleep_time = sleep_time
        self.work_try_num = work_try_num
        Consumer._WORK_TRY_NUM = work_try_num
        self.rl = LogUtil().get_logger(
            'consumer', 'consumer' + self.name[:self.name.find("_")])

    def run(self):
        '''
        Thread body: take an action, pause a random time, run it, log the
        outcome and re-queue it when it failed and retries remain.
        Exceptions are logged and the loop continues.
        '''
        while True:
            try:
                # Take one action from the queue.
                action = self.queue.get()
                try:
                    if not isinstance(action, base_consumer_action.ConsumerAction):
                        raise Exception('Action not Consumer base')

                    # Random pause before running, at most self.sleep_time.
                    sleep_time = random.randint(0, self.sleep_time * 10) * 0.1
                    time.sleep(sleep_time)

                    action.consumer_thread_name = self.name
                    start_time = time.clock()
                    # Run the action.
                    result = action.action()
                    end_time = time.clock()

                    # Time spent running the action.
                    work_sec = int(round((end_time - start_time)))

                    # Log the outcome.
                    self.rl.info("queue name %s finish,sleep time %s\'s,action time %s \'s,"
                                 "action retry %s times,result:%s" % \
                                 (self.name,sleep_time,work_sec,action.try_num,
                                  str(result) if result is not None else ''))

                    # Re-queue the action when it failed and retries remain.
                    if not result[0] and action.try_num < self.work_try_num:
                        action.try_num += 1
                        self.queue.put(action)
                finally:
                    # Every successful get() must be matched by task_done(),
                    # also when the action raised.  Otherwise
                    # unfinished_tasks never drops back.
                    self.queue.task_done()
            except Exception:
                self.rl.exception()

Exemplo n.º 15

0

Exibir arquivo

def xpath_config_file():
    """Export XPath rules to a local file and upload that file to HDFS.

    Scans Redis (db 6) for ``total:<host>`` keys. For every host it writes
    tab-separated rows ``host, xpath, 'true'|'false', code`` built from:

    * the top two members of the sorted set ``txpath:<host>`` (flag 'true', code '0');
      only the top one is written when its score is at least 80% of the
      ``total:<host>`` value,
    * every member of the set ``fxpath:<host>`` (flag 'false', code '0'),
    * rows of ``stream_extract_xpath_rule`` for the host with status=0
      (type 0 -> 'true'/'2', type 1 -> 'false'/'3').

    Rows are appended to a timestamped local file after each SCAN batch; the
    file is uploaded once the scan completes. Any error is logged and
    swallowed.
    """
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    row_format = "%s\t%s\t%s\t%s"
    rl = LogUtil().get_base_logger()
    # Bound before the try so the except/finally clauses never hit an unbound name.
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")

        time_str = t.now_time(format='%Y%m%d%H%M%S')
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str

        cursor = 0
        # Wall-clock timing; time.clock() is CPU time on Unix.
        starttime = time.time()
        host_set = set()

        while True:
            values = set()
            scan_result = r.scan(cursor, 'total:*', 10)
            cursor = scan_result[0]
            for total_key in scan_result[1]:
                # Split only on the first ':' so a host containing ':' is kept whole.
                host = total_key.split(":", 1)[1]
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host

                # The counter may be missing or malformed; treat that as "no total".
                try:
                    total = float(r.get(total_key))
                except (TypeError, ValueError):
                    total = 0.0

                txpath = r.zrevrange(txpath_key, 0, 1)
                if txpath:
                    top_score = r.zscore(txpath_key, txpath[0])
                    txpath_num = int(top_score) if top_score is not None else 0
                    # Reset per host: there may be no second-ranked xpath.
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        second_score = r.zscore(txpath_key, txpath[1])
                        txpath_num_1 = int(second_score) if second_score is not None else 0

                    if total > 0 and txpath_num / total >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format %
                                       (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format %
                                       (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                    host_set.add(host)

                # NOTE(review): host comes from a Redis key and is interpolated
                # into the SQL text; switch to a parameterized query if
                # DBUtil.read_tuple supports one.
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]
                    if rule_type == 0:
                        values.add(row_format %
                                   (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format %
                                   (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")
            # A returned cursor of 0 means the SCAN iteration is complete.
            if cursor == 0:
                break

        # Upload to the XPath configuration directory on HDFS.
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' %
                (len(host_set), worksec))
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()

Exemplo n.º 16

0

Exibir arquivo

Arquivo: download_action.py Projeto: fuxiaofengfu/demo_crawler

class DownLoadConsumer(ConsumerAction):
    """Consumer action that downloads one URL, saves the HTML locally and records the result.

    ``action`` fetches the page and inserts/updates a row in
    ``hainiu_web_page``; ``success_action`` and ``fail_action`` update
    ``hainiu_queue`` and ``hainiu_web_seed_internally`` for the queue item.
    """

    def __init__(self, url, param, queue_id, pro_flag):
        """
        :param url:       page URL; one trailing '/' is stripped
        :param param:     extra parameter string stored with the page row
        :param queue_id:  id of the hainiu_queue row this action came from
        :param pro_flag:  flag embedded in the local data file names
        """
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        # NOTE(review): queue_name is not defined in this class; presumably a
        # module-level global -- confirm.
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        """Download the page, append it to the local data file and record it in MySQL.

        :return: ``self.result(is_success, [md5, url, update_time, queue_id])``
        """
        is_success = True
        t = TimeUtil()
        f = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())

        # Round the minute down to its 5-minute bucket and build a
        # yyyymmddHHMM stamp used in the data file name.
        now_minute = int(t.now_min())
        now_minute -= now_minute % 5
        now_minute = t.now_time(format='%Y%m%d%H') + '%02d' % now_minute

        escaped_url = MySQLdb.escape_string(self.url)
        escaped_param = MySQLdb.escape_string(self.param)

        # Column values filled in as the download progresses. They are kept as
        # named locals (not appended to a list) so the number of SQL arguments
        # is always right, wherever a failure happens.
        domain = ''
        host = ''
        title = ''
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            raw_title = title_doc.contents[0] if title_doc is not None and len(
                title_doc.contents) == 1 else ''

            host = hu.get_url_host(self.url)
            title = MySQLdb.escape_string(raw_title)

            html = html.replace(content._SEQ1,
                                '').replace(content._SEQ2, content._SEQ4)
            self.save_file(create_time, f, now_minute, u, self.url, html)
        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        d = None
        sql = None
        try:
            values = [escaped_url, md5, create_time, create_day, create_hour,
                      domain, escaped_param, update_time, host, title]
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY  UPDATE update_time=values(update_time);
                """
            else:
                ip = u.get_local_ip()
                values.append(ip)
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """

            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except Exception:
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

        # Name the class explicitly: super(self.__class__, ...) recurses
        # forever as soon as this class is subclassed.
        return super(DownLoadConsumer,
                     self).result(is_success,
                                  [md5, self.url, update_time, self.queue_id])

    def success_action(self, values):
        """Delete the queue row and stamp the seed row after a successful download.

        :param values: ``[md5, url, update_time, queue_id]`` as built by ``action``
        """
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set update_time=%s where md5="%s";
        """
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[3]
            sql = delete_sql % queue_id
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (values[2], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

    def fail_action(self, values):
        """Record a failed download on the queue row and the seed row.

        When ``self.try_num`` equals ``Consumer._WORK_TRY_NUM`` the queue row's
        type is additionally set to 3.

        :param values: ``[md5, url, update_time, queue_id]`` as built by ``action``
        """
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=3 where id=%s;
        """
        update_hainiu_news_internally_sql = """
            update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where md5="%s";
        """
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[3]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, queue_id)
            d.execute_no_commit(sql)
            sql = update_hainiu_news_internally_sql % (ip, values[2],
                                                       values[0])
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql_1 % (queue_id)
                d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            self.rl.error(sql)
            if d is not None:
                d.rollback()
                d.commit()
        finally:
            if d is not None:
                d.close()

    def save_file(self, create_time, fi, now_minute, u, url, html):
        """Append one page record to this consumer thread's temporary data file.

        When the file name for the current 5-minute bucket differs from the
        one cached for this thread, a non-empty temporary file is first moved
        to the 'done' directory and the new record starts without a leading
        separator.

        :param create_time: timestamp embedded in the 'done' file name
        :param fi:          file helper providing write_file_content_pattern
        :param now_minute:  time-bucket stamp used in the file name
        :param u:           Util helper (get_dict_value, get_md5)
        :param url:         page URL
        :param html:        page HTML
        """
        now_file_name = '%s_%s_%s' % (self.consumer_thread_name, self.pro_flag,
                                      now_minute)
        last_file_name = u.get_dict_value(html_file_path_cache,
                                          self.consumer_thread_name)
        html_file_path_cache[self.consumer_thread_name] = now_file_name
        tmp_path = config._LOCAL_DATA_DIR % (
            '%s/%s_%s' % ('tmp', self.consumer_thread_name, self.pro_flag))
        start_char = content._SEQ2
        if last_file_name is None or now_file_name != last_file_name:
            start_char = ''
            if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
                done_path = config._LOCAL_DATA_DIR % (
                    '%s/%s_%s' % ('done', now_file_name, create_time))
                shutil.move(tmp_path, done_path)

        html = html.replace(content._SEQ1, '').replace(content._SEQ2,
                                                       content._SEQ4)
        # Record layout: md5(url SEQ3 html) SEQ3 url SEQ3 html.
        record_str = content._SEQ3.join(('%s', '%s')) % (url, html)
        record_str = content._SEQ3.join(
            ('%s', '%s')) % (u.get_md5(record_str), record_str)
        html_record_format_str = start_char + record_str
        fi.write_file_content_pattern(tmp_path,
                                      html_record_format_str,
                                      pattern='a')

Exemplo n.º 17

0

Exibir arquivo

Arquivo: hainiu_action.py Projeto: fuxiaofengfu/demo_crawler

class HainiuConsumer(ConsumerAction):
    def __init__(self, id, ac, params):
        '''
        初始化队列的消费者

        :param id:          消息的ID，也就是数据库表里的ID
        :param ac:          消息的动作信息，也就是数据库表里的action字段
        :param params:      消息的动作的附加参数
        '''
        super(self.__class__, self).__init__()
        self.id = id
        self.ac = ac
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        '''
        处理拿到的消息

        :return:消费动作的处理结果，用于消费者线程的日志打印和传递处理成功和失败方法所需要的数据
        '''
        is_success = True
        try:
            print self.ac, self.params
            # 1/0
        except:
            is_success = False
            self.rl.exception()

        return super(self.__class__, self).result(is_success, [self.id])

    def success_action(self, values):
        '''
        消息动作处理成功之后，从队列中间件删除该消息，表示这个消息最终处理完成

        :param values:      消息动作处理之后的结果
        '''
        delete_sql = """
           delete from hainiu_queue where id=%s
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[0]
            sql = delete_sql % id
            d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()

    def fail_action(self, values):
        '''
        消息动作处理失败之后，更改队列中间件中该消息的失败次数并记录执行机器的IP
        如果达到该机器的最大尝试失败次数，则更改队列中间件中该消息的状态为未处理，目的让其它机器再次尝试去处理该消息

        :param values:      消息动作处理之后的结果
        '''
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer._WORK_TRY_NUM):
                sql = update_sql_1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()

Exemplo n.º 18

0

Exibir arquivo

Arquivo: consumer.py Projeto: zz-big/python

class Consumer(threading.Thread):
    '''
    Consumer thread: takes consumer-action objects off the queue and calls
    their ``action()`` method.
    '''
    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''

        :param queue:           queue object to consume from
        :param thread_name:     consumer thread name (also used as logger name/file)
        :param max_sleep_time:  upper bound of the random sleep after each consumption
        :param max_retry_num:   maximum number of attempts per action
        :return:
        '''
        # The parent initialiser must run for the thread to be startable.
        # Name the class explicitly: super(self.__class__, ...) recurses
        # forever as soon as this class is subclassed.
        super(Consumer, self).__init__()

        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num

        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)

    def run(self):
        '''
        Thread body: consume actions forever.

        Every successful ``queue.get()`` is paired with exactly one
        ``queue.task_done()``, even when the action raises. Errors are
        printed, logged, and the loop continues.
        :return:
        '''
        while True:
            try:
                # Random sleep time for this round (rounded to whole seconds).
                random_sleep_time = round(
                    random.uniform(0.2, self.max_sleep_time))

                # Blocks until an action is available.
                c_action = self.queue.get()
                try:
                    # Start timing after get() so run_time does not include
                    # the time spent waiting on an empty queue.
                    start_time = time.time()

                    if not isinstance(c_action, ConsumerAction):
                        raise Exception("%s is not ConsumerAction instance" %
                                        c_action)

                    result = c_action.action(self.thread_name)
                    end_time = time.time()

                    run_time = end_time - start_time

                    success_flag = result[0]
                    success_str = "SUCCESS" if result[0] else "FAIL"

                    self.logger.info(
                        "thread.name=[%s], run_time=%.2f s, sleep_time=%.2f s, retry_times=%d, "
                        "result=%s, detail=%s" %
                        (self.thread_name, run_time, random_sleep_time,
                         c_action.current_retry_num + 1, success_str, result[1:]))

                    # On failure, hand the action back to the queue for a retry.
                    if not success_flag and c_action.current_retry_num < self.max_retry_num - 1:
                        c_action.current_retry_num += 1
                        self.queue.put(c_action)
                finally:
                    # Mark the item taken by get() as done, whatever happened.
                    self.queue.task_done()

                time.sleep(random_sleep_time)

            except Exception as err:
                # print_exc() takes an optional frame *limit*, not the
                # exception; it prints the exception currently being handled.
                traceback.print_exc()
                self.logger.exception(err)

Exemplo n.º 19

0

Exibir arquivo

Arquivo: producer.py Projeto: MrZhaii/PythonCrawler

class Producer(threading.Thread):
    """Producer thread: repeatedly asks a ProducerAction for consumer actions and feeds them to the queue."""

    def __init__(self, queue, queue_name, p_action, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_max_retry_num):
        '''
        Initialise the producer.
        :param queue:             Queue object the actions are put into
        :param queue_name:        queue name; each business has its own queue,
                                  distinguished by this name
        :param p_action:          ProducerAction object of the concrete business
        :param p_sleep_time:      sleep interval between two production rounds
        :param c_max_num:         maximum number of consumer threads
        :param c_max_sleep_time:  sleep interval of a consumer thread between
                                  two consumptions
        :param c_max_retry_num:   maximum number of retries for a failed
                                  ConsumerAction instance
        :raises Exception:        if ``p_action`` is not a ProducerAction
        '''
        # Name the class explicitly: super(self.__class__, ...) recurses
        # forever as soon as this class is subclassed.
        super(Producer, self).__init__()

        self.queue = queue
        self.queue_name = queue_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_try_num = c_max_retry_num

        if not isinstance(p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance!" % p_action)

        self.thread_name = '%s_producer' % self.queue_name
        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)

    def run(self):
        '''
        Producer loop: fetch a batch of actions, dispatch them to the queue
        while it has capacity, log the throughput, sleep, repeat.
        '''
        self.logger.info('%s thread running ...' % self.thread_name)

        # Kept outside the try so a batch survives an error in one round.
        c_actions = []
        while True:
            try:
                start_time = time.time()

                # 1) Ask the producer action for a new batch when the previous
                #    one has been fully dispatched.
                if not c_actions:
                    c_actions = self.p_action.queue_items()

                total_num = len(c_actions)
                self.logger.info(
                    'thread.name=【%s】, current time produce %d actions' %
                    (self.thread_name, total_num))

                # 2) Dispatch while the queue's unfinished task count does not
                #    exceed the number of consumer threads.
                while c_actions:
                    if self.queue.unfinished_tasks <= self.c_max_num:
                        self.queue.put(c_actions.pop())
                    else:
                        # Queue is full: yield briefly instead of spinning at
                        # 100% CPU until a consumer finishes.
                        time.sleep(0.01)

                end_time = time.time()
                run_time = end_time - start_time

                # Actions per minute; the 0.01 s floor only guards against
                # division by zero.
                rate = int(total_num * 60 / max(run_time, 0.01))

                self.logger.info(
                    'thread.name=【%s】, total_num=%d, produce %d actions/min, sleep_time=%d'
                    % (self.thread_name, total_num, rate, self.p_sleep_time))

                # 3) Sleep until the next production round.
                time.sleep(self.p_sleep_time)

            except Exception as e:
                # print_exc() takes an optional frame *limit*, not the
                # exception; it prints the exception currently being handled.
                traceback.print_exc()
                self.logger.exception(e)

Exemplo n.º 20

0

Exibir arquivo

def put_seed_to_queue(page_show_num):
    '''
    Page through the seed table and bulk-insert the rows into hainiu_queue.

    Nothing is imported when hainiu_queue already holds at least 5
    unprocessed rows of type 1. Errors are logged and swallowed; the database
    connection is always closed.

    :param page_show_num: number of seed rows fetched per page
    '''
    # Count of unprocessed rows in hainiu_queue.
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """

    # Total number of eligible rows in the seed table.
    select_seed_count_sql = """
    select count(*) from hainiu_web_seed where status=0;
    """

    # Paged query against the seed table.
    select_seed_limit_sql = """
    select url, md5, domain, host, category from hainiu_web_seed
    where status=0 limit %s,%s;
     """

    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 1) Skip the import while the queue still has unprocessed rows.
        sql_params = [1]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录，不需要导入！" % queue_count)
            return None

        start_time = time.time()

        # 2) Count the eligible seed rows and derive the number of pages
        #    (ceiling division).
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]
        page_num = (seed_count + page_show_num - 1) // page_show_num

        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)

            insert_queue_values = []
            for row in res3:
                # The URL is the queue action; the remaining columns travel
                # as a JSON params string.
                params_dict = {
                    'md5': row['md5'],
                    'domain': row['domain'],
                    'host': row['host'],
                    'category': row['category'],
                }
                # encoding= is a Python 2-only json.dumps argument.
                params_json = json.dumps(params_dict,
                                         ensure_ascii=False,
                                         encoding='utf-8')

                insert_queue_values.append((1, row['url'], params_json))

            # Bulk-insert this page; skip the round trip for an empty page.
            if insert_queue_values:
                db_util.executemany(insert_queue_sql, insert_queue_values)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据， 用时 %.2f 秒" % (seed_count, run_time))

    except Exception as e:
        logger.exception(e)
    finally:
        # Release the connection on every path, including the early return.
        db_util.close()

Exemplo n.º 21

0

Exibir arquivo

Arquivo: log_demo.py Projeto: MrZhaii/PythonCrawler

#-*- encoding: utf-8 -*-
'''
log_demo.py
Created on 21-1-30 上午11:23
Copyright (c) 21-1-30, 海牛学院版权所有.
@author: 潘牛
'''
from commons.util.log_util import LogUtil

# Request a logger twice with the same name and file.
logger1 = LogUtil().get_logger("log_name", "log_file")

logger2 = LogUtil().get_logger("log_name", "log_file")

# Prints whether both names refer to the same logger object (the original
# author's note says they do).
print(logger1 is logger2)

logger1.info("测试 info 级别")
logger1.error("测试 error 级别")

# Demonstrates logging a caught exception.
try:
    1 / 0
except Exception as e:
    logger1.exception(e)

Exemplo n.º 22

0

Exibir arquivo

class Producer(threading.Thread):
    '''
    Producer thread: fetches batches of consumer actions from a producer
    action and dispatches them to the queue.
    '''

    def __init__(self,queue,action,name,max_num,sleep_time,work_sleep_time,work_try_num):
        '''
        Initialise the producer thread.

        :param queue:           queue to use
        :param action:          producer action
        :param name:            producer name
        :param max_num:         number of consumer threads to start
        :param sleep_time:      pause before the next production round
        :param work_sleep_time: sleep time of each consumer
        :param work_try_num:    number of failures allowed per consumer action
        :raises Exception:      if ``action`` is not a ProducerAction
        '''
        # Name the class explicitly: super(self.__class__, ...) recurses
        # forever as soon as this class is subclassed.
        super(Producer,self).__init__()
        self.queue = queue
        self.action = action
        self.name = name
        self.max_num = max_num
        self.sleep_time = sleep_time
        self.work_sleep_time = work_sleep_time
        self.work_try_num = work_try_num
        self.rl = LogUtil().get_logger('producer','producer' + self.name)
        if not isinstance(self.action,base_producer_action.ProducerAction):
            raise Exception('Action not Producer base')

    def run(self):
        '''
        Producer loop: fetch a batch, dispatch it while the queue has
        capacity, log the throughput, sleep, repeat. Errors are logged through
        ``self.rl.exception()`` and the loop continues.
        '''
        # Pending consumer actions; kept outside the try so a batch survives
        # an error in one round.
        action_list = []
        while True:
            try:
                # Wall-clock timing; time.clock() is CPU time on Unix.
                start_time = time.time()

                # Fetch a new batch once the previous one is fully dispatched.
                if not action_list:
                    action_list = self.action.queue_items()

                totle_times = len(action_list)
                self.rl.info('get queue %s total items is %s' %(self.name,totle_times))

                while action_list:
                    # Dispatch while the queue's unfinished task count does
                    # not exceed the number of consumer threads.
                    if self.queue.unfinished_tasks <= self.max_num:
                        self.queue.put(action_list.pop())
                    else:
                        # Queue is full: yield briefly instead of spinning at
                        # 100% CPU until a consumer finishes.
                        time.sleep(0.01)

                end_time = time.time()
                # Duration of this round in whole seconds and whole minutes.
                sec = int(round((end_time - start_time)))
                minutes = int(round(sec/float(60)))

                self.rl.info("put queue %s total items is %s,total time is %s\'s,(at %s items/min)" % \
                             (self.name,totle_times,sec,
                              int(totle_times) if minutes == 0 else round(float((totle_times/float(minutes))),2)))

                time.sleep(self.sleep_time)
            except Exception:
                self.rl.exception()

    def start_work(self):
        '''
        Start ``self.max_num`` consumer threads, wait five seconds, then start
        this producer thread.
        '''

        for i in range(0,self.max_num):
            qc = queue_consumer.Consumer(self.queue,self.name + '_' + str(i),self.work_sleep_time,self.work_try_num)
            qc.start()

        time.sleep(5)
        self.start()