示例#1
0
 def queue_items(self):
     """Claim pending ``hainiu_queue`` rows and wrap them as consumers.

     Selects up to ``self.limit`` rows with ``type=1`` and
     ``fail_times <= self.fail_times`` (``SELECT ... FOR UPDATE``), builds one
     ``NewsFindConsumer`` per row and then sets ``type=0`` on those rows so
     the same query no longer matches them.

     :return: list of ``NewsFindConsumer`` objects; empty when no row
              matches. Errors are logged via ``self.rl`` and rolled back
              instead of being raised, and the consumers built before the
              error are still returned.
     """
     select_queue_sql = """
         select id,action,params from hainiu_queue where type=1 and fail_times <= %s limit 0,%s for UPDATE;
     """
     update_queue_sql = """
         update hainiu_queue set type=0 where id in (%s);
     """
     consumers = []
     # Bound before the try so the handlers below never hit an unbound name
     # when DBUtil() itself fails.
     d = None
     try:
         d = DBUtil(config._HAINIU_DB)
         rows = d.read_tuple(select_queue_sql % (self.fail_times, self.limit))
         if len(rows) == 0:
             return consumers
         queue_ids = []
         for row in rows:
             queue_id = row[0]
             url = row[1]
             # A NULL params column is handed to the consumer as ''.
             param = '' if row[2] is None else row[2]
             queue_ids.append(str(queue_id))
             consumers.append(NewsFindConsumer(url, param, queue_id))
         d.execute(update_queue_sql % ','.join(queue_ids))
     except Exception:
         self.rl.exception()
         if d is not None:
             d.rollback()
     finally:
         if d is not None:
             d.close()
     return consumers
    def queue_items(self):
        """Fetch pending queue messages, wrap them as consumer actions and
        mark the fetched rows as taken.

        Rows of ``hainiu_queue`` with ``type=1`` and
        ``fail_times <= self.fail_times`` are selected (at most ``self.limit``,
        ``FOR UPDATE``), one ``HainiuConsumer`` is built per row, and the rows
        are then updated to ``type=0``.

        :return: list of ``HainiuConsumer`` actions; empty when nothing is
                 pending. Errors are logged via ``self.rl`` and rolled back
                 instead of being raised.
        """
        # A disabled variant of the select additionally filtered on
        # ``fail_ip <> <local ip>`` so a machine would not re-fetch messages it
        # had already failed on.
        select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times<=%s
        limit 0,%s for update;
        """

        update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
        """
        return_list = []
        # Bound up front so the except/finally clauses cannot raise NameError
        # when the failure happens before these names are assigned.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = select_queue_sql % (self.fail_times, self.limit)
            records = d.read_dict(sql)
            if len(records) == 0:
                return return_list

            query_ids = []
            for record in records:
                queue_id = record["id"]
                query_ids.append(str(queue_id))
                return_list.append(
                    HainiuConsumer(queue_id, record["action"], record["params"]))

            sql = update_queue_sql % ",".join(query_ids)
            d.execute(sql)
        except Exception:
            self.rl.exception()
            if sql is not None:
                # Log the statement that was being executed when it failed.
                self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()
        return return_list
示例#3
0
def push_queue_items():
    """Move pending ``hainiu_web_seed_internally`` rows into ``hainiu_queue``.

    Does nothing (apart from logging) while ``hainiu_queue`` still holds
    ``type=3`` rows with ``fail_times=0``. Otherwise the ``status=0`` rows of
    ``hainiu_web_seed_internally`` are read in pages of 1000, inserted into
    ``hainiu_queue`` as ``type=3`` items in shuffled order, and marked
    ``status=1``.

    Errors are logged and rolled back rather than raised; returns ``None``.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    # Bound up front so the except/finally clauses never see unbound names.
    d = None
    sql = None
    try:
        # A single connection serves the whole run; opening a second one
        # would leave the first unclosed.
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        # Wall-clock time: this work is dominated by database round trips.
        starttime = time.time()
        total = int(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        for _ in range(total // page_size + 1):
            # The offset stays 0 on purpose: each pass marks the rows it read
            # as status=1, so the next pass sees the following rows first.
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                param = row[1] if row[1] is not None else ''
                values.append((url, param))
                id_values.append(str(row[2]))
            if id_values:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                sql = update_news_seed_internally_sql % ','.join(id_values)
                d.execute(sql)
        endtime = time.time()
        worksec = int(round(endtime - starttime))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
    def success_action(self, values):
        """Delete a successfully processed message from ``hainiu_queue``.

        Removing the row marks the message as finally done.

        :param values: result of the processed action; ``values[0]`` is the
                       queue row id to delete.
        """
        delete_sql = """
           delete from hainiu_queue where id=%s
        """
        # Bound up front so the except/finally clauses never see unbound names.
        d = None
        sql = None
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = delete_sql % values[0]
            d.execute(sql)
        except Exception:
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()
示例#5
0
    def fail_action(self, values):
        """Record a failed attempt on the queue row and on its seed rows.

        Increments ``fail_times`` and stores this machine's IP in
        ``web_queue``, ``web_seed`` and ``web_seed_externally``. On the last
        local retry (``self.current_retry_num == C_RETRY_TIMES - 1``) the queue
        row's ``is_work`` is also reset to 0.

        :param values: ``values[0]`` is the ``web_queue`` id; ``values[1]`` is
                       the md5 matched against ``web_seed.md5`` and
                       ``web_seed_externally.a_md5``.
        """
        # Every failure updates the failing ip and the failure counter.
        update_sql1 = '''
        update web_queue set fail_ip = %s , fail_times = fail_times + 1 where id = %s;
        '''
        # Once this machine has used up its retries, reset is_work to 0 so the
        # row can be retried.
        update_sql2 = '''
        update web_queue set is_work = 0 where id = %s;
        '''
        # Failure bookkeeping on the seed table.
        update_seed_sql = '''
        update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
        '''
        # Failure bookkeeping on the externally table.
        update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
        '''

        db_util = DBUtil(_ZZ_DB)

        try:
            queue_id = values[0]
            ip = Util().get_local_ip()

            # All statements run uncommitted and are committed together, so
            # the rollback below can undo every one of them.
            # NOTE(review): presumably DBUtil.execute() commits on its own,
            # which would have split this transaction - confirm.
            db_util.execute_no_commit(update_sql1, [ip, queue_id])

            seed_params = [ip, values[1]]
            db_util.execute_no_commit(update_seed_sql, seed_params)
            db_util.execute_no_commit(update_exter_sql, seed_params)

            if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
                db_util.execute_no_commit(update_sql2, [queue_id])

            db_util.commit()

        except Exception:
            db_util.rollback()
            # print_exc() reports the exception being handled; it takes no
            # exception argument (its first parameter is a traceback limit).
            traceback.print_exc()
        finally:
            db_util.close()
示例#6
0
def create_seed():
    """Insert the autohome.com.cn seed URL into ``hainiu_web_seed``.

    The row stores the URL, its md5, its domain, its host, the category and
    ``status=0``. Errors are logged and rolled back rather than raised.
    """
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers cannot hit an unbound name when
    # DBUtil() itself fails.
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        d.execute(sql % (url, md5, domain, host, category))
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
示例#7
0
文件: new_seed.py 项目: zz-big/python
def create_seed():
    """Insert the news.sina.com.cn seed URL into ``web_seed``.

    The row stores the URL, its md5, its domain, its host, the category and
    ``status=0``. Errors are logged and rolled back rather than raised.
    """
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    # Bound before the try so the handlers cannot hit an unbound name when
    # DBUtil() itself fails.
    d = None
    try:
        d = DBUtil(config._ZZ_DB)
        d.execute(sql % (url, md5, domain, host, category))
    except Exception:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
示例#8
0
    def success_action(self, values):
        """Remove a finished item from ``web_queue`` and stamp its seed row.

        :param values: ``values[0]`` is the ``web_queue`` id to delete,
                       ``values[1]`` the ``web_seed.md5`` to update, and
                       ``values[2]``, ``values[3]``, ``values[4]`` the new
                       ``last_crawl_time``, ``last_crawl_internally`` and
                       ``last_crawl_externally`` values.
        """
        # Delete the matching queue row.
        del_sql = '''
        delete from web_queue where id =%s;
        '''
        # Update the crawl bookkeeping on the seed table.
        update_sql = '''
        update web_seed set last_crawl_time=%s,last_crawl_internally=%s,last_crawl_externally=%s where md5 =%s;
        '''
        db_util = DBUtil(_ZZ_DB)

        try:
            db_util.execute(del_sql, [values[0]])
            # Example parameter list seen for the seed update:
            # [(1574519076,), 95, 7, '824e29a21f2a02379f78b0675d1fc5eb']
            db_util.execute(update_sql,
                            [values[2], values[3], values[4], values[1]])
        except Exception:
            db_util.rollback()
            # print_exc() reports the exception being handled; it takes no
            # exception argument (its first parameter is a traceback limit).
            traceback.print_exc()
        finally:
            db_util.close()
示例#9
0
def put_seed():
    """Copy unprocessed ``web_seed_externally`` rows into ``web_seed``.

    Skipped (with a printed message) while ``web_seed`` still contains rows
    with ``status=0`` and ``fail_times < MAX_FAIL_TIMES``. Otherwise the
    ``status=0`` rows of ``web_seed_externally`` are read page by page
    (``LIMIT_NUM`` rows per page), inserted into ``web_seed``, and finally
    marked ``status=1``.

    Errors during the copy are printed and rolled back; returns ``None``.
    """
    # Counts the web_seed rows matching a status and a failure limit.
    count_queue_sql = '''
    select count(*) from web_seed where status=%s and fail_times < %s;
    '''

    # Counts the web_seed_externally rows that still have to be copied.
    count_exter_sql = '''
    select count(*) from web_seed_externally where status=0;
    '''

    # One page of web_seed_externally rows.
    select_exter_limit_sql = '''
    select id,a_url,a_md5,a_host,param from web_seed_externally where status=0 limit %s,%s;
    '''

    # Inserts one web_seed row.
    insert_seed_sql = '''
    insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''

    # Marks web_seed_externally rows as processed.
    update_sql = '''
    update web_seed_externally set status=1 where id in(%s);
     '''
    db_util = DBUtil(_ZZ_DB)
    try:
        pending = db_util.read_one(
            count_queue_sql, [0, _QUEUE_ZZ['MAX_FAIL_TIMES']])[0]
        if pending != 0:
            print("queue has %d records,not insert!" % pending)
            return None

        total = db_util.read_one(count_exter_sql)[0]
        limit = _QUEUE_ZZ["LIMIT_NUM"]
        # Number of pages, rounded up.
        page_num = (total + limit - 1) // limit

        try:
            ids = []
            for page in range(page_num):
                rows = db_util.read_dict(
                    select_exter_limit_sql, [page * limit, limit])

                seeds = []
                for row in rows:
                    ids.append(str(row["id"]))
                    url = row["a_url"]
                    domain = get_tld(url)
                    host = row["a_host"]
                    md5 = row["a_md5"]
                    category = row["param"]
                    seeds.append((url, md5, domain, host, category))
                if seeds:
                    # Batch insert of this page.
                    db_util.executemany(insert_seed_sql, seeds)

            # With no ids the statement would read "id in()", which is not
            # valid SQL, so the update only runs when something was copied.
            if ids:
                db_util.execute(update_sql % ",".join(ids))

        except Exception:
            db_util.rollback()
            # print_exc() reports the exception being handled; it takes no
            # exception argument (its first parameter is a traceback limit).
            traceback.print_exc()
    finally:
        db_util.close()
示例#10
0
def put_queue_inner():
    """Enqueue unprocessed ``web_seed_internally`` rows into ``web_queue``.

    Skipped (with a printed message) while ``web_queue`` still contains rows
    with ``is_work=0`` and ``fail_times < MAX_FAIL_TIMES``. Otherwise the
    ``status=0`` rows of ``web_seed_internally`` are read page by page
    (``LIMIT_NUM`` rows per page), inserted into ``web_queue`` with
    ``type=2``, and finally marked ``status=1``. Row count and run time are
    printed on success.

    Errors are printed and rolled back; returns ``None``.
    """
    # Counts the web_queue rows matching an is_work flag and a failure limit.
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # Counts the web_seed_internally rows that still have to be enqueued.
    count_inner_sql = '''
    select count(*) from web_seed_internally where status=0;
    '''

    # One page of web_seed_internally rows.
    select_inner_limit_sql = '''
    select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''

    # Inserts one web_queue row.
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # Marks web_seed_internally rows as processed.
    update_sql = '''
    update web_seed_internally set status=1 where id in(%s);
        '''
    db_util = DBUtil(_ZZ_DB)
    try:
        pending = db_util.read_one(
            count_queue_sql, [0, _QUEUE_ZZ['MAX_FAIL_TIMES']])[0]
        if pending != 0:
            print("queue has %d records,not insert!" % pending)
            return None

        start_time = time.time()

        total = db_util.read_one(count_inner_sql)[0]
        limit = _QUEUE_ZZ["LIMIT_NUM"]
        # Number of pages, rounded up.
        page_num = (total + limit - 1) // limit

        ids = []
        queue_type = 2
        for page in range(page_num):
            rows = db_util.read_dict(
                select_inner_limit_sql, [page * limit, limit])

            items = []
            for row in rows:
                ids.append(str(row["id"]))
                items.append((queue_type, row["a_url"], row["param"]))
            if items:
                # Batch insert of this page.
                db_util.executemany(insert_queue_sql, items)

        # With no ids the statement would read "id in()", which is not valid
        # SQL, so the update only runs when something was enqueued.
        if ids:
            db_util.execute(update_sql % ",".join(ids))
        db_util.commit()

        run_time = time.time() - start_time
        print("total_num:%d, run_time:%.2f" % (total, run_time))

    except Exception:
        db_util.rollback()
        # print_exc() reports the exception being handled; it takes no
        # exception argument (its first parameter is a traceback limit).
        traceback.print_exc()
    finally:
        db_util.close()
示例#11
0
    def action(self, *values):
        """Fetch ``self.act`` and store every distinct ``<a>`` link it contains.

        The page is loaded through ``RequestUtil.http_get_phandomjs`` and
        parsed with BeautifulSoup. Each link with a non-empty URL and title is
        written once: to ``web_seed_internally`` when its URL starts with "/"
        or contains the page's domain, otherwise to ``web_seed_externally``.
        Re-inserting an existing row only refreshes ``update_time``.

        :param values: accepted but not used by this method.

        Errors are printed and rolled back; returns ``None``.
        """
        # Insert statement for internal links.
        insert_seed_internally = '''
        insert into web_seed_internally
        (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
        '''
        # Insert statement for external links.
        insert_seed_externally = '''
        insert into web_seed_externally
        (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
        '''

        a_time = TimeUtil()
        db_util = DBUtil(_ZZ_DB)
        r = None
        # NOTE(review): these counters are maintained but never read or
        # returned in the code visible here.
        total_count = 0
        in_count = 0
        ex_count = 0
        try:
            # Facts about the page itself.
            hu = HtmlUtil()
            domain = hu.get_url_domain(self.act)
            host = hu.get_url_host(self.act)
            u = Util()
            md5 = u.get_md5(self.act)

            # Load the page through phantomjs and parse it.
            r = RequestUtil()
            html = r.http_get_phandomjs(self.act)
            soup = BeautifulSoup(html, 'lxml')

            seen_urls = set()
            # Compiled once for all links; the domain is escaped so its dots
            # only match literal dots.
            internal_link = re.compile("^(/|.*" + re.escape(domain) + ")")

            for a in soup.find_all("a"):
                total_count += 1
                # Always pass the page's own host here; the link's host is
                # kept in a separate variable so it cannot leak into the next
                # iteration's call.
                a_url = hu.get_format_url(self.act, a, host)
                a_title = a.get_text().strip()
                if a_url == '' or a_title == '':
                    continue

                # Store each distinct URL only once per page.
                if a_url in seen_urls:
                    continue
                seen_urls.add(a_url)

                a_host = hu.get_url_host(a_url)
                a_md5 = u.get_md5(a_url)
                a_xpath = hu.get_dom_parent_xpath_js_new(a)

                create_time = a_time.get_timestamp()
                create_day = int(a_time.now_day(format='%Y%m%d'))
                create_hour = int(a_time.now_hour())

                params_sql = [self.act, md5, self.params, domain, host,
                              a_url, a_md5, a_host, a_xpath, a_title,
                              create_time, create_day, create_hour,
                              create_time, 0]
                if internal_link.match(a_url) is not None:
                    # (A Redis-based cache of internal links was prototyped
                    # at this point and is currently disabled.)
                    db_util.execute(insert_seed_internally, params_sql)
                    in_count += 1
                else:
                    db_util.execute(insert_seed_externally, params_sql)
                    ex_count += 1
        except Exception:
            db_util.rollback()
            # print_exc() reports the exception being handled; it takes no
            # exception argument (its first parameter is a traceback limit).
            traceback.print_exc()
        finally:
            # Release the browser and the connection on every path, not only
            # after a fully successful crawl.
            try:
                if r is not None:
                    r.close_phandomjs()
            finally:
                db_util.close()
    def action(self):
        """Download ``self.url``, save the page and record it in
        ``hainiu_web_page``.

        The page is fetched through ``RequestUtil.http_get_phandomjs``, its
        title, domain and host are extracted, and the HTML is handed to
        ``self.save_file``. A row is then upserted into ``hainiu_web_page``
        with ``status=1`` on success, or with ``status=2`` plus this machine's
        IP on failure (an existing row gets ``fail_times`` incremented).

        :return: ``result(is_success, [md5, self.url, update_time,
                 self.queue_id])`` from the parent class; ``is_success`` is
                 ``False`` when either the download/save step or the database
                 write failed.
        """
        is_success = True
        t = TimeUtil()
        f = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        md5 = u.get_md5(self.url)
        update_time = int(time.mktime(datetime.now().timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())

        # Round the current minute down to a multiple of 5 and append it,
        # zero-padded to two digits, to the yyyymmddHH stamp.
        minute_bucket = int(t.now_min())
        minute_bucket -= minute_bucket % 5
        now_minute = t.now_time(format='%Y%m%d%H') + '%02d' % minute_bucket

        escaped_url = MySQLdb.escape_string(self.url)
        escaped_param = MySQLdb.escape_string(self.param)

        # Collected as named fields rather than list positions so that a
        # failure at any point below still yields exactly one value per SQL
        # placeholder; whatever was not reached stays ''.
        domain = ''
        host = ''
        title = ''
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            # Only a <title> with exactly one child node is used as the title.
            if title_doc is not None and len(title_doc.contents) == 1:
                raw_title = title_doc.contents[0]
            else:
                raw_title = ''

            host = hu.get_url_host(self.url)
            title = MySQLdb.escape_string(raw_title)

            html = html.replace(content._SEQ1,
                                '').replace(content._SEQ2, content._SEQ4)
            # (Pushing the page to Kafka is disabled; the page is only saved
            # to a file.)
            self.save_file(create_time, f, now_minute, u, self.url, html)
        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        values = [escaped_url, md5, create_time, create_day, create_hour,
                  domain, escaped_param, update_time, host, title]
        # Bound up front so the except/finally clauses never see unbound names.
        d = None
        sql = None
        try:
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY  UPDATE update_time=values(update_time);
                """
            else:
                values.append(u.get_local_ip())
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """

            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except Exception:
            is_success = False
            self.rl.exception()
            if sql is not None:
                self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()

        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; the enclosing class name is not visible
        # here, so the call is left as written.
        return super(self.__class__,
                     self).result(is_success,
                                  [md5, self.url, update_time, self.queue_id])