Example #1
def queue_items(self):
    select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times <= %s limit 0,%s for UPDATE;
    """
    update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
    """
    consumer_list = []
    d = None
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = select_queue_sql % (self.fail_times, self.limit)
        rows = d.read_tuple(sql)
        if len(rows) == 0:
            return consumer_list
        queue_ids = []
        for t in rows:
            queue_id = t[0]
            url = t[1]
            param = '' if t[2] is None else t[2]
            queue_ids.append(str(queue_id))
            consumer_list.append(NewsFindConsumer(url, param, queue_id))
        # mark the claimed rows as taken (type=0) and release the row locks
        d.execute(update_queue_sql % (','.join(queue_ids)))
    except:
        self.rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
    return consumer_list
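All of these examples lean on a project-local DBUtil helper whose source is not shown. Below is a hypothetical minimal stand-in, inferred only from the calls the examples make, assuming a MySQLdb-style driver and a config dict with HOST/USER/PASSWD/DB/CHARSET/PORT keys (as in the commented-out _HAINIU_DB in Example #6); the real hainiu class may differ.

# Hypothetical stand-in for the project's DBUtil, inferred solely from
# the calls made in these examples; the real hainiu class is not shown.
import MySQLdb

class DBUtil(object):
    def __init__(self, db_config):
        # db_config is assumed to carry HOST/USER/PASSWD/DB/CHARSET/PORT keys
        self.conn = MySQLdb.connect(
            host=db_config['HOST'], user=db_config['USER'],
            passwd=db_config['PASSWD'], db=db_config['DB'],
            charset=db_config['CHARSET'], port=db_config['PORT'])
        self.cursor = self.conn.cursor()

    def read_tuple(self, sql):
        # run a SELECT and return all rows
        self.cursor.execute(sql)
        return self.cursor.fetchall()

    def read_one(self, sql):
        # run a SELECT and return the first row
        self.cursor.execute(sql)
        return self.cursor.fetchone()

    def execute(self, sql):
        # run a statement and commit the current transaction
        self.cursor.execute(sql)
        self.conn.commit()

    def executemany(self, sql, values):
        self.cursor.executemany(sql, values)
        self.conn.commit()

    def executemany_no_commit(self, sql, values):
        # batch statement left inside the open transaction; a later
        # execute() (or explicit commit) makes it durable
        self.cursor.executemany(sql, values)

    def rollback(self):
        self.conn.rollback()

    def close(self):
        self.cursor.close()
        self.conn.close()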
Example #2
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    d = None
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finished, %s items left' % queue_total)
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            # every batch flips its rows to status=1 below, so the next
            # read of status=0 rows always starts at offset 0
            sql = select_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for l in rows:
                url = l[0] if l[0] is not None else ''
                param = l[1] if l[1] is not None else ''
                values.append((url, param))
                id_values.append(str(l[2]))
            if len(id_values) != 0:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % ids
                # this commit also makes the batched inserts durable
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('push seed_internally queue finished, total items %s, action time %ss' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
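Note the fixed offset 0 in the paged select above: every batch is flipped to status=1 before the next read, so the remaining status=0 rows always start at the front. A condensed sketch of that drain pattern, reusing the table and statements from the example (DBUtil and config as in the examples above):

# Sketch of the batch-drain pattern above: read a page of unprocessed
# rows at offset 0, enqueue them, then mark them processed so the next
# read sees only the remaining status=0 rows.
page_size = 1000
d = DBUtil(config._HAINIU_DB)
while True:
    rows = d.read_tuple(
        "select a_url,param,id from hainiu_web_seed_internally "
        "where status=0 limit 0,%s;" % page_size)
    if not rows:
        break
    values = [(r[0] or '', r[1] or '') for r in rows]
    ids = ','.join(str(r[2]) for r in rows)
    d.executemany_no_commit(
        "insert into hainiu_queue (type,action,params) values(3,%s,%s);",
        values)
    # committing this update also commits the queued inserts
    d.execute("update hainiu_web_seed_internally set status=1 "
              "where id in (%s);" % ids)
d.close()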
Example #3
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    d = None
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finished, %s items left' % queue_total)
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for l in rows:
                url = l[0]
                # publisher is the leading label of the registered domain,
                # e.g. 'sina.com.cn' -> 'sina'
                publisher = get_tld(url)
                publisher = publisher[:publisher.index('.')] if '.' in publisher else publisher
                param = json.dumps({'category': l[1], 'publisher': publisher},
                                   ensure_ascii=False)
                values.append((url, param))

            if len(values) != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('push news_find queue finished, total items %s, action time %ss' %
                (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
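The publisher field above is derived with get_tld, presumably from the tld package; older tld releases return the registered domain (e.g. 'sina.com.cn'), from which the code keeps the leading label. A hedged sketch of that derivation in isolation:

# Hypothetical illustration of the (url, params) construction above,
# assuming an older tld release where get_tld() returns the registered
# domain (e.g. 'sina.com.cn').
import json
from tld import get_tld

def build_queue_value(url, category):
    publisher = get_tld(url)
    # keep only the leading label: 'sina.com.cn' -> 'sina'
    if '.' in publisher:
        publisher = publisher[:publisher.index('.')]
    param = json.dumps({'category': category, 'publisher': publisher},
                       ensure_ascii=False)
    return (url, param)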
Example #4
def push_queue_items():
    insert_sql = """
    insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
    select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
    select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    d = None
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = insert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page

        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                queue_id = record[0]
                print queue_id

    except:
        rl.exception()
        rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
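Every example also obtains its logger from LogUtil().get_base_logger() and calls rl.exception() with no message, which the stdlib logger does not allow. Below is a hypothetical minimal stand-in over the logging module; the logger name and format are assumptions:

# Hypothetical minimal LogUtil over the stdlib logging module; only the
# calls made in these examples (info/error/exception) are sketched.
import logging

class _BaseLogger(object):
    # thin wrapper so exception() may be called without a message,
    # matching how the examples use it
    def __init__(self, logger):
        self._logger = logger

    def info(self, msg):
        self._logger.info(msg)

    def error(self, msg):
        self._logger.error(msg)

    def exception(self, msg='unhandled exception'):
        self._logger.exception(msg)

class LogUtil(object):
    def get_base_logger(self):
        logger = logging.getLogger('hainiu')
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                '%(asctime)s %(levelname)s %(message)s'))
            logger.addHandler(handler)
            logger.setLevel(logging.INFO)
        return _BaseLogger(logger)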
Example #5
def redis2Hdfs():

    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    d = None
    try:
        d = DBUtil(_ZZ_DB)

        start = 0
        is_finish = True
        host_set = set()

        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        #local_xpath_file_path = '/user/zengqingyong17/spark/xpath_cache_file' + time_str
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str

        starttime = time.clock()
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        while is_finish:
            values = set()
            rs = r.scan(start, "total_z:*", 10)
            # new cursor returned by SCAN
            start = rs[0]
            if start == 0:
                is_finish = False
            # print rs
            for i in rs[1]:
                host = i.split(":")[1]
                total_key = i
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)

                # top two xpaths by descending score
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"

                txpath_num_1 = None
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    if len(txpath) == 2:
                        # score of the runner-up xpath in txpath_key
                        score_1 = r.zscore(txpath_key, txpath[1])
                        txpath_num_1 = int(score_1) if score_1 is not None else None
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                # all members of fxpath_key
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    # print 'fxpath:%s' % fxpath
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]
                    if rule_type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")

        # upload the xpath config file to the HDFS config directory
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s, action time %ss' % (len(host_set), worksec))
    except:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
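The while loop above drives redis SCAN by hand, carrying the cursor in start. redis-py also provides scan_iter, which hides the cursor bookkeeping; an equivalent sketch of the key walk (pattern and key layout as in the example):

# Equivalent key walk with redis-py's scan_iter, which manages the SCAN
# cursor internally; count is only a hint to the server.
import redis

r = redis.Redis('nn1.hadoop', 6379, db=6)
for key in r.scan_iter(match='total_z:*', count=10):
    host = key.split(':')[1]
    total = r.get(key)
    txpath = r.zrevrange('txpath_z:%s' % host, 0, 1)
    fxpath = r.smembers('fxpath_z:%s' % host)
    # ...apply the same score thresholds and rule merging as above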
Example #6
def xpath_config_file():
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    d = None
    try:
        # _HAINIU_DB = {'HOST': '192.168.137.190', 'USER': '******', 'PASSWD': '12345678', 'DB': 'hainiucrawler',
        #             'CHARSET': 'utf8', 'PORT': 3306}
        d = DBUtil(config._HAINIU_DB)
        # d = DBUtil(_HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        # r = redis.Redis('redis.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")

        time_str = t.now_time(format='%Y%m%d%H%M%S')
        # local_xpath_file_path = '/Users/leohe/Data/input/xpath_cache_file/xpath_file' + time_str
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str

        start_cursor = 0
        is_finish = True
        starttime = time.clock()
        host_set = set()

        while is_finish:
            values = set()
            limit = r.scan(start_cursor, 'total:*', 10)
            if limit[0] == 0:
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)

                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                txpath_num_1 = None
                if txpath:
                    # print 'txpath:%s' % txpath
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    if len(txpath) == 2:
                        # score of the runner-up xpath in txpath_key
                        score_1 = r.zscore(txpath_key, txpath[1])
                        txpath_num_1 = int(score_1) if score_1 is not None else None

                    # print 'txpath_max_num:%s' % txpath_num
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format %
                                       (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format %
                                       (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    # print 'fxpath:%s' % fxpath
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]
                    if rule_type == 0:
                        values.add(row_format %
                                   (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format %
                                   (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # upload the xpath config file to the HDFS config directory
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s, action time %ss' %
                (len(host_set), worksec))
    except:
        rl.exception()
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
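The Client used for the upload in this example is presumably the WebHDFS client from the hdfs PyPI package. A minimal usage sketch; the user and overwrite arguments, and the concrete local file name, are assumptions:

# Hypothetical upload step, assuming the `hdfs` PyPI package's WebHDFS
# client; the example constructs Client("http://nn1.hadoop:50070") directly.
from hdfs import InsecureClient

client = InsecureClient('http://nn1.hadoop:50070', user='qingniu')
# example local path; the code above appends a timestamp to the file name
local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file20190101000000'
# copy the local xpath cache file into the HDFS config directory
client.upload('/user/qingniu/xpath_cache_file/', local_xpath_file_path,
              overwrite=True)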