示例#1
0
def put_inner_to_queue():
    """Import pending download entries from Redis into the hainiu_queue table.

    Nothing is imported when hainiu_queue already holds 5 or more
    unprocessed type-2 rows (``is_work=0`` and ``fail_times=0``).  Otherwise,
    for every address in the module-level ``ips`` the keys gathered by
    ``scan_limit_to_queue_table`` (pattern ``down:*``) are read from Redis,
    one type-2 queue row is inserted per stored value, and the keys are
    removed from Redis only after the insert has been committed.

    Returns:
        None.  Any exception is printed with ``traceback`` and the open
        transaction is rolled back; nothing is re-raised.  The database
        connection is always closed before returning.
    """
    redis_util = RedisUtill()
    # Number of hainiu_queue rows that have not been processed yet.
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Insert one row into hainiu_queue.
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """

    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        db_util.execute_no_commit("set NAMES utf8mb4;")

        queue_count = db_util.read_one(select_queue_count_sql, [2])[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None

        inner_count = 0
        for ip in ips:
            key_list = []
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)
            if not key_list:
                # Nothing collected for this node: avoid issuing empty batch
                # commands to Redis and MySQL.
                continue

            inner_count += len(key_list)
            # Fetch the values stored under the collected keys.
            values = redis_util.get_values_batch_keys(key_list)

            insert_queue_record = []
            for value in values:
                if value is None:
                    # presumably the key vanished between scan and fetch;
                    # there is nothing to import for it.
                    continue
                a_url = json.loads(value)['a_url']
                insert_queue_record.append((2, a_url, value))

            if insert_queue_record:
                db_util.executemany_no_commit(insert_queue_sql,
                                              insert_queue_record)
            db_util.commit()
            # Delete the keys only after the rows are safely committed.
            redis_util.delete_batch(key_list)

        run_time = time.time() - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (inner_count, run_time))

    except Exception:
        # print_exc() takes (limit, file); the active exception is implicit.
        traceback.print_exc()
        db_util.rollback()
    finally:
        db_util.close()
示例#2
0
def push_queue_items():
    """Push pending rows of hainiu_web_seed_internally into hainiu_queue.

    The function returns immediately when hainiu_queue still contains
    type-3 rows with ``fail_times=0``.  Otherwise it reads the status-0 rows
    of hainiu_web_seed_internally in pages of 1000, inserts them (in
    shuffled order) as type-3 queue items and marks the source rows with
    ``status=1``.

    Returns:
        None.  Errors are logged through the base logger, the transaction is
        rolled back and the connection is closed; nothing is re-raised.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    # Pre-bind so the except/finally clauses never hit an unbound name when
    # the failure happens before the connection or the first query exists.
    d = None
    sql = None
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        # Wall-clock time: the work is database-bound, so CPU time would
        # under-report the duration.
        starttime = time.time()
        total = int(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total // page_size
        for _ in range(0, page + 1):
            # The offset stays 0: rows handled by the previous pass were
            # switched to status=1 and no longer match the filter.
            sql = selec_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0] if row[0] is not None else ''
                param = row[1] if row[1] is not None else ''
                values.append((url, param))
                id_values.append(str(row[2]))
            if not id_values:
                continue
            random.shuffle(values)
            d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
            sql = update_news_seed_internally_sql % ','.join(id_values)
            # presumably execute() commits the pending inserts together with
            # this update - confirm against DBUtil.
            d.execute(sql)
        endtime = time.time()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total,worksec))
    except Exception:
        rl.exception()
        if sql is not None:
            rl.error(sql)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
示例#3
0
    def action(self):
        """Crawl ``self.url`` and store every distinct link found on the page.

        The page is fetched through ``RequestUtil.http_get_phandomjs`` and
        parsed with BeautifulSoup.  Each ``<a>`` element with a non-empty
        formatted href and non-empty text yields one row; rows whose link
        host contains the page's domain go to hainiu_web_seed_internally,
        all others to hainiu_web_seed_externally.

        Returns:
            Whatever the parent class's ``result`` returns for
            ``(is_success, [main_md5, url, last a_href, internal row count,
            external row count, queue_id])``.
        """
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = self.get_format_url(a,host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                # Keep only the first occurrence of each link.
                if a_href in a_set:
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() or ''
                a_md5 = u.get_md5(a_href)

                # a_title is guaranteed non-empty here, so the JSON
                # parameter always carries the title.
                out_json_srt = json.dumps({'title': a_title}, ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                # Values are bound as query parameters below, so they must
                # be passed raw: escaping them first would store
                # backslash-escaped text in the table.
                insert_values = (main_md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,
                                 self.url,
                                 a_href,
                                 a_title,
                                 out_json_srt)

                if domain in a_host:
                    in_values.append(insert_values)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            # NOTE(review): "update_time=update_time" leaves an existing row
            # untouched (insert-or-ignore); confirm that refreshing the
            # timestamp is not what was intended.
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=update_time;
            """
            # Pre-bind so the handlers below never hit an unbound name when
            # the connection or the first statement fails.
            d = None
            sql = None
            try:
                d = DBUtil(config._HAINIU_DB)
                d.execute_no_commit("set NAMES utf8mb4;")
                if in_values:
                    sql = insert_sql.replace('<table>',in_table)
                    d.executemany_no_commit(sql,in_values)
                if ex_values:
                    sql = insert_sql.replace('<table>',ex_table)
                    d.executemany_no_commit(sql,ex_values)
                d.commit()
            except Exception:
                is_success = False
                self.rl.exception()
                if sql is not None:
                    self.rl.error(sql)
                if d is not None:
                    d.rollback()
            finally:
                if d is not None:
                    d.close()

        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; name the class explicitly once the
        # enclosing class is in view.
        return super(self.__class__, self).result(is_success, [main_md5,self.url,a_href,len(in_values),len(ex_values),self.queue_id])
示例#4
0
    def action(self):
        """Crawl ``self.act`` and store the links of the page's <a> tags.

        Rows for links whose URL contains the seed's domain are written to
        hainiu_web_seed_internally, all other rows to
        hainiu_web_seed_externally.

        NOTE(review): ``is_success`` is assigned but not returned in the
        visible part of this method, and neither ``r`` nor ``db_util`` is
        closed here - the method may continue beyond this excerpt.
        """
        # Crawl a qualifying URL taken from hainiu_queue and collect the URLs
        # of all <a> tags on the requested page.
        r = RequestUtil()
        hu = HtmlUtil()
        u = Util()
        # Outcome flag; flipped to False on an empty page or on any error.
        is_success = True
        db_util = DBUtil(_HAINIU_DB)
        time_util = TimeUtil()
        # Row lists for the internal-link and external-link tables.
        inner_list = []
        exter_list = []
        # MD5 of the seed URL.
        md5 = u.get_md5(self.act)
        try:
            # Request the URL through phandomjs; per the original author's
            # note the returned page includes content loaded by its ajax
            # requests.
            html = r.http_get_phandomjs(self.act)
            # BeautifulSoup: third-party library for extracting data from
            # HTML or XML documents.
            soup = BeautifulSoup(html, 'lxml')
            # List of <a> DOM objects.
            a_docs = soup.find_all("a")
            # A page without any <a> tag is treated as a failure.
            if len(a_docs) == 0:
                is_success = False
            # Links already seen on this page (used for de-duplication).
            aset = set()
            # Domain of the seed URL.
            domain = hu.get_url_domain(self.act)
            # Host of the seed URL.
            host = hu.get_url_host(self.act)

            # Time fields (create_time, create_day, create_hour, update_time)
            # create_time=time_util.get_timestamp()
            #
            # create_day = int(time_util.now_day().replace('-', ''))
            # create_hour=int(time_util.now_hour())
            # update_time=create_time
            create_time = time_util.get_timestamp()
            # Current day formatted with '%Y%m%d', as an integer.
            create_day = int(time_util.now_day(format='%Y%m%d'))
            # Current hour, as an integer.
            create_hour = int(time_util.now_hour())
            update_time = create_time

            # params_json = json.dumps(self.params, ensure_ascii=False, encoding='utf-8')

            for a_doc in a_docs:
                # Formatted href of the <a> tag.
                a_href = hu.get_format_url(self.act, a_doc, host)
                # Text content of the <a> tag.
                a_title = a_doc.get_text().strip()
                # Skip links without a usable href or without text.
                if a_href == '' or a_title == '':
                    continue
                # Skip links that already appeared on this page.
                if aset.__contains__(a_href):
                    continue
                aset.add(a_href)
                # Host of the link.
                a_host = hu.get_url_host(a_href)

                # MD5 of the link URL.
                a_md5 = u.get_md5(a_href)

                # XPath associated with the <a> tag.
                a_xpath = hu.get_dom_parent_xpath_js_new(a_doc)
                # One table row; the order matches the column list of the
                # insert statements below.
                row_data = (self.act, md5, self.params, domain, host, a_href,
                            a_md5, a_host, a_xpath, a_title, create_time,
                            create_day, create_hour, update_time)
                # NOTE(review): the domain is searched in the whole URL, not
                # in a_host, so an external link that merely mentions the
                # domain (e.g. in its query string) is classified as
                # internal - confirm this is intended.
                if a_href.__contains__(domain):
                    inner_list.append(row_data)
                else:
                    exter_list.append(row_data)
            # Store the rows in the internal or external link table; when the
            # URL already exists only an update is performed (so that linked
            # pages are not crawled repeatedly).
            if len(inner_list) > 0:
                inner_insert_sql = """
              insert into hainiu_web_seed_internally
              (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
              create_day,create_hour,update_time)
              values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
              ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
                db_util.executemany_no_commit(inner_insert_sql, inner_list)
            if len(exter_list) > 0:
                exter_insert_sql = """
              insert into hainiu_web_seed_externally
              (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
              create_day,create_hour,update_time)
              values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
              ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
                db_util.executemany_no_commit(exter_insert_sql, exter_list)
            # Both inserts are committed together.
            db_util.commit()
        except Exception, e:
            is_success = False
            db_util.rollback()
            # NOTE(review): print_exc()'s first parameter is `limit`, so the
            # exception object is being passed as the traceback depth limit.
            traceback.print_exc(e)