Example #1
    def fail_action(self, values):
        '''
        After a message action fails, increment the message's failure count in the queue
        middleware and record the IP of the machine that ran it.
        If this machine has reached its maximum number of failed attempts, set the message's
        status in the queue middleware back to unprocessed so other machines can retry it.

        :param values:      the result of the message action
        '''
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer._WORK_TRY_NUM):
                sql = update_sql_1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
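
The SQL above is built by splicing values into the string with %. Later examples (Example #5 and Example #13) instead hand a parameter list to execute_no_commit and let the driver do the quoting; a minimal sketch of the same failure handling in that style, assuming (as those examples do) that DBUtil.execute_no_commit(sql, params) forwards the list to the underlying driver:

    def fail_action(self, values):
        # same bookkeeping as above, but with driver-side parameters instead of % formatting
        update_sql = "update hainiu_queue set fail_times=fail_times+1, fail_ip=%s where id=%s"
        reset_sql = "update hainiu_queue set type=1 where id=%s"
        d = DBUtil(config._HAINIU_DB)
        try:
            ip = Util().get_local_ip()
            d.execute_no_commit(update_sql, [ip, values[0]])
            if self.try_num == Consumer._WORK_TRY_NUM:
                d.execute_no_commit(reset_sql, [values[0]])
            d.commit()
        except:
            self.rl.exception()
            d.rollback()
        finally:
            d.close()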
Example #2
 def fail_action(self, values):
     update_sql = """
         update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql_1 = """
         update hainiu_queue set type=1 where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
     """
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[5]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         main_md5 = values[0]
         sql = update_hainiu_news_seed_sql % (ip, main_md5)
         d.execute_no_commit(sql)
         if (self.try_num == Consumer._WORK_TRY_NUM):
             sql = update_sql_1 % (id)
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
     finally:
         d.close()
Example #3
def date_merge():
    u = Util()
    fi = FileUtil()
    t = TimeUtil()
    s = SendSmsUtil()
    alter_time = t.now_time()
    beijing_now = datetime.now()
    now_time = int(time.mktime(beijing_now.timetuple()))
    tmp_path = config._LOCAL_DATA_DIR % ('%s/%s_%s.tmp' % ('tmp','hainiu', now_time))
    up_path = config._LOCAL_DATA_DIR % ('%s/%s_%s.done' % ('up','hainiu', now_time))
    start_char = ''
    for dirpath, dirnames, filenames in os.walk(config._LOCAL_DATA_DIR % ('done')):
        for filename in filenames:
            total = 0
            merge_total = 0
            dir = os.path.join(dirpath, filename)
            file_size = os.path.getsize(dir)
            record_list = []
            with open(dir) as f:
                for line in f:
                    try:
                        total += 1
                        line = line.strip().encode('utf-8')
                        if not line:
                            continue
                        md5 = line[:line.find('\001')]
                        record = line[line.find('\001') + 1:]
                        record_md5 = u.get_md5(record)
                        if md5 == record_md5:
                            merge_total += 1
                            record_list.append(record)
                        else:
                            raise Exception('md5 check failed')

                        if record_list.__len__() >=10:
                            fi.write_file_content_pattern(tmp_path,start_char + ('\n'.join(record_list)), pattern='a')
                            record_list = []
                            start_char = '\n'
                    except Exception:
                        traceback.print_exc()
                        print line
                        alter_msg = 'alter merge api hainiu time:%s ip:%s' % (alter_time, u.get_local_ip())
                        s.send_sms(alter_msg)

            if record_list.__len__() >0:
                fi.write_file_content_pattern(tmp_path,start_char + ('\n'.join(record_list)), pattern='a')
                start_char = '\n'

            os.remove(dir)
            print dir,file_size,total,merge_total

    if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
        shutil.move(tmp_path, up_path)
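
Each merged line is the record's md5, a '\001' separator, then the record itself; that is the layout Example #20 writes out and the check date_merge() re-runs here. A minimal, self-contained sketch with hashlib (assuming Util.get_md5 is a hex md5 digest like hashlib's):

import hashlib

def make_line(record):
    # md5-of-record + '\001' + record, the layout date_merge() expects
    return hashlib.md5(record.encode('utf-8')).hexdigest() + '\001' + record

def check_line(line):
    md5 = line[:line.find('\001')]
    record = line[line.find('\001') + 1:]
    return md5 == hashlib.md5(record.encode('utf-8')).hexdigest()

record = 'http://example.com' + '\001' + 'page-html-with-\002-instead-of-newlines'
line = make_line(record)
print(check_line(line))            # True
print(check_line('x' + line[1:]))  # False, checksum no longer matches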
Example #4
def crawler_web_seed_url(url):
    '''
    Crawl all <a> links on a seed page
    :param url:   seed page url
    :return: None
    '''

    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()

    # request the url through phandomjs and get the rendered page, including its ajax requests
    html = r.http_get_phandomjs(url)

    #html = html.decode('utf-8').encode(sys.getfilesystemdomainencoding())
    #print html
    # BeautifulSoup: a third-party Python library for extracting data from HTML or XML files
    soup = BeautifulSoup(html, 'lxml')
    # list of <a> dom objects
    a_docs = soup.find_all("a")
    aset = set()
    # get the domain
    domain = hu.get_url_domain(url)
    # get the host
    host = hu.get_url_host(url)
    print 'domain==>', domain
    print 'host==>', host
    for a in a_docs:
        # get the href of the <a> tag
        a_href = hu.get_format_url(url, a, host)
        # get the text of the <a> tag
        a_title = a.get_text().strip()
        if a_href == '' or a_title == '':
            continue

        if aset.__contains__(a_href):
            continue
        aset.add(a_href)

        # get the host of the <a> link
        a_host = hu.get_url_host(a_href)

        # get the md5 of the <a> href url
        a_md5 = u.get_md5(a_href)

        # get the xpath of the <a> tag
        a_xpath = hu.get_dom_parent_xpath_js_new(a)
        print("%s\t%s\t%s\t%s\t%s") % (a_title.decode("utf-8"), a_href, a_host,
                                       a_md5, a_xpath)

    r.close_phandomjs()
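
The same find_all("a") walk can be exercised without phantomjs or the project utilities; a minimal, self-contained sketch over an inline HTML string (only BeautifulSoup and lxml are assumed to be installed):

from bs4 import BeautifulSoup

page = '<html><body><a href="/a">first</a><a href="/b">second</a><a href="/a">dup</a></body></html>'
soup = BeautifulSoup(page, 'lxml')
seen = set()
for a in soup.find_all('a'):
    href = a.get('href', '').strip()
    title = a.get_text().strip()
    if href == '' or title == '' or href in seen:
        continue
    seen.add(href)
    print('%s\t%s' % (title, href))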
Example #5
 def fail_action(self, values):
     ip = Util().get_local_ip()
     db_util = DBUtil(_HAINIU_DB)
     # 1) record the failure count and failing ip in the queue table;
     queue_update_sql1 = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
     """
     # 2) when this machine's failure count reaches its configured maximum retry count,
     # set is_work = 0 on the matching hainiu_queue record so other machines can retry;
     queue_update_sql2 = """
     update hainiu_queue set is_work=0 where id=%s;
     """
     # 3) update the failure count and failing ip in the internal-link table; the queue rows are not deleted;
     inner_update_sql = """
     update hainiu_web_seed_internally set  fail_times=fail_times+1,fail_ip=%s where md5=%s and a_md5=%s
     """
     try:
         # 1)
         sql_params = [ip, values[0]]
         db_util.execute_no_commit(queue_update_sql1, sql_params)
         # 2)
         # compare the failure count
         if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
             sql_params = [self.id]
             db_util.execute_no_commit(queue_update_sql2, sql_params)
         sql_params = [ip, values[1], values[2]]
         db_util.execute_no_commit(inner_update_sql, sql_params)
         db_util.commit()
     except Exception, e:
         db_util.rollback()
         traceback.print_exc()
Example #6
 def fail_action(self, values):
     ip = Util().get_local_ip()
     db_util = DBUtil(_HAINIU_DB)
     # 1) record the failure count and failing ip in the hainiu_queue table;
     # is_success,self.id,len(inner_list),len(exter_list),md5
     queue_update_sql1 = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
     """
     # 2) when this machine's failure count reaches its configured maximum retry count,
     # set is_work = 0 on the matching hainiu_queue record so other machines can retry;
     queue_update_sql2 = """
     update hainiu_queue set is_work=0 where id=%s;
     """
     # 3) update the failure count and failing ip in the seed table; keep the queue rows, since
     # the target site may just have banned this ip; a later script can reset the failed rows'
     # status, failure count and failing ip and try crawling them again.
     seed_update_sql = """
     update hainiu_web_seed set  fail_times=fail_times+1,fail_ip=%s where md5=%s
     """
     try:
         sql_params = [ip, values[0]]
         db_util.execute_no_commit(queue_update_sql1, sql_params)
         # compare the failure count
         if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
             sql_params = [self.id]
             db_util.execute_no_commit(queue_update_sql2, sql_params)
         sql_params = [ip, values[3]]
         db_util.execute_no_commit(seed_update_sql, sql_params)
         db_util.commit()
     except Exception, e:
         traceback.print_exc()
         db_util.rollback()
Example #7
    def push_message(self, message):
        self.__lock.acquire()
        u = Util()
        producer = u.get_dict_value(self.__kafka_connect_cache, self.cache_key)
        if producer is None:
            client = KafkaClient(hosts=self.host)
            topic = client.topics[self.topic]
            producer = topic.get_producer()
            self.__kafka_connect_cache[self.cache_key] = producer

        is_success = True
        try:
            producer.produce(message)
        except:
            is_success = False
            del self.__kafka_connect_cache[self.cache_key]
            self.rl.error('kafka push error cacheKey is %s' % (self.cache_key))
            self.rl.exception()

        self.__lock.release()
        return is_success
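
push_message keeps one producer per cache_key and throws the cached object away when produce fails, so the next call rebuilds the connection. The same cache-under-a-lock, drop-on-error shape in a self-contained sketch (the connect/send callables are stand-ins, not project or pykafka APIs):

import threading

class CachedSender(object):
    def __init__(self, connect):
        self._lock = threading.Lock()
        self._cache = {}          # cache_key -> live connection
        self._connect = connect   # factory: key -> connection object

    def push(self, key, message):
        with self._lock:
            conn = self._cache.get(key)
            if conn is None:
                conn = self._connect(key)
                self._cache[key] = conn
            try:
                conn.send(message)
                return True
            except Exception:
                # drop the broken connection so the next push reconnects
                self._cache.pop(key, None)
                return False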
Example #8
def test_beautiful():
    # url = 'http://roll.news.qq.com'
    url ='http://politics.gmw.cn/node_9844.htm'
    r = RequestUtil()
    hu = HtmlUtil()
    html = r.http_get_phandomjs(url)
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    print "domain:",domain,":host:",host

    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:

        a_href = get_format_url(url,a,host)
        if a.text:
            print a.text
        if a_href:
            xpath = hu.get_dom_parent_xpath_js(a)

            print a_href,'_',xpath,u.get_md5(xpath)
Example #9
def test_beautiful():
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1'
    html = r.http_get_phandomjs(url)

    #html = html.decode('utf-8').encode(sys.getfilesystemencoding())
    #print html
    # BeautifulSoup: a third-party Python library for extracting data from HTML or XML files
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    aset = set()
    # get the domain
    domain = get_fld(url)
    # get the host
    host = hu.get_url_host(url)
    print 'domain==>', domain
    print 'host==>', host
    for a in a_docs:
        # get the href of the <a> tag
        a_href = get_format_url(url, a, host)
        # get the text of the <a> tag
        a_title = a.get_text().strip()
        if a_href == '' or a_title == '':
            continue

        if aset.__contains__(a_href):
            continue
        aset.add(a_href)
        # get the host of the <a> link
        a_host = hu.get_url_host(a_href)
        # get the md5 of the <a> href url
        a_md5 = u.get_md5(a_href)
        # get the xpath of the <a> tag
        a_xpath = hu.get_dom_parent_xpath_js(a)
        print("%s\t%s\t%s\t%s\t%s") % (a_title.decode("utf-8"), a_href, a_host,
                                       a_md5, a_xpath)
Example #10
def create_seed():
    url = "https://www.autohome.com.cn/all"
    catetory = "汽车"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, catetory)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example #11
def create_seed():
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    catetory = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, catetory)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example #12
def print_news_url_content(news_url):
    '''
    Print the content of the final news page
    :param news_url:
    :return:
    '''
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()

    # request the url through phandomjs and get the rendered page, including its ajax requests
    html = r.http_get_phandomjs(news_url)

    #html = html.decode('utf-8').encode(sys.getfilesystemdomainencoding())
    print html

    r.close_phandomjs()
Example #13
    def queue_items(self):
        '''
        Use a pessimistic lock plus a transaction plus a status update so that multiple
        machines fetch data serially, and return the rows wrapped as a list of
        HainiuConsumerAction instances
        '''
        select_sql = """
        select id,action,params
        from hainiu_queue where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s limit %s for update;
        """

        # update SQL, the ids are joined into the string
        update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """
        c_actions = []
        # collect the ids to update
        ids = []
        db_util = DBUtil(_HAINIU_DB)
        try:
            # sql_params = [1, 0, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
            # query params that skip rows which already failed on this ip
            ip = Util().get_local_ip()
            sql_params = [
                1, 0, ip, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'],
                _QUEUE_NEWS_FIND['LIMIT_NUM']
            ]
            # ({},{})
            res1 = db_util.read_dict(select_sql, sql_params)
            for row in res1:
                id = row['id']
                ids.append(str(id))
                act = row['action']
                params = row['params']
                c_action = NewsFindConsumerAction(id, act, params)
                c_actions.append(c_action)

            if len(ids) > 0:
                db_util.execute_no_commit(update_sql % ",".join(ids))

            db_util.commit()
        except Exception, e:
            db_util.rollback()
            traceback.print_exc()
        return c_actions
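
Outside the project's DBUtil wrapper, the same claim-items-under-a-lock flow can be sketched with MySQLdb directly (the driver these examples already use via MySQLdb.escape_string). The connection settings are placeholders, not project config, and a reachable MySQL instance with the hainiu_queue table is assumed; the point is the select ... for update and the is_work update inside one transaction:

import MySQLdb

def claim_queue_items(limit=10):
    # sketch only: host/user/passwd/db are placeholders, not project config
    conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='hainiu', charset='utf8')
    rows = []
    try:
        cur = conn.cursor()
        # pessimistic lock: the selected rows stay locked until commit/rollback
        cur.execute("select id, action, params from hainiu_queue "
                    "where type=%s and is_work=%s and fail_times<%s limit %s for update",
                    (1, 0, 3, limit))
        rows = cur.fetchall()
        ids = [str(r[0]) for r in rows]
        if ids:
            cur.execute("update hainiu_queue set is_work=1 where id in (%s)" % ",".join(ids))
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
    return rows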
Example #14
    def queue_items(self):

        # query that skips rows which already failed on this ip
        select_sql='''
        select id, action, params from web_queue where type=%s
        and is_work=%s and fail_ip != %s and fail_times < %s limit 0, %s for update;
        '''

        update_sql='''
        update web_queue set is_work=1 where id in(%s);
        '''
        db_util = DBUtil(_ZZ_DB)

        try:
            ip = Util().get_local_ip()
            sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]

            res = db_util.read_dict(select_sql, sql_params)
            actions = []

            ids = []
            for row in res:
                id = row["id"]
                ids.append(str(id))
                action = row["action"]
                params = row["params"]

                # wrap the row into an action object
                c_action = WebConsumerAction(id, action, params)
                actions.append(c_action)

            if len(actions) != 0:
                # mark the claimed rows with is_work=1
                db_util.execute_no_commit(update_sql % ",".join(ids))

            db_util.commit()

        except Exception, err:
            actions = []
            db_util.rollback()
            traceback.print_exc()

        return actions
Example #15
    def fail_action(self, values):
        # every failure updates the failing ip and the failure count
        update_sql1='''
        update web_queue set fail_ip = %s , fail_times = fail_times + 1 where id = %s;
        '''
        # once the failure count reaches this machine's maximum retry count,
        # set is_work = 0 on the record so it can be retried
        update_sql2='''
        update web_queue set is_work = 0 where id = %s;
        '''
        # update the seed table status
        update_seed_sql = '''
        update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
        '''
        # update the externally table status
        update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
        '''

        db_util = DBUtil(_ZZ_DB)

        try:
            id = values[0]
            ip = Util().get_local_ip()
            # update the failing ip and the failure count on every failure
            # queue table
            sql_params = [ip, id]
            db_util.execute_no_commit(update_sql1, sql_params)
            # seed table
            sql_params = [ip, values[1]]
            db_util.execute(update_seed_sql, sql_params)
            # externally table
            db_util.execute(update_exter_sql, sql_params)

            if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
                db_util.execute_no_commit(update_sql2 % id)

            db_util.commit()

        except Exception,err:
            db_util.rollback()
            traceback.print_exc()
Example #16
Copyright (c) 2019/3/16, Hainiu Academy, all rights reserved.
@author: 潘牛
'''

import mx.URL, sys
from tld import get_tld
from bs4 import BeautifulSoup
from lxml import etree
from commons.util.request_util import RequestUtil
from commons.util.html_util import HtmlUtil
from commons.util.util import Util

if __name__ == '__main__':
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1'
    html = r.http_get_phandomjs(url)

    dom_tree = etree.HTML(html)

    ### XPath matching
    a_text = dom_tree.xpath(
        "//div[@id='d_list']/ul[5]/li[2]/span[contains(@class,'c_tit')]/a[1]/text()"
    )
    a_href = dom_tree.xpath("//div[@id='d_list']/ul[8]/li[3]/span[2]/a/@href")
    print a_text[0]
    print a_href[0]

    # -------- local test -----------------------
    # myPage = '''<html>
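
The local-test block above is cut off. A minimal, self-contained version of the same XPath check against an inline page (only lxml is assumed; the markup and queries here are illustrative, not the sina layout):

from lxml import etree

my_page = '''<html><body>
<div id="d_list">
  <ul><li><span class="c_tit"><a href="http://example.com/1">first headline</a></span></li></ul>
  <ul><li><span class="c_tit"><a href="http://example.com/2">second headline</a></span></li></ul>
</div>
</body></html>'''

dom_tree = etree.HTML(my_page)
a_text = dom_tree.xpath("//div[@id='d_list']/ul[2]/li[1]/span[contains(@class,'c_tit')]/a[1]/text()")
a_href = dom_tree.xpath("//div[@id='d_list']/ul[2]/li[1]/span/a/@href")
print(a_text[0])   # second headline
print(a_href[0])   # http://example.com/2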
Example #17
    def action(self):
        is_success = True
        t = TimeUtil()
        f = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        values = []
        md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        now_minute = int(t.now_min())
        for i in xrange(60, -5, -5):
            if now_minute >= i:
                now_minute = i
                break
        now_minute = t.now_time(format='%Y%m%d%H') + (
            '0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))

        values.append(MySQLdb.escape_string(self.url))
        values.append(md5)
        values.append(create_time)
        values.append(create_day)
        values.append(create_hour)
        values.append('')
        values.append(MySQLdb.escape_string(self.param))
        values.append(update_time)
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)
            values[5] = domain

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            title = title_doc.contents[0] if title_doc is not None and len(
                title_doc.contents) == 1 else ''

            host = hu.get_url_host(self.url)
            values.append(host)
            values.append(MySQLdb.escape_string(title))

            # k = KafkaUtil(config._KAFKA_CONFIG)
            html = html.replace(content._SEQ1,
                                '').replace(content._SEQ2, content._SEQ4)
            # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
            # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
            # push_str = bytes(push_str)
            # is_success = k.push_message(push_str)

            is_success = True
            if is_success:
                self.save_file(create_time, f, now_minute, u, self.url, html)
            else:
                values.append('')
                values.append('')
                self.rl.error("kafka push error")

        except:
            is_success = False
            values.append('')
            values.append('')
            self.rl.exception()
        finally:
            r.close_phandomjs()

        try:
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY  UPDATE update_time=values(update_time);
                """
            else:
                ip = u.get_local_ip()
                values.append(ip)
                values.append(2)
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                    title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                    on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """

            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except:
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()

        return super(self.__class__,
                     self).result(is_success,
                                  [md5, self.url, update_time, self.queue_id])
Example #18
    def action(self):
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            out_json_srt = ''
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = self.get_format_url(a,host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if a_set.__contains__(a_href):
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                if a_title != '':
                    a_param['title'] = a_title
                    out_json_srt = json.dumps(a_param,ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)

                if a_host.__contains__(domain):
                    in_values.append(insert_values)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=update_time;
            """
            try:
                d = DBUtil(config._HAINIU_DB)
                d.execute_no_commit("set NAMES utf8mb4;")
                if in_values.__len__() != 0:
                    sql = insert_sql.replace('<table>',in_table)
                    d.executemany_no_commit(sql,in_values)
                if ex_values.__len__() != 0:
                    sql = insert_sql.replace('<table>',ex_table)
                    d.executemany_no_commit(sql,ex_values)
                d.commit()
            except:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                d.rollback()
            finally:
                d.close()

        except:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        return super(self.__class__, self).result(is_success, [main_md5,self.url,a_href,in_values.__len__(),ex_values.__len__(),self.queue_id])
Example #19
    def action(self):
        # crawl the qualifying url from hainiu_queue and collect every <a> url on the requested page
        r = RequestUtil()
        hu = HtmlUtil()
        u = Util()
        #
        is_success = True
        db_util = DBUtil(_HAINIU_DB)
        time_util = TimeUtil()
        # lists for the internal-link and external-link tables
        inner_list = []
        exter_list = []
        # get the md5 of the seed
        md5 = u.get_md5(self.act)
        try:
            # request the url through phandomjs and get the rendered page, including its ajax requests
            html = r.http_get_phandomjs(self.act)
            # BeautifulSoup: a third-party Python library for extracting data from HTML or XML files
            soup = BeautifulSoup(html, 'lxml')
            # list of <a> dom objects
            a_docs = soup.find_all("a")
            if len(a_docs) == 0:
                is_success = False
            aset = set()
            # get the domain of the seed
            domain = hu.get_url_domain(self.act)
            # get the host of the seed
            host = hu.get_url_host(self.act)

            # timestamps (create_time, create_day, create_hour, update_time)
            # create_time=time_util.get_timestamp()
            #
            # create_day = int(time_util.now_day().replace('-', ''))
            # create_hour=int(time_util.now_hour())
            # update_time=create_time
            create_time = time_util.get_timestamp()
            # day in YYYYMMDD format
            create_day = int(time_util.now_day(format='%Y%m%d'))
            # hour of day
            create_hour = int(time_util.now_hour())
            update_time = create_time

            # params_json = json.dumps(self.params, ensure_ascii=False, encoding='utf-8')

            for a_doc in a_docs:
                # get the href of the <a> tag
                a_href = hu.get_format_url(self.act, a_doc, host)
                # get the text of the <a> tag
                a_title = a_doc.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if aset.__contains__(a_href):
                    continue
                aset.add(a_href)
                # get the host of the <a> link
                a_host = hu.get_url_host(a_href)

                # get the md5 of the <a> href url
                a_md5 = u.get_md5(a_href)

                # get the xpath of the <a> tag
                a_xpath = hu.get_dom_parent_xpath_js_new(a_doc)
                # one row of data
                row_data = (self.act, md5, self.params, domain, host, a_href,
                            a_md5, a_host, a_xpath, a_title, create_time,
                            create_day, create_hour, update_time)
                if a_href.__contains__(domain):
                    inner_list.append(row_data)
                else:
                    exter_list.append(row_data)
            # store the parsed rows in the internal-link or external-link table; if the url
            # already exists, only update it (so linked pages are not crawled repeatedly)
            if len(inner_list) > 0:
                inner_insert_sql = """
              insert into hainiu_web_seed_internally
              (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
              create_day,create_hour,update_time)
              values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
              ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
                db_util.executemany_no_commit(inner_insert_sql, inner_list)
            if len(exter_list) > 0:
                exter_insert_sql = """
              insert into hainiu_web_seed_externally
              (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
              create_day,create_hour,update_time)
              values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
              ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
                db_util.executemany_no_commit(exter_insert_sql, exter_list)
            db_util.commit()
        except Exception, e:
            is_success = False
            db_util.rollback()
            traceback.print_exc()
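
The split between inner_list and exter_list above hinges on whether the link's url contains the seed's domain. A small, self-contained sketch of that classification (plain string handling only; the helper name is illustrative, not the project's HtmlUtil):

def classify_links(seed_domain, hrefs):
    # returns (internal, external) based on whether the href mentions the seed domain
    internal, external = [], []
    for href in hrefs:
        if seed_domain in href:
            internal.append(href)
        else:
            external.append(href)
    return internal, external

inner, outer = classify_links('sina.com.cn',
                              ['https://news.sina.com.cn/c/1.html',
                               'https://weibo.com/u/42'])
print(inner)  # ['https://news.sina.com.cn/c/1.html']
print(outer)  # ['https://weibo.com/u/42']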
Example #20
    def action(self):
        logger = LogUtil().get_logger("download_action", "download_action")
        # 1) download the HTML of the queued url to a file; each consumer thread starts a new file every 5 minutes.
        r = RequestUtil()
        # hu = HtmlUtil()
        u = Util()
        db_util = DBUtil(_HAINIU_DB)
        time_util = TimeUtil()
        # request the url through phandomjs and get the rendered page, including its ajax requests
        html = r.http_get_phandomjs(self.act)
        # build the record to write
        html = html.replace("\r", "").replace("\n", "\002")
        str1 = self.act + "\001" + html
        str2 = u.get_md5(str1) + "\001" + str1
        # success/failure flag
        is_success = True
        # current time
        # now_time ====> year month day, hour, minute, second
        now_time = time.strftime("%Y%m%d,%H,%M,%S").split(",")
        day = now_time[0]
        hour = now_time[1]
        minute = int(now_time[2])
        for i in range(60, -5, -5):
            if minute < i:
                continue
            minute = i
            break

        minute = '0%s' % minute if minute < 10 else minute
        now_minute = '%s%s%s' % (day, hour, minute)

        file_names = os.listdir(_LOCAL_DATA_DIR % ('tmp'))
        logger.info("file_names:%s" % file_names)
        thread_name = self.consumer_thread_name
        logger.info("thread_name:%s" % thread_name)
        last_file_name = ''
        for file_name in file_names:
            tmp = file_name.split("#")[0]
            if tmp == thread_name:
                last_file_name = file_name
                break

        now_file_name = "%s#%s" % (thread_name, now_minute)
        try:
            if last_file_name == '' or last_file_name != now_file_name:
                # move the old file
                # if last_file_name != '':
                oldPath = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
                logger.info("oldPath:%s" % oldPath)
                # if os.path.exists(oldPath) and os.path.getsize(oldPath) > 0:
                if last_file_name != '':
                    done_file_name = last_file_name + "#" + str(
                        TimeUtil().get_timestamp())
                    logger.info("last_file_name:%s" % last_file_name)
                    newPath = _LOCAL_DATA_DIR % ("done/") + done_file_name
                    logger.info("newPath:%s" % newPath)
                    shutil.move(oldPath, newPath)
                # write to a new file
                now_file_name = _LOCAL_DATA_DIR % ("tmp/") + now_file_name
                # if not os.path.exists(_LOCAL_DATA_DIR+'tmp2/'):
                #     os.mkdir(_LOCAL_DATA_DIR+'tmp2/')

                logger.info("now_file_name:%s" % now_file_name)
                f = open(now_file_name, 'a+')
                f.write(str2)
                f.close()
            else:
                last_file_name = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
                logger.info("last_file_name:%s" % last_file_name)
                # add a newline when appending to the existing file
                insert_str = "\n" + str2
                f = open(last_file_name, 'a+')
                f.write(insert_str)
                f.close()
        except Exception, e:
            is_success = False
            traceback.print_exc()
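
The range(60, -5, -5) loop above just rounds the current minute down to the nearest 5-minute boundary to build the rotating thread_name#YYYYMMDDHHMM file name. A self-contained sketch of that bucket computation (the modulo form is equivalent to the loop):

import time

def five_minute_bucket(ts=None):
    # e.g. 14:23 -> '...1420'; this drives the 5-minute file rotation
    t = time.localtime(ts) if ts is not None else time.localtime()
    minute = t.tm_min - t.tm_min % 5
    return time.strftime('%Y%m%d%H', t) + ('0%s' % minute if minute < 10 else str(minute))

print(five_minute_bucket())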
Example #21
    def action(self, *values):

        # sql for inserting into the internal-link table
        insert_seed_internally='''
        insert into web_seed_internally
        (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
        '''
        # sql for inserting into the external-link table
        insert_seed_externally='''
        insert into web_seed_externally
        (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
        '''

        # time helper
        a_time = TimeUtil()
        db_util = DBUtil(_ZZ_DB)
        # redis_d = RedisUtill()
        total_count = 0
        in_count = 0
        ex_count = 0
        try:
            # parse the seed page info
            hu = HtmlUtil()

            domain = hu.get_url_domain(self.act)
            host = hu.get_url_host(self.act)
            u = Util()
            md5 = u.get_md5(self.act)

            # parse the <a> tags
            r = RequestUtil()
            # request the url through phandomjs and get the rendered page, including its ajax requests
            html = r.http_get_phandomjs(self.act)
            # BeautifulSoup: a third-party Python library for extracting data from HTML or XML files
            soup = BeautifulSoup(html, 'lxml')
            # set of <a> urls seen so far
            aset = set()
            # get the host
            a_host = hu.get_url_host(self.act)

            # a_docs = soup.find_all("a",href=re.compile("^(/|.*"+domain+")"))
            a_docs = soup.find_all("a")

            for a in a_docs:

                total_count += 1
                # get the href of the <a> tag
                a_url = hu.get_format_url(self.act,a,a_host)
                # get the text of the <a> tag
                a_title = a.get_text().strip()
                if a_url == '' or a_title == '':
                    continue

                if aset.__contains__(a_url):
                    continue
                aset.add(a_url)

                # get the host of the <a> link
                a_host = hu.get_url_host(a_url)

                # get the md5 of the <a> href url
                a_md5 = u.get_md5(a_url)

                # get the xpath of the <a> tag
                a_xpath = hu.get_dom_parent_xpath_js_new(a)

                create_time = a_time.get_timestamp()
                create_day = int(a_time.now_day(format='%Y%m%d'))
                create_hour = int(a_time.now_hour())

                params_sql = [self.act,md5,self.params,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,create_time,0]
                if re.compile("^(/|.*"+domain+")").match(a_url) is not None:
                    db_util.execute(insert_seed_internally, params_sql)
                #
                #     # redis
                #     redis_md5 = u.get_md5(md5+"\001"+a_md5)
                #     find_key = redis_d.get_value_for_key('seed:%s:a_url' % redis_md5)
                #     if find_key == None:
                #         # url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status
                #         dicts = {'seed:%s:param' % redis_md5 :self.params, 'seed:%s:a_url' % redis_md5 : a_url,
                #                 'seed:%s:md5' % redis_md5 : md5, 'seed:%s:a_md5' % redis_md5 :a_md5}
                #
                #         dicts_temp = {'seed_temp:%s:param' % redis_md5 :self.params,'seed_temp:%s:a_url' % redis_md5 : a_url,
                #                     'seed_temp:%s:md5' % redis_md5 : md5, 'seed_temp:%s:a_md5' % redis_md5 : a_md5}
                #         redis_d.set_batch_datas(dicts)
                #         redis_d.set_batch_datas(dicts_temp)

                    in_count += 1
                else:
                    db_util.execute(insert_seed_externally, params_sql)
                    ex_count += 1

            r.close_phandomjs()
        except Exception, err:
            db_util.rollback()
            traceback.print_exc()