Example #1
def save_video_info(release_time='',
                    content='',
                    url='',
                    author='',
                    title='',
                    image_url='',
                    site_name='',
                    play_count=None,
                    comment_count=None,
                    praise_count=None,
                    summary='',
                    time_length=None):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'uuid': tools.get_uuid(title, domain),
        'site_name': site_name,
        'image_url': image_url,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'play_count': play_count,
        'comment_count': comment_count,
        'praise_count': praise_count,
        'time_length': time_length,
        'record_time': tools.get_current_date(),
        'summary': summary
    }
    log.debug(tools.dumps_json(content_info))

    es.add('video_news', content_info, content_info['uuid'])
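
All of these examples rely on the project's tools helpers. For reference, here is a minimal sketch of what tools.get_domain and tools.get_uuid might look like (hypothetical re-implementations for illustration, not the library's actual code), assuming get_domain extracts the host of a URL and get_uuid derives a deterministic id from a title and a domain:

import uuid
from urllib.parse import urlparse


def get_domain(url):
    # Host part of a URL, e.g. 'https://v.qq.com/x/page/a.html' -> 'v.qq.com'
    return urlparse(url).netloc


def get_uuid(key1, key2):
    # Deterministic UUID derived from two keys (here: title and domain), so the
    # same article always maps to the same document id and can be de-duplicated.
    return str(uuid.uuid5(uuid.NAMESPACE_URL, '%s.%s' % (key1, key2)))

Under that assumption, the uuid passed to es.add in Example #1 presumably acts as the Elasticsearch document id, so re-crawling the same title on the same domain overwrites the existing record instead of inserting a duplicate.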
Example #2
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "View more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                '''%(uuid, title, author, release_time, website_domain, url, '...'))

            # Save to database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name , website_domain, website_position, url, content)

                if not is_continue:
                    break
    else:
        # Loop finished without break: every article on this page was saved, so crawl the next page
        offset += 50
        url = tools.replace_str(root_url, r'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
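
Example #2 pages through Baidu News results by rewriting the pn= query parameter of the current URL. Assuming tools.replace_str is a thin wrapper over a regex substitution (an assumption for illustration, not confirmed by the source), building the next-page URL looks roughly like this:

import re


def replace_str(source, regex, replacement):
    # Hypothetical stand-in for tools.replace_str: plain regex substitution.
    return re.sub(regex, replacement, source)


root_url = 'http://news.baidu.com/ns?word=spark&pn=0&cl=2'  # illustrative URL
offset = 50
next_page = replace_str(root_url, r'pn=\d*', 'pn=%d' % offset)
print(next_page)  # http://news.baidu.com/ns?word=spark&pn=50&cl=2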
Example #3
def add_website_info(table, site_id, url, name, domain='', ip='', address='', video_license='', public_safety='', icp=''):
    '''
    @summary: Add website info
    ---------
    @param table: table name
    @param site_id: website id
    @param url: website URL
    @param name: website name
    @param domain: domain
    @param ip: server ip
    @param address: server address
    @param video_license: online audio-visual license number
    @param public_safety: public security filing number
    @param icp: ICP filing number
    ---------
    @result:
    '''

    # Programmatically obtain domain, ip, address, video_license, public_safety, icp, etc.
    domain = tools.get_domain(url)

    site_info = {
        'site_id': site_id,
        'name': name,
        'domain': domain,
        'url': url,
        'ip': ip,
        'address': address,
        'video_license': video_license,
        'public_safety': public_safety,
        'icp': icp,
        'read_status': 0,
        'record_time': tools.get_current_date()
    }
    mongodb.add(table, site_info)
Example #4
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    # Fetch the URLs matched by the search keyword
    start = 0
    while True:
        urls = mg.search_url(query=root_url,
                             num=50,
                             start=start,
                             pause=random.randint(MIN_SLEEP_TIME,
                                                  MAX_SLEEP_TIME))
        if not urls:
            break

        for url in urls:
            url = url.replace('amp;', '')

            article_extractor = ArticleExtractor(url)
            content = title = release_time = author = website_domain = ''
            content = article_extractor.get_content()
            if content:
                title = article_extractor.get_title()
                release_time = article_extractor.get_release_time()
                author = article_extractor.get_author()
                website_domain = tools.get_domain(url)
                uuid = tools.get_uuid(title, website_domain)
                website_name = ''
                website_position = 35  # 境外

                log.debug('''
                    uuid         %s
                    title        %s
                    author       %s
                    release_time %s
                    domain       %s
                    url          %s
                    content      %s
                    ''' % (uuid, title, author, release_time, website_domain,
                           url, '...'))

                # Save to database
                if tools.is_have_chinese(content):
                    is_continue = self_base_parser.add_news_acticle(
                        uuid, title, author, release_time, website_name,
                        website_domain, website_position, url, content)

                    if not is_continue:
                        break
        else:
            # Loop finished without break: every article on this page was saved, fetch the next page
            start += 50
            continue
        # The inner loop broke on an already-saved article: stop paging
        break

    base_parser.update_url('google_news_urls', root_url, Constance.DONE)
Example #5
def add_html_url(html, depth, spider_depth, website_url, website_name,
                 website_domain, remark):
    # Go one level deeper and collect URLs to crawl
    if depth < spider_depth - 1:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
Example #6
    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # Fetch a page of tasks
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results: break

            # Assemble each task into a JSON-style url dict
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks
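
The Oracle query in get_task_from_oracle pages over rownum: the inner query keeps rows with rownum below offset + ONE_PAGE_SIZE and the outer query keeps rows with r >= offset, so each pass of the while loop slides the window forward until a window comes back empty. A quick illustration of the windows it generates, assuming ONE_PAGE_SIZE = 50:

ONE_PAGE_SIZE = 50  # assumed page size for illustration

offset = 0
for _ in range(3):
    print('inner: rownum < %d    outer: r >= %d' % (offset + ONE_PAGE_SIZE, offset))
    offset += ONE_PAGE_SIZE
# inner: rownum < 50    outer: r >= 0
# inner: rownum < 100    outer: r >= 50
# inner: rownum < 150    outer: r >= 100

Because rownum starts at 1, the very first window returns at most ONE_PAGE_SIZE - 1 rows; later windows return up to ONE_PAGE_SIZE rows each.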
Example #7
def save_baidu_info(release_time='',
                    content='',
                    url='',
                    author='',
                    title='',
                    is_debug=False):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': release_time,
    }
    log.debug(tools.dumps_json(content_info))
Example #8
def is_have_video_by_site(url):
    '''
    @summary: Decide based on features specific to the site
    ---------
    @param url:
    ---------
    @result:
    '''

    domain = tools.get_domain(url)

    feas = db.find('FeaVideo_site', {'domain': domain})

    for fea in feas:
        video_fea = fea['video_fea'].split(',')

        if tools.get_info(url, video_fea):
            return True

    return False
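
The per-site features in Example #8 are stored in the FeaVideo_site collection as a comma-separated list of patterns per domain, and the URL presumably counts as a video page if any pattern matches it. A self-contained sketch of that matching step, with a made-up feature record (the domain and patterns are purely illustrative):

import re

# Hypothetical FeaVideo_site document; field values are invented for illustration.
fea = {'domain': 'v.example.com', 'video_fea': '/video/,vid='}

url = 'https://v.example.com/video/123456.html'
video_feas = fea['video_fea'].split(',')

# tools.get_info presumably tests each pattern against the URL; a plain
# regex search stands in for it here.
has_video = any(re.search(pattern, url) for pattern in video_feas)
print(has_video)  # True, because the URL contains '/video/'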
Example #9
def add_root_url(parser_params={}):
    log.debug('''
        Add root url
        parser_params : %s
        ''' % str(parser_params))

    for task in parser_params:
        website_id = task[0]
        website_name = task[1]
        website_position = task[2]
        website_url = task[3]
        website_domain = tools.get_domain(website_url)

        base_parser.add_url('news_urls',
                            SITE_ID,
                            website_url,
                            remark={
                                'website_name': website_name,
                                'website_position': website_position,
                                'website_url': website_url,
                                'website_domain': website_domain
                            })
Example #10
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # Go one level deeper and collect URLs to crawl
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls',
                                    SITE_ID,
                                    url,
                                    depth + 1,
                                    remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls',
                                    SITE_ID,
                                    url,
                                    depth + 1,
                                    remark=remark)

    # Parse the page
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        uuid = tools.get_uuid(
            title,
            website_domain) if title != website_name else tools.get_uuid(
                root_url, ' ')

        log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            website_name %s
            domain       %s
            position     %s
            url          %s
            content      %s
            ''' % (uuid, title, author, release_time, website_name,
                   website_domain, website_position, root_url, content))

        if tools.is_have_chinese(content):
            # Save to database
            self_base_parser.add_news_acticle(uuid, title, author,
                                              release_time, website_name,
                                              website_domain, website_position,
                                              root_url, content)

    log.debug('%s processed' % root_url)
    base_parser.update_url('news_urls', root_url, Constance.DONE)
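
Example #10 resolves every href it scrapes against website_url before queueing it, so relative links become absolute ones. Assuming tools.get_full_url behaves like urllib.parse.urljoin (an assumption for illustration), that expansion step is roughly:

from urllib.parse import urljoin

website_url = 'http://news.example.com/channel/index.html'  # illustrative root URL

# hrefs as they might come back from tools.get_urls(html)
hrefs = ['/2024/01/article.html', 'http://other.example.org/a.html']

for href in hrefs:
    print(urljoin(website_url, href))
# http://news.example.com/2024/01/article.html
# http://other.example.org/a.html

Links pointing at another domain are only re-queued when website_domain appears in the URL, which keeps the crawl on the target site; the '百度新闻' (Baidu News) seed is the exception, since its hits may point anywhere.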