Example #1
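# NOTE: these examples appear to come from a spider framework; `tools`,
# `base_parser`, `log`, `Constance`, and module-level constants such as
# SITE_ID and search_type are assumed to be supplied by that framework
# (their definitions are not shown in this listing).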
def inner_add_url(url):
    # Read the total page count from the embedded
    # `pg.pageCount = parseInt('N',10)` script snippet, then queue every page.
    html = tools.get_html_by_urllib(url)
    regexs = r"pg.pageCount = parseInt\('(\d*?)',10\)"
    pages = tools.get_info(html, regexs)
    pages = int(pages[0])
    for i in range(1, pages + 1):
        new_url = url + '=%d' % i
        base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
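# Hypothetical usage (the URL is illustrative, not from the original code):
#   inner_add_url('http://example.com/search?page')  # queues page=1 .. page=N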
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    # NOTE: the Chinese literals below are page text the regexes must match;
    # leave them untranslated.
    title = ['<tr height="25"><td><a href=".*?"  title="(.*?)"']
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?)&nbsp']
    file_size = ['资料大小: </span>(.*?)&nbsp']
    download_count = ['下载次数: </span>(.*?)&nbsp']

    titles = tools.get_info(html, title, allow_repeat = True)
    video_urls = tools.get_info(html, video_url, allow_repeat = True)
    authors = tools.get_info(html, author, allow_repeat = True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat = True)
    file_sizes = tools.get_info(html, file_size, allow_repeat = True)
    download_counts = tools.get_info(html, download_count, allow_repeat = True)

    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            title:          %s
            video url:      %s
            author:         %s
            watched count:  %s
            file size:      %s
            download count: %s
        ''' % (title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '',
            remark['search_keyword1'],
            remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title, file_size = file_size,
                                     file_name = title, author = author, watched_count = watched_count,
                                     download_count = download_count, search_type = search_type,
                                     keyword = contained_key, keyword_count = contained_key_count, task_id = remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.luzhou.ccoo.cn/tieba/"
    html = tools.get_html_by_urllib(url, 'gb2312')
    regexs = '<input name="maxpage" value="(\d*?)" type="hidden" />'
    pages = tools.get_info(html, regexs)
    pages = int(pages[0])
    for x in range(1, pages + 1):
        url = 'http://www.luzhou.ccoo.cn/tieba/index-0-%d-1.html' % x
        html2 = tools.get_html_by_urllib(url, code='gb2312')
        regexs = '<div class="topic_show_l">(.*?)</a></div>.*?</div>.*?</div>'
        htmls2 = tools.get_info(html2, regexs)
        for ever_html in htmls2:
            regex = '<a href="(.*?)".*?</a>'
            topic_url = tools.get_info(ever_html, regex)
            topic_url = 'http://www.luzhou.ccoo.cn' + topic_url[0]
            base_parser.add_url('op_urls', SITE_ID, topic_url, depth=1)
def add_root_url(parser_params={}):  # the mutable default is safe here: it is only read
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']

    remark = parser_params

    # Combine the two keyword lists pairwise. The `else` on this `for` always
    # runs (there is no `break`); it falls back to whichever list is non-empty
    # when the other one is empty.
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())
    else:
        if not search_keyword1:
            search_keywords = search_keyword2
        if not search_keyword2:
            search_keywords = search_keyword1

    for search_keyword in search_keywords:
        if not search_keyword.strip():
            continue
        # get the page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']  # tested against the 0-page case
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)  # debug output

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
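# Sketch of what the keyword combination above yields, assuming plain Python
# lists as inputs (values are illustrative):
#   search_keyword1 = ['A', 'B'], search_keyword2 = ['x']  ->  ['Ax', 'Bx']
#   search_keyword1 = [],         search_keyword2 = ['x']  ->  ['x']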
Example #5
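# This example uses the standard-library `re` module for the URL checks below;
# the import was not shown in the original snippet.
import re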
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Queue every outgoing link, normalizing relative URLs against the site root.
    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.lzzjw.com' + url
        else:
            new_url = 'http://www.lzzjw.com/' + url

        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '<h3>时间:(.*?) 点击'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Author
    regexs = '<div id="copy">作者:(.*?)来源'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Source
    regexs = ' <div id="copy">作者:.*? 来源:(.*?)</div>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # View count: the page fills it in from a separate counter script, so fetch
    # count.asp with the article ID and pull the number out of the returned JS.
    regexs = 'ID=(.*)'
    times_script_url = tools.get_info(source_url, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.lzzjw.com/js/count.asp?id=' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = r"'(\d*?)'"
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div id="content">(.*?)<div id="copy">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)