Example No. 1
    def __parse_account_info(self, data, req_url):

        __biz = tools.get_param(req_url, "__biz")

        regex = 'id="nickname">(.*?)</strong>'
        account = tools.get_info(data, regex, fetch_one=True).strip()

        regex = 'profile_avatar">.*?<img src="(.*?)"'
        head_url = tools.get_info(data, regex, fetch_one=True)

        regex = 'class="profile_desc">(.*?)</p>'
        summary = tools.get_info(data, regex, fetch_one=True).strip()

        # Verification info (accounts opened directly via "view history messages" carry no verification info)
        regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
        verify = tools.get_info(data, regex, fetch_one=True)
        verify = verify.strip() if verify else ""

        # QR code
        regex = 'var username = "" \|\| "(.*?)";'  # the "||" needs escaping
        qr_code = tools.get_info(data, regex, fetch_one=True)
        qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code

        account_data = {
            "__biz": __biz,
            "account": account,
            "head_url": head_url,
            "summary": summary,
            "qr_code": qr_code,
            "verify": verify,
            "spider_time": tools.get_current_date(),
        }

        if account_data:
            data_pipeline.save_account(account_data)
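
(Note: every example on this page relies on a project-specific tools.get_info helper that is not shown here. The sketch below is only an assumption of what such a helper does - a thin wrapper over re that accepts one pattern or a list of patterns plus fetch_one / allow_repeat flags; the real signature in the tools module may differ.)

import re

def get_info(html, regexs, allow_repeat=True, fetch_one=False):
    # Hypothetical stand-in for tools.get_info: try each regex in turn and
    # return the captured text (fetch_one=True) or all matches (a list).
    if isinstance(regexs, str):
        regexs = [regexs]
    for regex in regexs:
        if fetch_one:
            match = re.search(regex, html, re.S)
            if match:
                return match.group(1) if match.groups() else match.group(0)
        else:
            result = re.findall(regex, html, re.S)
            if result:
                return result if allow_repeat else list(set(result))
    return '' if fetch_one else []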
Example No. 2
    def get_title(self):
        title = ''

        # Handle special sites with irregular titles
        for domain, regex in SPECIAL_TITLE.items():
            if domain in self._url:
                title = tools.get_info(self._html, regex, fetch_one=True)
                break

        if not title:
            regex = '(?i)<title.*?>(.*?)</title>'
            title = tools.get_info(self._html, regex, fetch_one=True)
            title = title[:title.find('_')] if '_' in title else title
            title = title[:title.find('-')] if '-' in title else title
            title = title[:title.find('|')] if '|' in title else title

            if not title:
                regexs = [
                    '<h1.*?>(.*?)</h1>', '<h2.*?>(.*?)</h2>',
                    '<h3.*?>(.*?)</h3>', '<h4.*?>(.*?)</h4>'
                ]
                title = tools.get_info(self._html, regexs, fetch_one=True)

        title = tools.del_html_tag(title)
        return title
Example No. 3
def is_have_video_by_judge(title, content):
    '''
    @summary: judge whether there is a video, based on title and content (positive/negative keyword features)
    ---------
    @param title:
    @param content:
    ---------
    @result:
    '''

    text = title + content

    feas = db.find('FeaVideo_judge')

    for fea in feas:
        not_video_fea = fea['not_video_fea'].split(',')
        video_fea = fea['video_fea'].split(',')

        if tools.get_info(text, not_video_fea):
            return False

        if tools.get_info(text, video_fea):
            return True

    return False
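
(For context: is_have_video_by_judge assumes each document in the FeaVideo_judge collection carries two comma-separated keyword lists. A made-up document of that shape, purely to illustrate the assumed schema:)

# Hypothetical FeaVideo_judge document; the field names match the code above,
# the keyword values are invented for illustration.
fea_doc = {
    'video_fea': '视频,播放,直播',      # features that indicate a video
    'not_video_fea': '图集,组图,漫画',  # features that rule a video out
}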
Example No. 4
def add_root_url(parser_params={}):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(parser_params))

    url = 'http://www.1kkk.com'
    html = tools.get_html_by_urllib(url)
    regex = '<li class="">.*?href="(.*?)" target="_parent"><span>.*?</span></a></li>'
    infos = tools.get_info(html, regex)
    china_cartoon = ['//manhua-china//']
    infos = infos + china_cartoon
    for info in infos:
        info = info[:-1]
        url = 'http://www.1kkk.com' + info
        url_fenye = url + '-p'
        urls = url + '-p1'
        html = tools.get_html_by_urllib(urls)
        page_count = '\.\.\.<a href=".*?">(.*?)</a><a href=".*?">下一页</a>'
        page_count = tools.get_info(html, page_count)
        if not page_count:
            while url:
                html = tools.get_html_by_urllib(url)
                # Follow the "next page" link; stop when there is none,
                # otherwise the loop would keep re-fetching the site root forever.
                regex = '<div id="search_fy">.*<a href="(.*?)" style=\'padding: 5px 20px; margin: 0 8px;\'> 下一页 </a>'
                next_page = ''.join(tools.get_info(html, regex))
                if not next_page:
                    break
                url = 'http://www.1kkk.com' + next_page
                base_parser.add_url('WP_urls', SITE_ID, url)
        else:
            page_count = int(''.join(page_count))
            for page in range(1, page_count + 1):
                url = url_fenye + '%d' % page
                base_parser.add_url('WP_urls', SITE_ID, url)
Example No. 5
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html == None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">'
              ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             '''%(depth+1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example No. 6
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
                处理: %s
                url : %s''' % (keyword, url))
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'v-meta-data'})

            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info),
                                           'src="(.+?)"',
                                           fetch_one=True)
                image_url = 'http:' + image_url
                print(image_url)
                title = tools.get_info(str(video_info),
                                       'alt="(.+?)"',
                                       fetch_one=True)
                print(title)
                url = tools.get_info(str(video_list_url[info_index]),
                                     'href="(.+?)"',
                                     fetch_one=True)
                url = 'http:' + url
                print(url)
                release_time = tools.get_info(str(
                    video_list_time[info_index * 2 + 1]),
                                              'lass="r">(.+?)<',
                                              fetch_one=True)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)

                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
Example No. 7
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        print(keyword)
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)

            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'a',
                                             {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'result_info'})
            if not video_list_time:
                print('无视频列表  跳出')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(
                        video_list_title[info_index]),
                                               'src="(.+?)"',
                                               fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]),
                                           'title="(.+?)"',
                                           fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]),
                                         'href="(.+?)"',
                                         fetch_one=True)
                    release_time = tools.get_tag(
                        video_info,
                        'em', {
                            'class': 'result_info_desc'
                        },
                        find_all=False).get_text()
                    is_continue = base_parser.save_video_info(
                        image_url=image_url,
                        url=url,
                        title=title,
                        release_time=release_time,
                        site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break

                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
Example No. 8
    def get_author(self):
        # First try matching without stripping tags
        author = tools.get_info(self._text, AUTHOR_REGEXS_TEXT, fetch_one = True)

        if not author: # no match; strip tags and try again (some pages put tags between the label and the author name)
            author = tools.get_info(self.__replace_str(self._text, '<(.|\n)*?>', ' '), AUTHOR_REGEXS_TEXT, fetch_one = True)

        if not author: # still no match; fall back to matching the author field in the html
            author = tools.get_info(self._html, AUTHOR_REGEX_TAG, fetch_one = True)

        return author
Example No. 9
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html == None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html, STOP_URLS)

    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example No. 10
    def __parse_account_info(self, data, req_url):
        '''
        @summary:
        ---------
        @param data:
        ---------
        @result:
        '''
        __biz = tools.get_param(req_url, '__biz')
        WechatAction._current_account_biz = __biz

        regex = 'id="nickname">(.*?)</strong>'
        account = tools.get_info(data, regex, fetch_one=True).strip()

        regex = 'profile_avatar">.*?<img src="(.*?)"'
        head_url = tools.get_info(data, regex, fetch_one=True)

        regex = 'class="profile_desc">(.*?)</p>'
        summary = tools.get_info(data, regex, fetch_one=True).strip()

        # Verification info (accounts opened directly via "view history messages" carry no verification info)
        regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
        verify = tools.get_info(data, regex, fetch_one=True)
        verify = verify.strip() if verify else ''

        # QR code
        regex = 'var username = "" \|\| "(.*?)";'  # the "||" needs escaping
        qr_code = tools.get_info(data, regex, fetch_one=True)
        qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code

        account_info = {
            '__biz': __biz,
            'account': account,
            'head_url': head_url,
            'summary': summary,
            'qr_code': qr_code,
            'verify': verify,
            'account_id': WechatAction._account_info.pop(__biz)
            if __biz in WechatAction._account_info.keys() else '',
            'record_time': tools.get_current_date()
        }

        if not WechatAction._wechat_service.is_exist('wechat_account', __biz):
            WechatAction._wechat_service.add_account_info(account_info)
Example No. 11
    def get_release_time_old(self):

        if self._content_start_pos and self._content_end_pos:
            content = self.__replace_str('\n'.join(self._paragraphs[self._content_start_pos  - RELEASE_TIME_OFFSET: self._content_end_pos + RELEASE_TIME_OFFSET]), '<(.|\n)*?>', '<>')
        else:
            content = self.__replace_str(self._text, '<(.|\n)*?>', '<>')

        release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one = True)
        if not release_time:
            release_time = tools.get_info(self.__replace_str(self._text, '<(.|\n)*?>', '<>'), DAY_TIME_REGEXS, fetch_one = True)

        release_time = tools.format_date(release_time)

        return release_time
Example No. 12
    def get_article_content(self, data, req_url):
        log.debug('获取文章内容')

        if data:  # articles flagged as unverified return no content on the first request and redirect to https://mp.weixin.qq.com/mp/rumor
            req_url = req_url.replace('amp;', '')
            mid = tools.get_param(req_url, 'mid') or tools.get_param(
                req_url, 'appmsgid')  # message id; articles published on the same day share the same mid
            idx = tools.get_param(req_url, 'idx') or tools.get_param(
                req_url, 'itemidx')  # position of the article within the post, starting from 1
            article_id = mid + idx  # mid + idx identifies a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
            WechatAction._current_aritcle_id = article_id  # remember the current article id so comment data can be matched to it later
            print('当前id' + WechatAction._current_aritcle_id)
            regex = '(<div class="rich_media_content ".*?)<script nonce'
            content = tools.get_info(data, regex, fetch_one=True)
            if content:
                # Cache the article content
                WechatAction._article_info[article_id]['content'] = content
                # Extract the account name
                regex = '<title>(.*?)</title>'
                account = tools.get_info(data, regex, fetch_one=True)
                WechatAction._article_info[article_id]['account'] = account

            else:  # articles flagged as false never request view/like counts, so save them to the db right away
                regex = '<title>(.*?)</title>'
                content = tools.get_info(data, regex, fetch_one=True)
                WechatAction._article_info[article_id]['content'] = content

                # Save to the db
                print('被验证不实的文章,不会请求观看点赞数,此时直接入库')
                WechatAction._wechat_service.add_article_info(
                    WechatAction._article_info.pop(article_id))

            # If the next page is the article-list link, refresh the appmsg_token in it so the list link does not expire
            if (len(WechatAction._todo_urls)
                    == 1) and ('/mp/profile_ext'
                               in WechatAction._todo_urls[-1]):
                regex = 'appmsg_token = "(.*?)"'
                appmsg_token = tools.get_info(data, regex,
                                              fetch_one=True).strip()

                WechatAction._todo_urls[-1] = tools.replace_str(
                    WechatAction._todo_urls[-1], 'appmsg_token=.*?&',
                    'appmsg_token=%s&' % appmsg_token)

            return self.__open_next_page()

        else:
            # No article content
            pass
Example No. 13
def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one = True)
        program_id = program_url[program_url.find('b/') + 2 : program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html'%program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one = True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one = True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one = True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one = True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            '''%(program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url, image_url, episode, directors = '', actors = actors, summary = summary, release_time = '')

        # URL for the episode list; without a month parameter it returns the most recent month's data
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
Example No. 14
    def is_have_new_article(self, account_id='', account=''):
        '''
        @summary: check whether the official account has published an article today
        ---------
        @param account_id:
        @param account:
        ---------
        @result:
        '''

        account_block = self.__get_account_blocks(account_id, account)
        if account_block == constance.VERIFICATION_CODE:
            return constance.VERIFICATION_CODE

        regex = "timeConvert\('(\d*?)'\)"
        release_time = tools.get_info(account_block, regex, fetch_one=True)

        if release_time:
            release_time = int(release_time)
            release_time = tools.timestamp_to_date(release_time)
            log.debug("最近发文时间 %s" % release_time)

            if release_time >= tools.get_current_date('%Y-%m-%d'):
                return constance.UPDATE
            else:
                return constance.NOT_UPDATE

        else:
            return constance.ERROR
Example No. 15
def add_root_url(parser_params = {}):
    log.debug('''
        添加根url
        parser_params : %s
        '''% str(parser_params))

    _db = base_parser.MongoDB()
    _db.set_unique_key('PROGRAM_EPISODE_info', 'episode_url')
    _db.update('PROGRAM_urls', {'depth': 0, 'site_id': SITE_ID}, {'status': 0}, multi=True)

    for page_num in range(1, 14):
        urls = [
                'http://list.youku.com/category/show/c_85_g_热门网综_s_1_d_1_p_%d.html' % page_num,
                'http://list.youku.com/category/show/c_97_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
                'http://list.youku.com/category/show/c_96_g_优酷出品_s_1_d_1_p_%d.html' % page_num,
                ]
        for url in urls:
            print(url)
            print('********************************************************')
            html = tools.get_html_by_urllib(url)
            if tools.get_info(html, ['小酷没有筛选到相关视频']):
                continue
            links = tools.get_tag(html, 'div', {'class': 'p-thumb'})
            for link in links:
                try:
                    link = link.a['href']
                    link = tools.get_full_url('http:', link)
                    link_html = tools.get_html_by_urllib(link)
                    link = tools.get_tag(link_html, 'a', {'class': 'desc-link'}, find_all=False)
                    link = link['href']
                    link = tools.get_full_url('http:', link)
                    base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=0)
                except Exception as e:
                    log.error(e)
                    print(link_html)
Example No. 16
def parser_program_url(url_info):
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    classify = remark['classify']

    # Parse
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    program_blocks = tools.get_tag(html, 'li', {'class': "list_item"})
    for program_block in program_blocks:
        program_block = str(program_block)

        # Address
        regex = 'r-props="{id: \'(.*?)\''
        program_id = tools.get_info(program_block, regex, fetch_one=True)
        program_url = 'http://v.qq.com/detail/5/%s.html' % program_id
        base_parser.add_url("PROGRAM_urls",
                            site_id,
                            program_url,
                            depth=1,
                            remark={
                                'program_id': program_id,
                                'classify': classify
                            })

    base_parser.update_url("PROGRAM_urls", root_url, Constance.DONE)
Example No. 17
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        添加根url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2

    for search_keyword in search_keywords:
        # Get the page count
        url = 'https://movie.douban.com/subject_search?start=0&search_text=%s&cat=1002' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = '<div class="paginator">.*<a href.*?>(.*?)</a><span class="next"'
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'https://movie.douban.com/subject_search?start=%d&search_text=%s&cat=1002' % (
                page * 15, search_keyword)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
Example No. 18
    def __init__(self, remote_url, local_save_path, project_path, main_lnk_paths, sync_files = [], ignore_files = []):
        '''
        @summary: init for the code updater
        ---------
        @param remote_url: remote release url of the code
        @param local_save_path: local download path for the code
        @param project_path: local project path
        @param main_lnk_paths: shortcut paths of the local project's launchers
        @param sync_files: files to sync; .* means sync everything
        @param ignore_files: files to ignore
        ---------
        @result:
        '''

        self._remote_url = remote_url
        self._local_save_path = local_save_path
        self._project_path = project_path
        self._main_lnk_paths = main_lnk_paths
        self._sync_files = sync_files
        self._ignore_files = ignore_files

        self._remote_zip_url = ''
        self._tag = ''
        self._zip_path = ''
        self._unpack_path = ''

        self._project_name = tools.get_info(remote_url, '/([^/]*?)/releases', fetch_one = True)
        self._tag_json = tools.get_json(tools.read_file(VERSION_FILE)) or {}
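
(A hedged usage sketch of this initializer; the class name UpdateCode, the release URL and every path below are invented for illustration only:)

updater = UpdateCode(                                            # class name assumed
    remote_url='https://github.com/example/spider/releases',    # hypothetical release page
    local_save_path='D:/code_updates',
    project_path='D:/projects/spider',
    main_lnk_paths=['C:/Users/Public/Desktop/spider.lnk'],
    sync_files=['.*'],            # sync everything
    ignore_files=['config.py'],   # but keep the local config
)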
Example No. 19
def add_root_url(search_keyword1=[], search_keyword2=[], search_keyword3=[]):
    log.debug(
        '''
        添加根url
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keywords = search_keyword1 + search_keyword2

    for search_keyword in search_keywords:
        # Get the page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']  # 测试0页
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0
        print(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (
                search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
Example No. 20
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return

    for header in headers:
        # "View more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one = True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth = 1, remark = {'offset':0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain =''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                '''%(uuid, title, author, release_time, website_domain, url, '...'))

            # Save to the db
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time, website_name , website_domain, website_position, url, content)

                if not is_continue:
                    break
    else:
        # The loop finished without break: every item on this page was saved, so queue the next page
        offset += 50
        url = tools.replace_str(root_url, 'pn=\d*', 'pn=%d'%offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth = 0, remark = {'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
Example No. 21
 def inner_add_url(url):
     html = tools.get_html_by_urllib(url)
     regexs = 'pg.pageCount = parseInt\(\'(\d*?)\',10\)'
     pages = tools.get_info(html, regexs)
     pages = int(pages[0])
     for i in range(1, pages + 1):
         new_url = url+'=%d' % i
         base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
Example No. 22
    def get_content1(self):
        '''
        Method 1
        @summary:
        Locate the main text by text density:
            1. Strip the html tags and remove all whitespace other than spaces and newlines
            2. Sum the length of every n consecutive paragraphs, as a measure of local text density
            3. Take the densest region as the initial start and end of the main text
            4. From the start position, search upward until the text density drops to the threshold; that is the real start. The threshold is the minimum density value
            5. From the end position, search downward until the text density drops to the threshold; that is the real end. The threshold is the minimum density value

        Filtering out home/index pages and other noise:
            1. Real articles usually contain <p> tags, so the ratio of text inside <p> tags to the whole extracted text must exceed a threshold
        To be solved:
            pagination, e.g. http://mini.eastday.com/a/171205202028050-3.html
        ---------
        ---------
        @result:
        '''
        if USEFUL_TAG:
            html = self.__replace_str(self._text, r'(?!{useful_tag})<(.|\n)+?>'.format(useful_tag = '|'.join(USEFUL_TAG)))
        else:
            html = self.__replace_str(self._text, '<(.|\n)*?>')
        paragraphs = html.split('\n')
        # for i, paragraph in enumerate(paragraphs):
        #     print(i, paragraph)

        # Text density over n consecutive paragraphs
        paragraph_lengths = [len(self.__del_html_tag(paragraph)) for paragraph in paragraphs]
        # paragraph_lengths = [len(paragraph.strip()) for paragraph in paragraphs]
        paragraph_block_lengths = [sum(paragraph_lengths[i : i + MAX_PARAGRAPH_DISTANCE]) for i in range(len(paragraph_lengths))]  # total length of n consecutive paragraphs (a "paragraph block"); e.g. for lengths [0, 1, 2, 3, 4] and n = 3 the block lengths are [3, 6, 9, 7, 4]

        self._content_center_pos = content_start_pos = content_end_pos = paragraph_block_lengths.index(max(paragraph_block_lengths))  # the article start/end default to the densest paragraph block
        min_paragraph_block_length = MIN_PARAGRAPH_LENGHT * MAX_PARAGRAPH_DISTANCE
        # While the block length exceeds the minimum and the index is in range, we are still inside the main text; keep moving the start index upward
        while content_start_pos > 0 and paragraph_block_lengths[content_start_pos] > min_paragraph_block_length:
            content_start_pos -= 1

        # While the block length exceeds the minimum and the index is in range, we are still inside the main text; keep moving the end index downward
        while content_end_pos < len(paragraph_block_lengths) and paragraph_block_lengths[content_end_pos] > min_paragraph_block_length:
            content_end_pos += 1

        # Strip redundant newlines and whitespace
        content = paragraphs[content_start_pos : content_end_pos]
        content = '\n'.join(content)
        content = self.__del_unnecessary_character(content)

        # Ratio of text inside <p> tags to all extracted text; above the threshold it counts as main text
        paragraphs_text_len = len(self.__del_html_tag(''.join(tools.get_info(content, '<p.*?>(.*?)</p>'))))
        content_text_len = len(self.__del_html_tag(content))
        if content_text_len and content_text_len > MIN_COUNTENT_WORDS and ((paragraphs_text_len / content_text_len) > MIN_PARAGRAPH_AND_CONTENT_PROPORTION):
            self._content_start_pos = content_start_pos
            self._content_end_pos = content_end_pos
            self._paragraphs = paragraphs
            # print(content_start_pos, content_end_pos, self._content_center_pos)
            return content
        else:
            return ''
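
(A standalone toy check of the windowing step above - the same computation as paragraph_block_lengths, with an assumed MAX_PARAGRAPH_DISTANCE of 3:)

paragraph_lengths = [0, 1, 2, 3, 4]
blocks = [sum(paragraph_lengths[i:i + 3]) for i in range(len(paragraph_lengths))]
print(blocks)                     # [3, 6, 9, 7, 4]
print(blocks.index(max(blocks)))  # 2 -> the densest block starts at paragraph 2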
Example No. 23
def spider_picture(p_url, end):
    for i in range(1,11):
        i = str(i)
        url = p_url+i+end
        html, r = tools.get_html_by_requests(url)
        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)
        j=0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j=j+1
            #print(img_url,'---',name,'****',j)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
Example No. 24
 def inner_add_url(url, remark):
     html = tools.get_html_by_urllib(url)
     regex = '<li><span></span><a  href="(.*?)">.*?</a></li>'
     infos = tools.get_info(html, regex)
     for info in infos:
         info = ''.join(info)
         type_url = 'http://shouji.baidu.com' + info
         type_html = tools.get_html_by_urllib(type_url)
         page_count = '<div class="pager">.*">(.*?)</a>.*?<li class="next">'
         page_count = tools.get_info(type_html, page_count)
         page_count = ''.join(page_count)
         if not page_count:
             page_count = '1'
         page_count = int(page_count)
         for page in range(1, page_count + 1):
             url = type_url + 'list_%d.html' % page
             if not base_parser.add_url(
                     'GameApp_urls', SITE_ID, url, remark=remark):
                 base_parser.update_url('GameApp_urls', url, Constance.TODO)
Example No. 25
 def add_root_url(url, start, end):
     html, r = tools.get_html_by_requests(url)
     page_regex = '<div class="ssPages area">.*>(\d*?)</a>.*?<a title="下一页"'
     pages = tools.get_info(html, page_regex)
     pages = pages and pages[0] or ''
     if pages:
         pages = int(pages)
         for page in range(1, pages+1):
             url = start+str(page)+end
             base_parser.add_url('PROGRAM_urls', SITE_ID, url)
Example No. 26
 def add_root_urls(url):
     html, r = tools.get_html_by_requests(url)
     # print(html)
     regex = '<div class="site-piclist_pic">(.*?)</li>'
     html_infos = tools.get_info(html, regex)
     s = 0
     for info in html_infos:
         regex = 'href = "(.*?)" class="site-piclist_pic_link"'
         url = tools.get_info(info, regex)
         url = url and url[0] or ''
         regex = 'rseat="bigTitle.*?title="(.*?)"'
         name = tools.get_info(info, regex)
         name = name and name[0] or ''
         name = tools.del_html_tag(name)
         video_download_url = get_download_url(url)
         FILE_LOCAL_PATH = 'd:'
         sto_path = '/videos/' + name + '.mp4'
         tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
         print(video_download_url, name)
Example No. 27
    def copy_file(self):
        unpack_file_root_path = tools.get_next_path(self._unpack_path)
        file_list = tools.walk_file(self._unpack_path)
        for file in file_list:
            if tools.get_info(file, self._sync_files) and not tools.get_info(file, self._ignore_files):
                file_relative_path = file.replace(unpack_file_root_path, '')
                move_to_path = self._project_path + file_relative_path

                is_success = tools.copy_file(file, move_to_path)
                log.debug('''
                        复制文件 %s
                        至       %s
                        是否成功 %s
                        '''%(file, move_to_path, is_success))
                if not is_success:
                    log.error('同步失败:{project_name} ({per_tag} -> {current_tag})'.format(project_name = self._project_name, per_tag = self.__get_per_tag(), current_tag = self._tag))
                    break
        else:
            log.info('同步成功:{project_name} ({per_tag} -> {current_tag})'.format(project_name = self._project_name, per_tag = self.__get_per_tag(), current_tag = self._tag))
            self.__record_current_tag(self._tag)
Example No. 28
 def inner_add_url(base_url, url, remark):
     html = tools.get_html_by_urllib(base_url)
     regex = 'pg.pageCount = (.*?);'
     page_count = tools.get_info(html, regex, allow_repeat=True)
     page_count = ''.join(page_count)
     page_count = round(float(page_count))
     page_count = int(page_count)
     for i in range(0, page_count+1):
         # Build each page url from the template; reassigning `url` itself would
         # destroy the format placeholder after the first iteration.
         page_url = url % i
         if not base_parser.add_url('GameApp_urls', SITE_ID, page_url, remark=remark):
             base_parser.update_url('GameApp_urls', page_url, Constance.TODO)
Example No. 29
 def inner_add_url(url):
     html = tools.get_html_by_urllib(url)
     regex = '<input type="hidden" class="total-page" value="(\d*?)" />'
     pages = tools.get_info(html, regex)
     #print(pages)
     pages = pages[0]
     if pages:
         pages = int(pages)
         for i in range(1, pages + 1):
             new_url = url + '%d' % i
             base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
Example No. 30
        def get_release_time_in_paragraph(paragraph_pos):
            if self._paragraphs:
                while paragraph_pos >= 0:
                    content = self.__replace_str(self._paragraphs[paragraph_pos], '<(.|\n)*?>', '<>')
                    release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one = True)
                    if release_time:
                        return tools.format_date(release_time)

                    paragraph_pos -= 1

            return None