    def __init__(self, url, html=None, language='zh'):
        self._html = html
        self._url = url

        if not html:
            self._html = tools.get_html(url)

        self._text = self.__del_html_tag(self._html, save_useful_tag=True)
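
All of these examples lean on a tools.get_html helper that fetches a page and returns its markup, or a falsy value on failure. A minimal requests-based sketch consistent with that contract (the timeout and encoding handling are assumptions, not the original library's API):

import requests

def get_html(url, timeout=10):
    # Hypothetical stand-in for tools.get_html: fetch a page and
    # return its text, or None on any network error (the callers
    # above all guard with `if not html`).
    try:
        response = requests.get(url, timeout=timeout)
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return None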
Example #2
    def __init__(self, url, html=None, language='zh'):
        self._html = html
        self._url = url

        self._content_start_pos = ''
        self._content_end_pos = ''
        self._content_center_pos = ''
        self._paragraphs = ''

        if not html:
            self._html = tools.get_html(url)

        self._text = self.__del_html_tag(self._html, save_useful_tag=True)
Example #3
def delete_site(request):
    site_id = request.POST.get("site_id", None)
    if site_id:
        site = Site.objects.filter(pk=int(site_id)).first()
        if site:
            site.delete()
            msg = 'Deleted successfully'
            html = tools.get_html(html_path="app/msg.html",
                                  data={
                                      'msg': msg,
                                      'msg_type': 'success'
                                  })
            return jsonSuccess(msg=msg, data={"html": html})

    return jsonFailed(code=401, msg="没有找到该站点!该站点可能已经被删除")
Example #4
def parser(url_info):
    root_url, depth, remark, website_name, website_position, website_url, website_domain, spider_depth = parser_url_info(
        url_info)
    html = tools.get_html(root_url)
    if not html:
        log.debug('Failed to fetch url')
        # base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # Extract further pending URLs to crawl
    add_html_url(html, depth, spider_depth, website_url, website_name,
                 website_domain, remark)

    # Parse the page
    parser_article(root_url, html, website_name, website_domain,
                   website_position)
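
parser_url_info is not shown, but Example #8 below unpacks the same url_info dict inline, which suggests a helper along these lines (the spider_depth key is an assumption; everything else mirrors Example #8):

def parser_url_info(url_info):
    # Hypothetical helper consistent with the tuple unpacked above.
    remark = url_info['remark']
    return (url_info['url'], url_info['depth'], remark,
            remark['website_name'], remark['website_position'],
            remark['website_url'], remark['website_domain'],
            remark.get('spider_depth'))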
Example #5
    def __init__(self, url, html=None, language='zh'):
        self._html = html
        self._url = url
        self._content_start_pos = ''
        self._content_end_pos = ''
        self._content_center_pos = ''
        self._paragraphs = ''
        if not html:
            self._html = tools.get_html(url)
        self._text = self.__del_html_tag(self._html)

        # Whitespace normalizer plus heuristic thresholds used by the
        # content-extraction pass.
        self.stripper = re.compile(r'\s+')
        self.anchor_ratio_limit = 0.3
        self.impurity_threshold = 30

        self.doc = lxml.html.fromstring(self._text)
        self.region = Region(self.doc)
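
The anchor_ratio_limit field points at a standard content-extraction heuristic: a DOM block whose anchor text makes up more than ~30% of its total text is likely navigation rather than article body. A self-contained illustration of that check (the function name is ours, not the original class's):

import lxml.html

def anchor_ratio(node):
    # Fraction of the node's text that sits inside <a> tags.
    total = len(node.text_content().strip())
    if total == 0:
        return 1.0
    anchor = sum(len(a.text_content()) for a in node.iter('a'))
    return anchor / total

doc = lxml.html.fromstring(
    '<div><a href="#">home</a> a long run of article body text</div>')
print(anchor_ratio(doc) < 0.3)  # True: treated as content, not navigation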
Example #6
def add_site(request):
    form = SiteForm(request.POST)
    if form.is_valid():
        site_id = request.POST.get("site_id")
        name = form.cleaned_data.get('name', None)
        href = form.cleaned_data.get('href', None)
        coding = form.cleaned_data.get('coding', None)
        restart = form.cleaned_data.get('restart', None)
        deploy = form.cleaned_data.get('deploy', None)
        update_cert = form.cleaned_data.get('update_cert', None)
        copy = form.cleaned_data.get('copy', None)

        need_verification = request.POST.get("custom-switch-checkbox") == 'on'
        overseas = request.POST.get("overseas-switch-checkbox") == 'on'

        if site_id:
            site = Site.objects.filter(id=int(site_id)).first()
            action = "change"
            site_tr_html = None
            if site:
                site.name = name
                site.href = href
                site.coding = coding
                site.need_verification = need_verification
                site.restart = restart
                site.deploy = deploy
                site.update_cert = update_cert
                site.copy = copy
                site.overseas = overseas
                site.save()

                msg = "修改成功"
                msg_type = "success"
                site_tr_html = tools.get_html('app/site-tr.html',
                                              {'site': site})
            else:
                msg = "修改失败,没有找到该站点,也许站点已经被删除"
                msg_type = "danger"
        else:
            site = Site.objects.create(name=name,
                                       href=href,
                                       coding=coding,
                                       need_verification=need_verification,
                                       copy=copy,
                                       deploy=deploy,
                                       restart=restart,
                                       update_cert=update_cert,
                                       overseas=overseas)
            msg = "添加成功"
            msg_type = "success"
            action = "create"
            site_tr_html = tools.get_html('app/site-tr.html', {'site': site})

        msg_html = tools.get_html('app/msg.html', {
            'msg': msg,
            'msg_type': msg_type
        })
        return jsonSuccess(msg="添加成功", data={"site_tr_html": site_tr_html,\
                            'msg_html': msg_html, 'action': action, 'site_id': site.id})

    else:
        errors = form.get_errors(data_type='list')
        msg = errors[0].get("message", None) if errors else 'Save failed'
        msg_html = tools.get_html('app/msg.html', {
            'msg': msg,
            'msg_type': 'danger'
        })
        return jsonFailed(code=401, msg=msg, data={'msg_html': msg_html})
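
add_site validates through a SiteForm with a get_errors helper that is not part of stock Django. A sketch of a form consistent with the cleaned_data lookups above (field types are guesses; get_errors is the non-standard part):

from django import forms

class SiteForm(forms.Form):
    # Fields inferred from add_site's cleaned_data reads.
    name = forms.CharField()
    href = forms.CharField()
    coding = forms.CharField(required=False)
    restart = forms.CharField(required=False)
    deploy = forms.CharField(required=False)
    update_cert = forms.CharField(required=False)
    copy = forms.CharField(required=False)

    def get_errors(self, data_type='list'):
        # Hypothetical helper: flatten the error dict into the
        # [{'message': ...}] shape the view indexes into.
        return [{'field': field, 'message': msgs[0]}
                for field, msgs in self.errors.items()]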
Example #7
    urls = [
        # 'http://e.gmw.cn/2017-12/04/content_26998661.htm',
        # 'http://www.sohu.com/a/208241102_570245',
        # 'http://cnews.chinadaily.com.cn/2017-12/06/content_35230092.htm',
        # 'http://news.eastday.com/eastday/13news/auto/news/society/20171206/u7ai7256226.html',
        # 'http://cj.sina.com.cn/article/detail/6185269244/510492',
        # 'http://0575gwy.com/index.php/Index/show/id/2130',
        # 'http://hdmedicine.com.cn/News_info.aspx?News_Id=787&CateId=24',
        # 'http://www.qz001.gov.cn/info/view/86ec076d71a44869ab71e00e5707f89e',
        # 'http://payh.gov.cn/Art/Art_2/Art_2_795.aspx',
        'http://qiushi.nbgxedu.com/show.aspx?id=d479b45a-1747-4f60-83f3-f1e2dc85a0d2'
    ]
    for url in urls:
        html = tools.get_html(url)

        article_extractor = ArticleExtractor(url, html)
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        content = article_extractor.get_content()
        print('---------------------------')
        print(url)
        print('title : ', title)
        print('release_time: ', release_time)
        print('author : ', author)
        print('content : ', content)
        print('---------------------------')

Example #8
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # Extract further pending URLs to crawl
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':  # 'Baidu News'
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls',
                                    SITE_ID,
                                    url,
                                    depth + 1,
                                    remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls',
                                    SITE_ID,
                                    url,
                                    depth + 1,
                                    remark=remark)

    # Parse the page
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        uuid = tools.get_uuid(
            title,
            website_domain) if title != website_name else tools.get_uuid(
                root_url, ' ')

        log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            website_name %s
            domain       %s
            position     %s
            url          %s
            content      %s
            ''' % (uuid, title, author, release_time, website_name,
                   website_domain, website_position, root_url, content))

        if tools.is_have_chinese(content):
            # Save to database
            self_base_parser.add_news_acticle(uuid, title, author,
                                              release_time, website_name,
                                              website_domain, website_position,
                                              root_url, content)

    log.debug('%s done' % root_url)
    base_parser.update_url('news_urls', root_url, Constance.DONE)
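
Example #8 deduplicates articles via tools.get_uuid and filters via tools.is_have_chinese; neither is shown. Minimal sketches, under the assumption that the uuid is a stable hash of its two keys and the filter checks for CJK characters:

import hashlib
import re

def get_uuid(key1, key2):
    # Hypothetical: a deterministic id from two keys, so the same
    # (title, domain) pair always maps to the same record.
    return hashlib.md5((str(key1) + str(key2)).encode('utf-8')).hexdigest()

def is_have_chinese(text):
    # Hypothetical: True if the text contains at least one CJK character.
    return bool(re.search(r'[\u4e00-\u9fff]', text or ''))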
Example #9
def parser_video_info(root_url, depth, site_id, remark):
    program_id = remark.get('program_id')
    chan_name = remark.get('chan_name')
    program_name = remark.get('program_name')
    program_type = remark.get('program_type')
    is_need_update = not remark.get('image_url')

    html, r = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('mms_urls', root_url, Constance.EXCEPTION)
        return

    regex = '(<li class="list_item".*?</li>)'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="figure  figure-180236.*?href="(.*?)"'
        url = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<img width="140" height="187" alt="(.*?)"'
        name = tools.get_info(video_block, regex, fetch_one=True)

        if not url or program_name not in name:
            continue

        regex = '<em class="fs12 c999">&nbsp;(.*?)</em>'
        release_year = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<label class="result_info_lbl">.*?</label>[^<]*?<a data-searchpingback-elem="link.*?>(.*?)</a>'
        director = tools.get_info(video_block, regex, fetch_one=True)

        html = tools.get_html(url)

        # Program category
        regex = '<a href=.*?class="channelTag".*?>(.*?)</a>'
        video_type = tools.get_info(html, regex, fetch_one=True)

        # if program_type != '其他' and video_type and program_type != video_type:
        #     # print(video_type, name)
        #     continue

        regex = [
            '<div class="info-img">.*?<img src="(.*?)"',
            '<div class="result_pic pr" >.*?<img.*?src="(.*?)"'
        ]
        image_url = tools.get_info(html, regex, fetch_one=True)

        regex = '<em>导演:.*?"director">(.*?)</a>'
        director = director or tools.get_info(html, regex, fetch_one=True)

        regex = [
            '<p class="episodeIntro-time" itemprop="datePublished">.*?<span>(.*?)</span>',
            '<em class="ml50">年份:</em><a>(.*?)</a>',
            '<em>更新至.*?>(.*?)</a>'
        ]
        release_year = release_year or tools.get_info(
            html, regex, fetch_one=True)

        regex = '<em>类型.*?<a href.*?>(.*?)</a>'
        classify = tools.get_info(html, regex, fetch_one=True)

        regex = '<em>电视台:</em><span>(.*?)</span>'
        institution = tools.get_info(html, regex, fetch_one=True)

        # Synopsis
        regex = [
            'data-moreorless="moreinfo".*?<span class="briefIntroTxt">(.*?)</span>',
            '<span class="briefIntroTxt">(.*?)</span>',
            '<span class="showMoreText" data-moreorless="moreinfo".*?简介:</em>(.*?)</span>'
        ]
        description = tools.get_info(html, regex, fetch_one=True)

        # Cast
        regex = [
            '<div class="headImg-top">.*?<img title="(.*?)"',
            '<div class="headImg-top">.*?<img.*?alt="(.*?)"'
        ]
        actor = tools.get_info(html, regex, split=',')

        # Program id
        regex = 'data-score-tvid="(.*?)"'
        video_id = tools.get_info(html, regex, fetch_one=True)

        # Rating
        score_url = 'http://score-video.iqiyi.com/beaver-api/get_sns_score?qipu_ids={video_id}&appid=21&tvid={video_id}&pageNo=1'.format(
            video_id=video_id)
        score_html, r = tools.get_html_by_requests(score_url)
        regex = '"sns_score":(.*?)}'
        score = tools.get_info(score_html, regex, fetch_one=True)

        log.debug(
            '''
            url:        %s
            name:       %s
            id:         %s
            image:      %s
            director:   %s
            category:   %s
            type:       %s
            station:    %s
            year:       %s
            synopsis:   %s
            cast:       %s
            score:      %s
            ''' %
            (url, name, video_id, image_url, director, video_type, classify,
             institution, release_year, description, actor, score))

        if is_need_update:
            # Note: values are interpolated straight into the SQL text;
            # a parameterized query would be safer against quoting issues.
            sql = '''
                update tab_mms_program t set
                    t.image_url = '%s',
                    t.director = '%s',
                    t.description = '%s',
                    t.score = %s,
                    t.actor = '%s'
                where t.program_id = %d
            ''' % (image_url, director, description, score, actor, program_id)
            print(sql)
            db.update(sql)

        # Comment-section comments, e.g. http://www.iqiyi.com/a_19rrhcvhph.html
        parser_comment_article(html, video_id, program_id, url)
        # Plot discussion, e.g. http://www.iqiyi.com/a_19rrhebm2l.html
        parser_first_page_article(html, program_id, url)
        # Get wall_id, feed_id and sns_time for paging through comments
        regex = r"\['wallId'\] = \"(.*?)\""
        wall_id = tools.get_info(html, regex, fetch_one=True)
        regex = r"\['feedId'\] = (\d*?);"
        feed_id = tools.get_info(html, regex, fetch_one=True)
        regex = r"\['snsTime'\] = (\d*?);"
        sns_time = tools.get_info(html, regex, fetch_one=True)
        if wall_id:
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)
        break  # Found the target program; stop crawling comments for the rest

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
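
Nearly every field in Example #9 comes out of tools.get_info, which evidently accepts a single pattern or a list of fallback patterns, plus fetch_one and split flags. A sketch consistent with those call sites (the exact return-type details are assumptions):

import re

def get_info(html, regex, fetch_one=False, split=None):
    # Hypothetical stand-in for tools.get_info: try each pattern in
    # order; fetch_one returns the first capture (or ''), split joins
    # all captures with the separator, otherwise return the match list.
    if not html:
        return '' if fetch_one else []
    patterns = regex if isinstance(regex, list) else [regex]
    for pattern in patterns:
        matches = re.findall(pattern, html, re.S)
        if matches:
            if fetch_one:
                return matches[0]
            if split is not None:
                return split.join(matches)
            return matches
    return '' if fetch_one else []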