def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="video_area">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="article_edit">'
              ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             '''%(depth+1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
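
# Hedged sketch (not part of the original project): every snippet in this
# collection leans on a tools.get_info(html, regexs, ...) helper whose real
# implementation is not shown. The function below is only a guess at a
# minimal equivalent -- a thin wrapper around re.findall that accepts a
# single pattern or a list of patterns and can return just the first match.
import re


def get_info_sketch(html, regexs, fetch_one=False):
    if isinstance(regexs, str):
        regexs = [regexs]
    results = []
    for regex in regexs:
        results.extend(re.findall(regex, html or '', re.S))
    if fetch_one:
        return results[0] if results else ''
    return results
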
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html, STOP_URLS)

    urls = tools.fit_url(urls, "cctv.com")
    for url in urls:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example #3
def add_root_url(keywords):
    log.debug('''
        添加根url
        parser_params : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        for page_index in range(1, 10):
            url = 'http://so.video.sina.com.cn/interface/s?from=video&wd=%s&s_id=w00001&p=%s&n=20&s=1' \
                  % (keyword, page_index)
            info_json = tools.get_json_by_requests(url)
            video_info_list = info_json['list']
            if not video_info_list:
                print(url)
                break
            for video_info in video_info_list:
                image_url = video_info['thumburl']
                title = tools.del_html_tag(video_info['videoname'])
                url = video_info['url']
                release_time = video_info['showtime']

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
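
# Hedged reference (assumption, not the documented API): the exact shape of the
# Sina video-search JSON is not shown anywhere in this snippet. The dict below
# is reconstructed purely from the keys the code above reads ('list',
# 'thumburl', 'videoname', 'url', 'showtime'), with made-up sample values.
sample_info_json = {
    'list': [
        {
            'thumburl': 'http://example.com/thumb.jpg',
            'videoname': 'sample video <b>title</b>',
            'url': 'http://video.sina.com.cn/view/sample.html',
            'showtime': '2018-01-01',
        },
    ]
}
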
Example #4
    def get_title(self):
        title = ''

        # Handle special sites with irregular titles
        for domain, regex in SPECIAL_TITLE.items():
            if domain in self._url:
                title = tools.get_info(self._html, regex, fetch_one=True)
                break

        if not title:
            regex = '(?i)<title.*?>(.*?)</title>'
            title = tools.get_info(self._html, regex, fetch_one=True)
            title = title[:title.find('_')] if '_' in title else title
            title = title[:title.find('-')] if '-' in title else title
            title = title[:title.find('|')] if '|' in title else title

            if not title:
                regexs = [
                    '<h1.*?>(.*?)</h1>', '<h2.*?>(.*?)</h2>',
                    '<h3.*?>(.*?)</h3>', '<h4.*?>(.*?)</h4>'
                ]
                title = tools.get_info(self._html, regexs, fetch_one=True)

        title = tools.del_html_tag(title)
        return title
Example #5
def get_content(mblog):
    try:
        content = tools.del_html_tag(mblog['text'])  # body text
    except:
        content = ''
    finally:
        return content
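
# Minimal usage example for get_content() above: 'text' is the only key it
# reads, and a missing key falls back to ''. The exact output depends on how
# tools.del_html_tag strips markup (assumed here to simply drop the tags).
print(get_content({'text': '<span>hello</span> world'}))  # e.g. 'hello world'
print(get_content({}))  # ''
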
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    title = '<tr height="25"><td><a href=".*?"  title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?)&nbsp']
    file_size = ['资料大小: </span>(.*?)&nbsp']
    download_count = ['下载次数: </span>(.*?)&nbsp']

    titles = tools.get_info(html, title, allow_repeat = True)
    video_urls = tools.get_info(html, video_url, allow_repeat = True)
    authors = tools.get_info(html, author, allow_repeat = True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat = True)
    file_sizes = tools.get_info(html, file_size, allow_repeat= True)
    download_counts = tools.get_info(html, download_count, allow_repeat = True)


    for i in range(len(titles)):
        title = titles[i]
        title = tools.del_html_tag(title)

        video_url = video_urls[i]
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_url)

        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            标题:    %s
            视频地址: %s
            作者:    %s
            观看数    %s
            资料大小  %s
            下载次数  %s
        '''%(title, video_url, author, watched_count, file_size, download_count))

        contained_key, contained_key_count = base_parser.get_contained_key(title, '',
                                                            remark['search_keyword1'],
                                                            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title, file_size = file_size,
                                     file_name = title, author = author, watched_count = watched_count,
                                     download_count = download_count, search_type = search_type,
                                     keyword = contained_key, keyword_count = contained_key_count, task_id = remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
Example #7
def spider_picture(p_url, end):
    for i in range(1,11):
        i = str(i)
        url = p_url+i+end
        html, r = tools.get_html_by_requests(url)
        regex = 'title=".*?".*?src = "(.*?)".*?<div class="wrapper-listTitle">'
        img_urls = tools.get_info(html, regex)
        regex_name = 'rseat="dsjp7".*?title="(.*?)".*?src = ".*?"'
        names = tools.get_info(html, regex_name)
        j=0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j=j+1
            #print(img_url,'---',name,'****',j)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/picture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
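
# Hedged sketch of what the tools.download_file(url, FILE_LOCAL_PATH, sto_path)
# call above might amount to; the project's real helper is not shown. It is
# assumed the final path is simply local_path + sto_path, as the callers imply.
import os

import requests


def download_file_sketch(url, local_path, sto_path):
    full_path = local_path + sto_path
    os.makedirs(os.path.dirname(full_path), exist_ok=True)
    response = requests.get(url, stream=True, timeout=30)
    with open(full_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            if chunk:
                f.write(chunk)
    return full_path
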
Example #8
def add_root_urls(url):
    html, r = tools.get_html_by_requests(url)
    # print(html)
    regex = '<div class="site-piclist_pic">(.*?)</li>'
    html_infos = tools.get_info(html, regex)
    s = 0
    for info in html_infos:
        regex = 'href = "(.*?)" class="site-piclist_pic_link"'
        url = tools.get_info(info, regex)
        url = url and url[0] or ''
        regex = 'rseat="bigTitle.*?title="(.*?)"'
        name = tools.get_info(info, regex)
        name = name and name[0] or ''
        name = tools.del_html_tag(name)
        video_download_url = get_download_url(url)
        FILE_LOCAL_PATH = 'd:'
        sto_path = '/videos/' + name + '.mp4'
        tools.download_file(video_download_url, FILE_LOCAL_PATH, sto_path)
        print(video_download_url, name)
def spider_picture(p_url, end):
    for i in range(1, 7):
        i = str(i)
        url = p_url + i + end
        #print(url)
        html, r = tools.get_html_by_requests(url)
        #print(html)
        regex = '<a class="figure.*?<img.*?src="(.*?)"/>'
        img_urls = tools.get_info(html, regex)

        regex_name = 'data-widget-searchlist-tvname="(.*?)"'
        names = tools.get_info(html, regex_name)
        j = 0
        for img_url in img_urls:
            name = names[j]
            name = tools.del_html_tag(name)
            j = j + 1
            # if not re.match(".jpg", img_url):
            #     img_url = img_url+'.jpg'
            #print(img_url,'---',name,'****',j)
            FILE_LOCAL_PATH = 'd:'
            sto_path = '/ViolatePicture/' + name + '.jpg'
            tools.download_file(img_url, FILE_LOCAL_PATH, sto_path)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'h3', {'class': 't'})

    for i in range(0, len(headers)):
        title = tools.get_text(headers[i])
        title = tools.del_html_tag(title)
        if tools.re.compile('的相关视频在线观看_百度视频').findall(title):
            continue

        try:
            ssurl = headers[i].a["href"]
        except:
            continue
        r = tools.requests.head(ssurl)
        url = r.headers['Location']

        try:
            img = headers[i].next_sibling()[0].img['src']
        except:
            img = ''

        try:
            # The release date sits in one of the first few sibling nodes;
            # take the first one that contains a YYYY年M月D日 date.
            release_time = ''
            for sibling in headers[i].next_sibling()[:4]:
                release_time = ''.join(
                    tools.re.compile('\d\d\d\d年\d+?月\d+?日').findall(
                        str(sibling)))
                if release_time:
                    break
            release_time = release_time.replace('年', '-').replace(
                '月', '-').replace('日', '')
        except:
            release_time = ''

        content = ''
        for sibling in headers[i].next_sibling():
            abstract = tools.get_tag(sibling,
                                     'div', {'class': 'c-abstract'},
                                     find_all=False)
            if abstract:
                content = tools.get_text(abstract)
                break

        log.debug('''
            标题:   %s
            内容:   %s
            原文url:%s
            图片url:%s
            日期:   %s
               ''' % (title, content, url, img, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        is_video1 = base_parser.is_have_video_by_site(url)
        if not is_video1:
            is_video2 = base_parser.is_have_video_by_judge(title, content)
            if is_video2:
                html2, r = tools.get_html_by_requests(url)
                is_video3 = base_parser.is_have_video_by_common(html2)
                if not is_video3:
                    continue
            else:
                continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url=url,
                                     title=title,
                                     content=content,
                                     image_url=img,
                                     release_time=release_time,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
Example #11
    def cluster_week_hot(self,
                         day_hot,
                         hot_value=None,
                         article_count=None,
                         vip_count=None,
                         negative_emotion_count=None,
                         weight=None):
        '''
        @summary: cluster a daily hot topic into the weekly hot topics
        ---------
        @param day_hot: daily hot-topic info
        @param hot_value: hot value of a single item (when not empty, this daily hot topic is an update that the weekly hot topic has already absorbed, so only this item's hot value should be added)
        @param article_count:
        @param vip_count:
        @param negative_emotion_count:
        @param weight:
        ---------
        @result:
        '''

        article_text = day_hot.get("TITLE")  # + hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_week_hots(article_text, release_time)

        # Find the most similar existing hot topic
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            break  # hots are sorted by match score, so the first one is the most similar; no need to compare further

        if similar_hot:  # found a similar hot topic
            if similar_hot["ID"] != day_hot["ID"]:  # avoid comparing the same item with itself
                data = {}

                # Update the topic's hot value and article count
                data['HOT'] = similar_hot['HOT'] + (hot_value
                                                    or day_hot.get('HOT'))
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + (
                    article_count or day_hot.get('ARTICLE_COUNT'))

                # Update the mainstream-media count and the negative-sentiment count
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + (
                    vip_count or day_hot.get('VIP_COUNT'))
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + (
                        negative_emotion_count
                        or day_hot.get('NEGATIVE_EMOTION_COUNT'))

                # Update the relevance
                # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

                # Update hot_day_ids
                if not hot_value:
                    data["HOT_DAY_IDS"] = similar_hot[
                        'HOT_DAY_IDS'] + ',' + day_hot['ID']

                # Update the hot topic
                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            # Return the hot topic id
            return similar_hot.get("ID")
        else:
            # Add this item as a new hot topic
            hot_info = deepcopy(day_hot)

            # Handle event types
            del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
            text = hot_info['TITLE'] + del_tag_content
            contain_event_ids = self._event_filter.find_contain_event(text)
            hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

            hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

            self._es.add('tab_iopm_hot_week_info',
                         hot_info,
                         data_id=hot_info['ID'])

            # Return the hot topic id
            return hot_info['ID']
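
# Hedged stand-in for the compare_text() helper used above; the project's real
# similarity function is not shown. difflib's ratio illustrates the assumed
# contract: two strings in, a 0-1 similarity score out, compared against
# MIN_SIMILARITY by the caller.
from difflib import SequenceMatcher


def compare_text_sketch(text1, text2):
    return SequenceMatcher(None, text1 or '', text2 or '').ratio()
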
Example #12
    def __get_account_blocks(self, account_id='', account=''):
        keyword = account_id or account  # the account id takes priority

        log.debug('search keywords ' + keyword)

        cookie = self._sogou_cookies_manager.get_cookie()

        headers = {
            "Upgrade-Insecure-Requests": "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Cookie": cookie[1] if cookie else
            "ABTEST=5|1518054397|v1; SNUID=EAEB52552E2B4B87BB3903692F2AC2DE; IPLOC=CN1100; SUID=C5C47C7B6E2F940A000000005A7BABFD; JSESSIONID=aaa2WHQuoILPuc70EEQfw; SUID=C5C47C7B2313940A000000005A7BABFE; SUV=00BC2C447B7CC4C55A7BABFE845F5410",
            "Host": "weixin.sogou.com"
        }

        proxies = ip_proxies.get_proxies()
        headers["User-Agent"] = ip_proxies.get_user_agent()

        url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=' % (
            keyword)
        html, request = tools.get_html_by_requests(
            url, headers=headers)  #, proxies = proxies)

        # Official-account info blocks
        regex = '<!-- a -->(.*?)<!-- z -->'
        account_blocks = tools.get_info(html, regex)

        regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
        check_info = tools.get_info(html, regex, fetch_one=True)
        if check_info:
            log.debug('''取公众号列表 : %s
                         url : %s
                      ''' % (check_info, url))

            self._sogou_cookies_manager.set_cookie_un_available(cookie)
            self._sogou_cookies_manager.monitor_cookies()

            # return constance.VERIFICATION_CODE
        else:
            self._sogou_cookies_manager.set_cookie_available(cookie)

        for account_block in account_blocks:
            regex = '<a.*?account_name.*?>(.*?)</a>'
            account = tools.get_info(account_block, regex, fetch_one=True)
            account = tools.del_html_tag(account)

            regex = '<label name="em_weixinhao">(.*?)</label>'
            account_id = tools.get_info(account_block, regex, fetch_one=True)

            if account.lower() == keyword.lower() or account_id.lower(
            ) == keyword.lower():
                return account_block
        else:
            return ''
def add_WWA_search_app_info(table,
                            site_id,
                            url,
                            title='',
                            summary='',
                            update_info='',
                            score='',
                            author='',
                            app_url='',
                            image_url='',
                            software_size='',
                            tag='',
                            platform='',
                            download_count='',
                            release_time='',
                            language='',
                            sensitive_id='',
                            read_status=0):
    '''
    @summary:
    ---------
    @param title: title
    @param site_id: site id
    @param summary: summary
    @param update_info: update info
    @param score: score
    @param author: author
    @param url: source url
    @param app_url: app download url
    @param image_url: image url (multiple urls separated by commas)
    @param classify_id: category
    @param software_size: size
    @param tag: version
    @param platform: platform (ios / android)
    @param download_count: download count
    @param release_time: release time
    @param record_time: record time
    @param sensitive_id: varchar ||| sensitive info id (multiple ids separated by commas)
    @param read_status: read status (0 = unread, 1 = read)
    ---------
    @result:
    '''

    # Filter out apps that do not match the search keywords
    from db.oracledb import OracleDB
    oracle_db = OracleDB()

    sql = 'select keyword from TAB_MVMS_SEARCH_INFO t where search_type = 703'
    results = oracle_db.find(sql)  #[('天天快报,今日头条,黑龙江',)]

    is_usefull = False

    text_content = title + summary + update_info + author
    for result in results:
        keywords = result[0]
        keywords = keywords.split(',')
        for keyword in keywords:
            if keyword in text_content:
                is_usefull = True
                break
        if is_usefull:
            break

    if not is_usefull:
        return

    if language == '中文':
        language = 601
    elif language == '英文':
        language = 602
    else:
        language = 603

    title = tools.del_html_tag(title)

    gameApp_info_dict = {
        'site_id': site_id,
        'url': url,
        'summary': tools.del_html_tag(summary, except_line_break=True),
        'title': title,
        'update_info': tools.del_html_tag(update_info, except_line_break=True),
        'score': score,
        'author': author,
        'app_url': app_url,
        'image_url': image_url,
        'software_size': software_size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'record_time': tools.get_current_date(),
        'language': language,
        'sensitive_id': sensitive_id,
        'read_status': 0,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }
    db.add(table, gameApp_info_dict)
    base_parser.update_url('op_urls', url, Constance.DONE)

    # # Parse the page
    # html, request = tools.get_html_by_requests(root_url)
    # if not html:
    #     base_parser.update_url('urls', root_url, Constance.EXCEPTION)


if __name__ == '__main__':
    depth = 1
    url = 'http://www.lzzjw.com/List.asp?ID=13781'
    html, request = tools.get_html_by_requests(url, code='gb2312')
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = '<span>点击数.*?src="(.*?)"></script>'
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = ['<div class="main_title">(.*?)<div class="top_about">', '<h1>(.*?)</h1>', '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>','<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        release_time = tools.format_date(release_time)


    # Author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    # Article source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = ['浏览:<font id="hits">(\d*?)</font>次', '点击数:(\d*?)&#xA;发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<p style="text-align: center;">(.*?)</table>', '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id,url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)
    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
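
# Side note (hedged sketch, not the project's tools.get_html_by_requests): the
# two-pass fetch above downloads with gb2312 first, sniffs charset= out of the
# markup, then refetches with the detected encoding. With plain requests a
# similar effect can be approximated via apparent_encoding.
import requests


def fetch_html_with_detected_charset(url):
    response = requests.get(url, timeout=30)
    response.encoding = response.apparent_encoding  # charset guessed from the body
    return response.text
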
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    regex = '<div class="app-info clearfix">(.*?)</li>'
    infos = tools.get_info(html, regex)
    for info in infos:
        title = 'target="_blank" class="name ofh">(.*?)</a>'
        title = tools.get_info(info, title, allow_repeat=True)
        title = ''.join(title)

        url = '<a href="(.*?)" target="_blank" class="app-info-icon">'
        url = tools.get_info(info, url)
        url = ''.join(url)
        url = 'http://sj.qq.com/myapp/' + url

        app_html = tools.get_html_by_urllib(url)
        del_app_html_tag = tools.del_html_tag(app_html)

        app_info = '<div class="det-app-data-info">(.*?)</div>'
        app_info = tools.get_info(app_html, app_info, allow_repeat=True)
        summary = app_info[0]

        update_info = app_info[1] if len(app_info) > 1 else ''

        score = '<div class="com-blue-star-num">(.*?)分</div>'
        score = tools.get_info(app_html, score)
        score = float(''.join(score))

        author = '开发商:(.*?)查看权限需'
        author = tools.get_info(del_app_html_tag, author)
        author = ''.join(author)

        app_url = 'ex_url="(.*?)"'
        app_url = tools.get_info(info, app_url)
        app_url = ''.join(app_url)

        image_url = '<img data-original="(.*?)" .*?>'
        image_url = tools.get_info(info, image_url)
        image_urls = '<img data-src=\'(.*?)\' src="" id=\'.*?\'/>'
        image_urls = tools.get_info(app_html, image_urls)
        image_url = ','.join(image_url + image_urls)

        classify_id = remark

        software_size = '<span class="size">(.*?)</span>'
        software_size = tools.get_info(info, software_size)
        software_size = ''.join(software_size)

        tag = '版本号:(.*?)更新时间'
        tag = tools.get_info(del_app_html_tag, tag)
        tag = ''.join(tag)

        platform = 'android'

        download_count = '<div class="det-ins-num">(.*?)下载</div>'
        download_count = tools.get_info(app_html, download_count)
        download_count = ''.join(download_count)

        release_time = '<div class="det-othinfo-data" id="J_ApkPublishTime" data-apkPublishTime="(\d*?)"></div>'
        release_time = tools.get_info(app_html, release_time)
        release_time = int(''.join(release_time))
        x = time.localtime(release_time)
        release_time = time.strftime("%Y-%m-%d", x)

        language = ''

        log.debug('''
                   标题:            %s
                   原文url:         %s
                   简介:            %s
                   更新:            %s
                   评分:            %.1f
                   作者:            %s
                   app下载的url:    %s
                   图片url:         %s
                   分类:            %s
                   大小:            %s
                   版本:            %s
                   平台:            %s
                   下载次数:        %s
                   发布时间:        %s
                   语言             %s
                   ''' % (title, url, summary, update_info, score, author,
                          app_url, image_url, classify_id, software_size, tag,
                          platform, download_count, release_time, language))

        base_parser.add_game_app_info('GameApp_content_info',
                                      site_id,
                                      url,
                                      title=title,
                                      summary=summary,
                                      update_info=update_info,
                                      score=score,
                                      author=author,
                                      app_url=app_url,
                                      image_url=image_url,
                                      classify_id=classify_id,
                                      software_size=software_size,
                                      tag=tag,
                                      platform=platform,
                                      download_count=download_count,
                                      release_time=release_time,
                                      language=language,
                                      sensitive_id='')

    base_parser.update_url('GameApp_urls', root_url, Constance.DONE)
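
# Quick illustration of the release-time conversion used above: the
# data-apkPublishTime attribute is assumed to hold a Unix timestamp in seconds,
# which time.localtime/strftime turn into a YYYY-MM-DD date (the exact date
# depends on the local timezone).
import time

print(time.strftime('%Y-%m-%d', time.localtime(1514764800)))
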
Example #17
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', website_id, url, depth + 1)


    # Extract the article info from the current page
    # Title

    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '发表时间:(.*?)&nbsp;'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # Author
    regexs = '编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # Article source
    regexs = '来源:(.*?)&nbsp'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = '评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<td height="2" class="graphic10">(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                author              = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.sccc.edu.cn/new' + url
        else:
            new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)
    regexs = '<script type="text/javascript" language="JavaScript" src="(.*?)"'
    urls = tools.get_info(html, regexs)
    for url in urls:
        new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info from the current page
    # Title

    regexs = 'td height="60" align="center" valign="bottom" class="nrbt">(.*?)</td>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = '<td height="3" align="center" valign="top">(.*?)</td>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[1] or ''

    # Author
    regexs = '<td width="250">(.*?)</td>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    #author = tools.del_html_tag(author)

    # Article source
    regexs = '<td width="300">(.*?)</td>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # Click count
    regexs = ' <td>阅读(\d*?)次</td>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<td class="nr">(.*?)</td>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                    depth               = %s
                    url                 = %s
                    title               = %s
                    release_time        = %s
                    author              = %s
                    origin              = %s
                    watched_count       = %s
                    content             = %s
                 ''' % (depth + 1, source_url, title, release_time, author,
                        origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                source_url,
                                title=title,
                                release_time=release_time,
                                author=author,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)
    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url, headers=HEADER)
    titles = tools.get_tag(
        html, 'div', {'id': tools.re.compile('id_cse_content_item_mid_.')})

    for i in range(0, len(titles)):
        try:
            url = tools.get_tag(titles[i].previous_sibling.previous_sibling,
                                'a',
                                find_all=False)
            url = url['href']

            html2 = tools.get_html_by_urllib(url)
            regexs = ['<title>(.+?)</title>']
            mark = ''.join(tools.get_info(html2, regexs))
            regexs = ['不存在', '取消']
            if not tools.get_info(mark, regexs):
                title = tools.get_text(
                    titles[i].previous_sibling.previous_sibling)
                title = tools.del_html_tag(title)
                info = tools.get_text(titles[i])

                file_name = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件名:(.+?)文')))

                file_size = tools.del_html_tag(''.join(
                    tools.get_info(info, '文件大小:(.+?)分')))

                author = tools.del_html_tag(''.join(
                    tools.get_info(info, '分享者:(.+?)时')))

                release_time = ''.join(tools.get_info(info,
                                                      '时间:(.+?)下')).replace(
                                                          '\n', '')

                download_count = tools.del_html_tag(''.join(
                    tools.get_info(info, '下载次数:(.+?)\.')))

                log.debug('''
                标题:    %s
                文件大小:%s
                文件名字:%s
                作者:    %s
                原文url: %s
                下载数量:%s
                日期:    %s
                   ''' % (title, file_size, file_name, author, url,
                          download_count, release_time))

                contained_key, contained_key_count = base_parser.get_contained_key(
                    title, '', remark['search_keyword1'],
                    remark['search_keyword2'], remark['search_keyword3'])
                if not contained_key:
                    continue

                base_parser.add_content_info('VA_content_info',
                                             SITE_ID,
                                             url,
                                             title,
                                             file_size=file_size,
                                             file_name=file_name,
                                             author=author,
                                             release_time=release_time,
                                             download_count=download_count,
                                             search_type=search_type,
                                             keyword=contained_key,
                                             keyword_count=contained_key_count,
                                             task_id=remark['task_id'])
        except:
            continue

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)
    regex = '<div class="app">(.*?)</li>'
    infos = tools.get_info(html, regex)
    for info in infos:
        try:
            image_url = '<img src="(.*?)" alt=".*?" />'
            image_url = tools.get_info(info, image_url)
            #image_url = ''.join(image_url)

            title = '<div class="top">(.*?)</a>'
            title = tools.get_info(info, title)
            title = ''.join(title)
            title = tools.del_html_tag(title)

            download_count = '<span class="download-num">(.*?)</span>次下载</em>'
            download_count = tools.get_info(info, download_count)
            download_count = ''.join(download_count)

            software_size = '<span class="size">(.*?)</span>'
            software_size = tools.get_info(info, software_size)
            software_size = ''.join(software_size)

            app_url = 'data_url="(.*?)"'
            app_url = tools.get_info(info, app_url)
            app_url = ''.join(app_url)

            url = '<a target="_blank" href="(.*?)">'
            url = tools.get_info(info, url)
            url = ''.join(url)
            url = 'http://shouji.baidu.com' + url
            app_html = tools.get_html_by_urllib(url)

            tag = '<span class="version">版本: (.*?)</span>'
            tag = tools.get_info(app_html, tag)
            tag = ''.join(tag)

            summary = [
                '<p class="content content_hover">(.*?)<span class="occupied">',
                '<p class="content">(.*?)</p>'
            ]
            summary = tools.get_info(app_html, summary)
            summary = ''.join(summary)

            image_urls = 'class="imagefix" src="(.*?)" alt=".*?-应用截图" /></li>'
            image_urls = tools.get_info(app_html, image_urls)
            image_url = ','.join(image_url + image_urls)

            score = '<span class="star-percent" style="width:(\d*?)%"></span>'
            score = tools.get_info(app_html, score)
            score = float(''.join(score))
            score = round(float(score / 20), 1)

            platform = 'android'

            language = '中文'

            log.debug('''
                       标题:            %s
                       原文url:         %s
                       简介:            %s
                       评分:            %.1f
                       app下载的url:    %s
                       图片url:         %s
                       大小:            %s
                       版本:            %s
                       平台:            %s
                       下载次数:         %s
                       语言:            %s
                       ''' %
                      (title, url, summary, score, app_url, image_url,
                       software_size, tag, platform, download_count, language))

            base_parser.add_WWA_search_app_info('WWA_search_app_content_info',
                                                site_id,
                                                url,
                                                title=title,
                                                summary=summary,
                                                score=score,
                                                app_url=app_url,
                                                image_url=image_url,
                                                software_size=software_size,
                                                tag=tag,
                                                platform=platform,
                                                download_count=download_count,
                                                language=language,
                                                sensitive_id='')

        except Exception as e:
            log.error(e)

    base_parser.update_url('WWA_search_app_urls', root_url, Constance.DONE)
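
# Quick check of the star-rating conversion above: the CSS width percentage is
# assumed to map 100% -> 5.0 stars, hence width / 20 rounded to one decimal.
print(round(float(85 / 20), 1))  # an 85%-wide star bar -> 4.2
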
Example #21
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue: break

        weibo_content_url = root_url + '&page=%d' % i

        headers = {
            "Cache-Control":
            "max-age=0",
            "Cookie":
            "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host":
            "m.weibo.cn",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Upgrade-Insecure-Requests":
            "1",
            "Connection":
            "keep-alive",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)

        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie":
                "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For a time precise to hour/minute/second, article_url itself would need to be fetched
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"',
                                                              '').replace(
                                                                  '\\n', '')
            if image_url:
                # Build the full large-image urls without reusing the outer loop variable i
                image_url = ','.join(
                    'http://wx2.sinaimg.cn/large/' + pic_id + '.jpg'
                    for pic_id in image_url.split(','))

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                原文地址:     %s
                博主ID:       %s
                文章id         %s
                发布时间:     %s
                来自:         %s
                内容:         %s
                图片地址:     %s
                视频地址:     %s
                评论数:       %s
                转发数:       %s
                点赞数:       %s
                ''' % (article_url, user_id, article_id, release_time,
                       come_from, content, image_url, video_url,
                       comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id,
                                            head_url,
                                            user_name,
                                            release_time,
                                            None,
                                            content,
                                            image_url,
                                            None,
                                            praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender,
                                            url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0,
                                            source='新浪微博'):

                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html, requests = tools.get_html_by_requests(root_url)
    titles = tools.get_tag(html, 'h3')
    video_infos = tools.get_tag(html, 'dt')
    for i in range(0, len(titles)):
        title = tools.get_text(titles[i])
        title = tools.del_html_tag(title)
        try:
            url = titles[i].a['href']
        except:
            continue
        url = 'http://www.bturls.net' + url

        release_time = video_infos[i].span
        release_time = tools.get_text(release_time)

        file_size = video_infos[i].span.next_sibling.next_sibling
        file_size = tools.get_text(file_size)

        watched_count = video_infos[
            i].span.next_sibling.next_sibling.next_sibling.next_sibling
        watched_count = tools.get_text(watched_count)

        regexs = ['t/(.+?)\.']
        magnet_link = 'magnet:?xt=urn:btih:' + ''.join(
            tools.get_info(url, regexs))

        log.debug(
            '''
            标题:    %s
            文件大小:%s
            原文url: %s
            观看数量:%s
            磁力链接:%s
            日期:    %s
               ''' %
            (title, file_size, url, watched_count, magnet_link, release_time))

        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'], remark['search_keyword2'],
            remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     file_size=file_size,
                                     release_time=release_time,
                                     watched_count=watched_count,
                                     magnet_link=magnet_link,
                                     search_type=search_type,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count,
                                     task_id=remark['task_id'])
    base_parser.update_url('VA_urls', root_url, Constance.DONE)
if __name__ == '__main__':
    # depth=1
    url = "http://scjyzsjy.ncss.org.cn/job/index"
    html, request = tools.get_html_by_requests(url, code='gb2312')

    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(url, code=code)
    print(code)

    regexs = '<div class="main_title">(.*?)<div class="top_about">'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '<small>时间:</small>(.*?)<small>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        if not release_time:
            regexs = '</a> 发布时间:(.*?) 点击数'
            release_time = tools.get_info(html, regexs)
            release_time = release_time and release_time[0] or ''
            release_time = tools.format_date(release_time)
    #
Example #24
    def deal_article(self, article_list):
        '''
        @summary: process articles
        ---------
        @param article_list:
        ---------
        @result:
        '''
        article_infos = []
        # Fill in the remaining fields
        for article_info in article_list:
            # Interaction count
            # print(tools.dumps_json(article_info))
            article_info['INTERACTION_COUNT'] = (
                article_info['UP_COUNT']
                or 0) + (article_info['TRANSMIT_COUNT']
                         or 0) + (article_info['REVIEW_COUNT']
                                  or 0) + (article_info['COMMENT_COUNT'] or 0)

            # Match against clue keywords
            del_tag_content = tools.del_html_tag(article_info['CONTENT'])
            text = article_info['TITLE'] + del_tag_content
            # print(text)
            keywords, clues_ids, zero_ids, first_id, second_ids, keyword_clues = self._compare_keywords.get_contained_keys(
                text)

            article_info['KEYWORDS'] = keywords
            article_info['CLUES_IDS'] = clues_ids
            article_info['ZERO_ID'] = zero_ids
            article_info['FIRST_ID'] = first_id
            article_info['SECOND_ID'] = second_ids
            article_info['KEYWORDS_COUNT'] = len(keyword_clues)
            article_info['KEYWORD_CLUES_ID'] = str(keyword_clues)

            # Records for the clue <-> article (public opinion) junction table
            article_clues_srcs = []
            if clues_ids:
                for clues_id in clues_ids.split(','):
                    article_clues_src = self.get_article_clues_src()
                    article_clues_src['ID'] = tools.get_uuid(
                        clues_id, article_info['ID'])
                    article_clues_src['CLUES_ID'] = clues_id
                    article_clues_src['ARTICLE_ID'] = article_info['ID']

                    article_clues_srcs.append(article_clues_src)

                # Batch-insert the junction records once, after the loop,
                # instead of re-submitting the growing list on every iteration
                self._es.add_batch(article_clues_srcs, "ID",
                                   'tab_iopm_article_clues_src')

            # Sentiment analysis (internal: 1 positive, 2 negative, 3 neutral; Baidu: 0 negative, 1 neutral, 2 positive)
            emotion = self._emotion.get_emotion(del_tag_content)
            if emotion == 0:
                emotion = 2

            elif emotion == 1:
                emotion = 3

            elif emotion == 2:
                emotion = 1

            else:
                emotion = 3

            article_info['EMOTION'] = emotion
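            # The if/elif chain above remaps Baidu's sentiment codes
            # (0 negative, 1 neutral, 2 positive) to the internal scheme
            # (1 positive, 2 negative, 3 neutral). The same mapping as a
            # lookup table (a sketch only, not part of the original code):
            #     BAIDU_TO_INTERNAL = {0: 2, 1: 3, 2: 1}
            #     emotion = BAIDU_TO_INTERNAL.get(emotion, 3)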

            # Mainstream (VIP) media
            is_vip = self._vip_checked.is_vip(
                article_info['URL']) or self._vip_checked.is_vip(
                    article_info['WEBSITE_NAME'])
            article_info["IS_VIP"] = is_vip

            # Compute the relevance weight via the IOPM service
            if article_info['CLUES_IDS']:
                url = IOPM_SERVICE_ADDRESS + 'related_sort'
                data = {
                    'article_id': article_info['ID'],        # article id
                    'clues_ids': article_info['CLUES_IDS'],  # clue ids
                    'may_invalid': 0,                        # possibly invalid (weibo text containing @ or #)
                    'vip_count': article_info['IS_VIP'],     # mainstream media count
                    'negative_emotion_count':
                        1 if article_info['EMOTION'] == 2 else 0,  # negative sentiment count
                    'zero_ids': article_info['ZERO_ID']
                }

                result = tools.get_json_by_requests(url, data=data)
                article_info['WEIGHT'] = result.get('weight', 0)
            else:
                article_info['WEIGHT'] = 0

            # Word cloud
            word_cloud = self._word_cloud.get_word_cloud(del_tag_content)
            article_info['WORD_CLOUD'] = tools.dumps_json(word_cloud)

            # Summary
            if not article_info['SUMMARY']:
                article_info['SUMMARY'] = self._summary.get_summary(
                    del_tag_content)

            # Cluster similar articles / hot topics
            if article_info['INFO_TYPE'] == 3:  # Weibo
                article_info['TITLE'] = article_info['SUMMARY'][:30]

            article_info['HOT_ID'] = self._hot_sync.get_hot_id(article_info)

            log.debug('''
                title         %s
                release_time  %s
                url           %s
                匹配的关键字:%s
                线索id        %s
                一级分类      %s
                二级分类      %s
                三级分类      %s
                关键词-线索   %s
                ''' % (article_info['TITLE'], article_info['RELEASE_TIME'],
                       article_info["URL"], keywords, clues_ids, zero_ids,
                       first_id, second_ids, keyword_clues))

            # print(tools.dumps_json(article_info))
            article_infos.append(article_info)

        # Batch-insert articles into ES
        print('article入库')
        # print(tools.dumps_json(article_infos))
        self._es.add_batch(article_infos, "ID", 'tab_iopm_article_info')
Example #25
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
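    # Normalize relative links to absolute URLs before queueing them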
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info on the current page
    # Title

    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # Source
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # View count
    regexs = '<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = [
        '<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>'
    ]
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #26
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return
    urls = tools.get_urls(html)

    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.xuyong.gov.cn'+url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info on the current page
    # Title

    regexs = '<td  class="titlestyle1037"  align="center">(.*?)</td></tr>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span  class="timestyle1037" >(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Author (commented out)
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # Source (commented out)
    # regexs = '采编: (.*?)阅读'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # View count (commented out)
    # regexs = '阅读:(\d*?)次'
    # watched_count = tools.get_info(html, regexs)
    # watched_count = watched_count and watched_count[0] or ''
    # watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<tr><td  class="contentstyle1037" >(.*?) <tr><td  class="pagestyle1037"  align="left">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                content             = %s
             ''' % (depth+1, source_url, title, release_time,  content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url, title=title,
                                release_time=release_time, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("&#xD;&#xA", url):
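            # Some hrefs on this site come back polluted with encoded CR/LF
            # entities; pull out the /GovPublicInfo... path and rebuild an
            # absolute URL from it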
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info on the current page
    # Title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # Source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # View count (rendered by a separate counter script; fetch it and parse out the number)
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth               = %s
                url                 = %s
                title               = %s
                release_time        = %s
                origin              = %s
                watched_count       = %s
                content             = %s
             ''' % (depth + 1, source_url, title, release_time, origin,
                    watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info',
                                website_id,
                                url=source_url,
                                title=title,
                                release_time=release_time,
                                origin=origin,
                                watched_count=watched_count,
                                content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
Example #28
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.DONE)
        return

    # Collect all URLs on the current page
    urls = tools.get_urls(html, STOP_URLS)

    # Filter out external links and add the rest to the database
    fit_url = tools.fit_url(urls, "ifeng.com")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info on the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)
    # Content
    regexs = [
        '<div id="main_content".*?>(.*?)</div>',
        '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
        '<div id="slide_no_insert_default"></div>(.*?)</div>'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''

    content = tools.del_html_tag(content)

    log.debug('''
                depth     = %d
                url       = %s
                title     = %s
                content   = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
Example #29
    def get_biz(self, account_id='', account=''):
        '''
        @summary: Get the official account's __biz parameter
        ---------
        @param account_id:
        @param account:
        ---------
        @result:
        '''
        account_block = self.__get_account_blocks(account_id, account)
        if account_block == constance.VERIFICATION_CODE:
            return constance.VERIFICATION_CODE

        keyword = account_id or account

        regex = '<a.*?account_name.*?>(.*?)</a>'
        account = tools.get_info(account_block, regex, fetch_one=True)
        account = tools.del_html_tag(account)

        regex = '<label name="em_weixinhao">(.*?)</label>'
        account_id = tools.get_info(account_block, regex, fetch_one=True)

        regex = '<a.*?account_name.*?href="(.*?)">'
        account_url = tools.get_info(account_block, regex, fetch_one=True)
        account_url = account_url.replace('&amp;', "&")
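        # The href taken from the Sogou result page is HTML-escaped;
        # unescape &amp; before requesting it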

        # Fetch __biz
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Host": "mp.weixin.qq.com",
            "Connection": "keep-alive",
            "Referer": "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_=" % keyword,
            "Cookie": account_url,
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            "Upgrade-Insecure-Requests": "1"
        }

        # proxies = ip_proxies.get_proxies()
        # headers["User-Agent"] = ip_proxies.get_user_agent()

        html, request = tools.get_html_by_requests(
            account_url)  #, proxies = proxies)
        regex = '<div class="weui_cells_tips">(.*?)</div>'
        check_info = tools.get_info(html, regex, fetch_one=True)
        if check_info:
            log.debug('''取公众号文章页 : %s
                         url : %s
                      ''' % (check_info, account_url))
            return ''

        regex = 'var biz = "(.*?)"'

        __biz = tools.get_info(html, regex, fetch_one=True)

        log.debug('''
            公众号名称          %s
            公众号账号          %s
            账号url             %s
            __biz               %s
            ''' % (account, account_id, account_url, __biz))

        return __biz
Example #30
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='GB2312')
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Check whether the page contains Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Collect all URLs on the current page
    urls = tools.get_urls(html, STOP_URLS)

    # Filter out external links and add the rest to the database
    fit_url = tools.fit_url(urls, "people.com.cn")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article info on the current page
    # Title

    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
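    # Clean HTML entity escapes (patterns like '&nbsp;') out of the title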
    title = tools.replace_str(title, '&.*?;')
    # Content
    regexs = [
        'div class="box_pic"></div>(.*?)<div class="box_pic"></div>',
        '<div class="content clear clearfix">(.*?)<div class="edit clearfix">',
        '<div class="show_text">(.*?)<div class="edit">'
    ]

    content = tools.get_info(html, regexs)
    content = content and content[0] or ''

    content = tools.del_html_tag(content)

    log.debug('''
                    depth     =  %d
                    source_url = %s
                    title     =  %s
                    content   =  %s
                 ''' % (depth, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id,
                                     source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)