Example #1
    def getArticle(self, config_=None):
        """\

        """
        # load test case data
        self.loadData()
        self.loadHtml()

        # basic configuration
        # no image fetching
        config = self.getConfig()
        # apply any per-test overrides passed in via config_
        if isinstance(config_, dict):
            for k, v in config_.items():
                if hasattr(config, k):
                    setattr(config, k, v)
        self.parser = config.get_parser()

        # target language
        # needed for non english language most of the time
        target_language = self.data.get('target_language')
        if target_language:
            config.target_language = target_language
            config.use_meta_language = False

        # read in the basic image...
        with open(
                '{}/data/images/50850547cc7310bc53e30e802c6318f1'.format(
                    CURRENT_PATH), 'rb') as fobj:
            img_content = fobj.read()

        # read in another, blank image
        with open('{}/data/images/blank.jpeg'.format(CURRENT_PATH),
                  'rb') as fobj:
            blank_img = fobj.read()

        # run goose
        g = Goose(config=config)

        with requests_mock.Mocker(real_http=False) as m:

            # load images for those tests
            m.get('http://go.com/images/465395/', content=blank_img)
            m.get('http://bla.com/images/465395/', content=blank_img)
            m.get(
                'http://md0.libe.com/photo/465395/?modified_at=1351411813&ratio_x=03&ratio_y=02&width=476',
                content=img_content)
            # if the url is not given in the result json, use the raw_html parameter.
            if "url" in self.data:
                m.get(self.data['url'], text=self.html)
                return g.extract(url=self.data['url'])
            else:
                return g.extract(raw_html=self.html)
Example #2
 def parse_detail(self, response):
     # Academic lecture page: http://www.cqupt.edu.cn/cqupt/news_detail.shtml?id=155176964575282691
     # List API: http://www.cqupt.edu.cn/getPublicPage.do (requires an extra cookie parameter)
     # Content is loaded dynamically via JS; detail API: http://www.cqupt.edu.cn/getPublicNotic.do?id=155176964575282691
     item_loader = CquptSpiderItemLoader(item=CquptSpiderItem(),
                                         response=response)
     g = Goose({'stopwords_class': StopWordsChinese})
     content = g.extract(raw_html=response.text)
     item_loader.add_value('url', response.url)
     item_loader.add_value('url_obj_id', response.url)
     item_loader.add_xpath('html_title', '/html/head/title/text()')
     item_loader.add_value('crawl_time', datetime.datetime.now())
     if len(content.cleaned_text) < self.main_content_min_length:
         # body text is too short; treat this as a navigation or list page
         # and try to parse the SEO metadata instead
         item_loader.add_xpath(
             'meta_description',
             "/html/head/meta[@name='description']/@content")
         item_loader.add_xpath(
             'meta_keywords',
             "/html/head/meta[@name='keywords']/@content | "
             "/html/head/meta[@name='Keywords']/@content")
         item_loader.add_value('tags', content.title)
     else:
         item_loader.add_value('meta_keywords', content.meta_keywords)
         item_loader.add_value('meta_description', content.meta_description)
         item_loader.add_value('title', content.title)
         item_loader.add_value('create_date', content.publish_date)
         item_loader.add_value('authors', content.authors)
         item_loader.add_value('top_image', content.top_image)
         item_loader.add_value('tags', content.tags)
         item_loader.add_value('content', content.cleaned_text)
     item = item_loader.load_item()
     return item
Example #3
    def gooseChineseExample(self):

        data_list = []
        # article URLs
        num = 0
        for url in self.Baiduurl:
            # initialize Goose with Chinese word segmentation
            g = Goose({'stopwords_class': StopWordsChinese})
            # extract the article
            article = g.extract(url=url)
            # title
            title = article.title
            data_list.append('标题: ' + title)
            # source
            source = self.getSource()
            data_list.append('来源: ' + str(source[num]))
            # publish time
            Time = self.getTime()
            data_list.append('发布时间: ' + str(Time[num]))
            # body text
            text = article.cleaned_text
            data_list.append('文本: ' + text)
            data_list.append(
                '============================================================================='
            )
            num += 1
        data_list = '\n'.join(data_list)
        print(data_list)
Example #4
 def get_news_result_cnt(self, news_url):
     config = Configuration()
     config.http_proxies = {'http': self.proxy, 'https': self.proxy}
     config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
     config.stopwords_class = StopWordsChinese
     g = Goose(config)
     article = g.extract(news_url)
     try:
         published_time = int(
             parse(article.publish_date).timestamp() *
             1000) if article.publish_date else None
     except:
         published_time = int(
             time.mktime(time.strptime(article.publish_date, "%Y年%m月%d日")) *
             1000) if article.publish_date else None
     news_post = dict(doc_id=md5(
         article.final_url.encode('utf-8')).hexdigest(),
                      keyword='',
                      url=article.final_url,
                      title=article.title,
                      platform='news',
                      content=article.cleaned_text,
                      author=article.authors,
                      source=self.source,
                      published_time=published_time,
                      spi_time=int(time.time() * 1000))
     return news_post
Example #5
def gooseChineseExample():
    g = Goose({'stopwords_class': StopWordsChinese})
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text[:150])
Example #6
    def get_content(link):
        g = Goose({
            'use_meta_language': False,
            'target_language': 'id',
            'enable_image_fetching': True,
        })
        extract = g.extract(url=link)

        content = extract.cleaned_text
        content = GetContent.remove_publisher(content)
        content = content.replace('."', '. ')
        content = content.replace('\n', ' ').replace('   ', ' ').replace('  ', ' ').replace("\'", "").strip('-').strip()
        content = re.sub(r'[^\x00-\x7F]+', '', content)
        content = content.replace(' ...', '.').replace('.. .', '. ')
        content = GetContent.brut_split(content)
        content = content.replace('.CO', '').replace('.COM', '').replace('. CO', '').replace('. COM', '')
        content = content.strip('.').strip() + '.'
        content = GetContent.remove_baca(content)
        spoiler = content[:150] + '...'
        try:
            image = extract.top_image
            image_src = image.src
        except:
            image_src = ''

        if len(content) <= 500:
            return "Not Valid"
        else:
            return content, spoiler, image_src
Example #7
def get_article_content(url):
    try:
        logger.info("Getting article content of " + url + " with Goose")
        goose_config = {
            'browser_user_agent': 'Mozilla',
            'parser_class': 'lxml',  # soup or lxml for parsing xml and html
            # 'enable_image_fetching': True,
            'http_timeout': browser_timeout
        }

        if config["proxy"]["enabled"].lower() == "true":
            goose_config["http_proxy"] = config["proxy"]["http_ip_port"]
            goose_config["https_proxy"] = config["proxy"]["https_ip_port"]

        g = Goose(goose_config)
        logger.debug("Goose current parser is {}".format(
            g.config.get_parser()))
        article = g.extract(url=url)
        logger.debug("Extracted content of article from {}".format(url))
        content = article.cleaned_text.replace("\n", " ")
        cleaned_text = article.cleaned_text
        paragraphs_list = cleaned_text.split('\n')

        logger.debug(content)

        return {"content": content, "paragraphs_list": paragraphs_list}
    except Exception as e:
        logging.exception(
            "Error getting article's content from {}".format(url))
        erroneous_urls.append({"url": url, "error": "Unable to get content"})
        content = ""
        return {"content": content, "paragraphs_list": list()}
Example #8
class HTMLExtractor:
    """Extract information from html

    Currently, it can extract title and links using bs4, and content using goose3.
    """
    def __init__(self, html):
        """
        @param html: str
        """
        self.html = html
        self.soup = BeautifulSoup(html, 'lxml')
        self.goose = Goose({'enable_image_fetching': False})

    def extract_title(self):
        """Extract title from html

        @return title: str
        """
        return self.soup.title.get_text()

    def extract_links(self):
        """Extract links from html

        @return links: list
        """
        link_elements = self.soup.find_all('a')
        return [link_element['href'] for link_element in link_elements]

    def extract_content(self):
        """Extract pure content from html

        @return content: str
        """
        return self.goose.extract(raw_html=self.html).cleaned_text
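A minimal usage sketch of the class above; the HTML literal below is only a stand-in, and the sketch assumes HTMLExtractor is in scope together with the bs4 and goose3 imports it relies on:

# assumes the HTMLExtractor class above is defined, with:
# from bs4 import BeautifulSoup
# from goose3 import Goose
html = ('<html><head><title>Demo page</title></head>'
        '<body><p>Some article text.</p>'
        '<a href="https://example.com">link</a></body></html>')

extractor = HTMLExtractor(html)
print(extractor.extract_title())    # 'Demo page'
print(extractor.extract_links())    # ['https://example.com']
print(extractor.extract_content())  # goose3's cleaned_text for this tiny snippet (likely very short)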
Example #9
def extract_article_information_from_html(html):
    """
    This methods gets a website the HTML as string and extracts the text of
    the article

    :param html: a HTML object from package requests
    :return: the article information
    """
    article_information = {}

    # run with newspaper
    article_newspaper = Article('')
    article_newspaper.set_html(html)
    article_newspaper.parse()

    article_information["summary"] = article_newspaper.summary
    article_information["author"] = str(article_newspaper.authors).strip('[]')
    article_information["tags"] = article_newspaper.tags
    article_information["title"] = article_newspaper.title

    newspaper_text = article_newspaper.text
    # run with newsplease
    # article_newsplease = NewsPlease.from_html(html)
    # newsplease_text = article_newsplease.cleaned_text
    # run with goose
    goose_extractor = Goose()
    goose_article = goose_extractor.extract(raw_html=html)
    article_goose = goose_article.cleaned_text
    if len(newspaper_text.split(" ")) > len(article_goose.split(" ")):
        article_information["text"] = newspaper_text
    else:
        article_information["text"] = article_goose
    return article_information
Example #10
def gather_informations():
    url = request.args['url']
    g = Goose(config={
        'local_storage_path': './data/' if os.environ.get('BUCKET_HOST') is not None else '.',
        'enable_image_fetching': True,
    })
    try:
        goose_response = g.extract(url=url)
        print(goose_response.top_image)
    except Exception as e:
        print('error')
        # jsonify cannot serialize an Exception object, so pass its message as a string
        return jsonify({'error': True, 'message': str(e)}), 500

    response_img = ''
    if goose_response.top_image is not None:
        response_img = goose_response.top_image.src
    return jsonify({
        'title': goose_response.title,
        'urlRequested': url,
        'text': goose_response.cleaned_text[:200],
        'mainImage': response_img
    }), 200
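A client-side sketch for exercising the endpoint above, assuming gather_informations is registered as a Flask GET route; the route path and port below are hypothetical:

import requests

# hypothetical route and port; substitute whatever path gather_informations is bound to
resp = requests.get('http://localhost:5000/gather',
                    params={'url': 'https://example.com/article'})
print(resp.status_code)
print(resp.json())  # expected keys: title, urlRequested, text, mainImage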
Example #11
class PageReaderGoose(PageReaderBase):
    def __init__(self, url, lang="en"):
        PageReaderBase.__init__(self, url=url, lang=lang)
        self.text_property = "cleaned_text"
        self.title_property = "title"
        self.authors_property = "authors"
        self.publish_date_property = "publish_date"
        self.html_property = "raw_html"
        self.dom_property = "doc"

        if lang is None:
            self.g = Goose()
        else:
            lang = lang[:2].lower()
            if lang == "en":
                self.g = Goose()
            else:
                if lang == "zh":
                    stopwords_class = StopWordsChinese
                elif lang == "ko":
                    stopwords_class = StopWordsKorean
                elif lang == "ar":
                    stopwords_class = StopWordsArabic
                else:
                    # unsupported language: avoid a NameError and fall back to defaults
                    stopwords_class = None
                if stopwords_class is None:
                    self.g = Goose()
                else:
                    self.g = Goose({'stopwords_class': stopwords_class})

    def _read(self):
        if self.article is None:
            try:
                self.article = self.g.extract(url=self.url)
            except:
                logger.info(
                    "failed when loading article content for {}\nError: {}".
                    format(self.url, traceback.format_exc()))
        return self.article
Example #12
def get_paragrams(search_res):
    """
        Args:
            search_res:返回一组搜索结果和链接

        Returns:
            clean_res:返回所有链接的正文段落

        Raises:
            e:文章段落分割异常
    """
    paras = []
    goose = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup',
                   'stopwords_class': StopWordsChinese})  # configure Goose
    for ind, res_elem in enumerate(search_res):
        try:
            res_href = res_elem[1]
            if get_access_result(target_url=res_href) is None:  # check that the site is reachable
                print('Can\'t access website: ' + res_href)
                continue
            article = goose.extract(url=res_href)  # extract the body text
            paras.extend(list(article.cleaned_text.split()))  # split into segments
        except Exception as e:
            print("Failed to split paragraphs in", res_elem[1], end='  ')
            print(e)
            continue
    return paras
Example #13
def get_links(website_url: str) -> list:
    '''
        It is used to get the links from a web page
    :param website_url: the URL of the website to be extracted
    :return: a list of links, after a rough selection
    '''
    links = set()
    # create goose and bs4 instances
    g = Goose()
    try:
        main_page = g.extract(url=website_url)
        soup = BeautifulSoup(main_page.raw_html, 'lxml')
        # Get the link
        for line in soup.find_all('a'):
            link = line.get('href')
            if link is not None:
                # a few links start with leading whitespace, so strip it
                link = link.strip(' ')
                links.add(link)
        print('Extracted: ', website_url)
    except Exception as e:
        # Print the error message if failed to extract
        print('Failed to extract: ', website_url, '   Error:', str(e))
    if len(links) == 0:
        print(
            'Warning! Function: get_links() returned an empty list when extracting ',
            website_url)
    return list(links)
Example #14
    def get_news(self):
        # Actually visits each URL and reads the articles; returns the data
        # fetched when searching with the first category only.
        # categories accepts the numbers 1, 2, 3 (multiple values allowed).
        print('기사 추출 시작')
        for url in self.urls:
            try:
                category = self.categories[self.choose_category - 1]

                g = Goose({'stopwords_class': StopWordsKorean})
                article = g.extract(url=url)
                title = article.title
                #print(title)
                content = self.read_article_contents(url)
                if content == "":
                    continue
                print(content)
                self.article_info["category"] = category
                self.article_info["contents"] = content
                self.article_info["title"] = title
                self.article_info["url"] = url
                self.articles.append(dict(self.article_info))  # append a copy so later iterations do not overwrite earlier entries
                self.num_article += 1
            except:
                continue

        return self.articles
Example #15
    def getArticle(self):
        """\

        """
        # load test case data
        self.loadData()
        self.loadHtml()

        # basic configuration
        # no image fetching
        config = self.getConfig()
        self.parser = config.get_parser()

        # target language
        # needed for non english language most of the time
        target_language = self.data.get('target_language')
        if target_language:
            config.target_language = target_language
            config.use_meta_language = False

        with requests_mock.Mocker(real_http=True) as m:
            m.get(self.data['url'], text=self.html)
            # run goose
            g = Goose(config=config)
            return g.extract(url=self.data['url'])
Example #16
def summarize(url):
    g = Goose()
    article = g.extract(url=url)
    clean = article.cleaned_text
    stopword_set = set(stopwords.words("english"))
    sentence_list = nltk.sent_tokenize(clean)

    word_frequencies = {}
    for word in nltk.word_tokenize(clean):
        if word not in stopword_set:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    maximum_frequency = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequency)
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(4, sentence_scores, key=sentence_scores.get)

    summary = ' '.join(summary_sentences)

    return summary
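A brief usage note for the function above: it assumes the module already imports nltk, heapq, Goose and the NLTK stopwords corpus as used in the code, and that the NLTK data packages have been fetched once; the article URL below is hypothetical:

import nltk

# one-time downloads needed by sent_tokenize/word_tokenize and stopwords.words("english");
# newer NLTK releases may also ask for the 'punkt_tab' resource
nltk.download('stopwords')
nltk.download('punkt')

# hypothetical article URL, for illustration only
print(summarize('https://example.com/some-article'))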
Example #17
    def read_articles(self, headlines=None, save_continuously=False, save_dir=""):
        if headlines is None:
            headlines = self.headlines
        extractor = Goose()
        for date, daily_news in headlines.items():
            # Shuffle since if there are too many some will be ignored
            # and we want the ignored ones to be randomly deselected
            shuffle(daily_news)

            news_read = []
            for new in daily_news:
                try:
                    body = extractor.extract(url=new["link"]).cleaned_text
                    news_read.append({**new, "body": body})
                    if len(news_read) == settings["max_news_per_day"]:  # stop once the per-day limit is reached
                        break
                except NetworkError:
                    logger.error("Page not found in {}".format(new["link"]))
                except MissingSchema:
                    logger.warning("Couldn't read link {}".format(new["link"]))
                    logger.warning("  Reason: string 'http://' might be missing")
                except Exception as e:
                    logger.warning("Unknown exception while trying to read {}".format(new["link"]))
                    logger.warning("   {}".format(e))
            if len(news_read) > 0:
                self.news[date] = news_read
                if save_continuously:
                    if save_dir == "":
                        logger.warning("Please provide a save directory")
                    else:
                        self.save_news(save_dir, {date: news_read})
        logger.info("From {} headlines, {} of their articles where correctly downloaded".format(
            sum([len(headers) for headers in self.headlines.values()]),
            sum([len(day_news) for day_news in self.news.values()])))
        return self.news
Example #18
 def get_news_result_cnt(self, news_url, keyword=''):
     config = Configuration()
     config.http_proxies = {
         'http': self.proxy,
         'https': self.proxy
     }
     config.browser_user_agent = self.ua
     config.stopwords_class = StopWordsChinese
     g = Goose(config)
     article = g.extract(news_url)
     text_html = article.raw_html
     text_tree = etree.HTML(text_html)
     if article.cleaned_text:
         cont = article.cleaned_text
     else:
         cont = ''.join(text_tree.xpath(
             '//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()')).replace('\xa0', '')
     art_title = article.title
     news_post = dict(
         doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
         keyword=keyword,
         url=article.final_url,
         title=art_title,
         platform='news',
         content=cont,
         author=article.authors,
         source=self.source,
         published_time=int(parse(article.publish_date).timestamp() * 1000) if article.publish_date else None,
         spi_time=int(time.time() * 1000)
     )
     return news_post
Example #19
def get_articles(path,
                 news_website='https://www.yahoo.com/news/',
                 max_articles=150):
    # articles should be saved in the /articles folder
    # See the Goose and newspaper3k documentation for an explanation of how to
    # use these packages (tried to use Beautiful Soup for this but it was
    # frustratingly difficult, since it seems that the news pages only load
    # when someone is actually on the site, instead of automatically loading
    # everything).
    # https://github.com/goose3/goose3 for goose documentation
    os.chdir(path)
    paper = newspaper.build(news_website)
    g = Goose()
    i = 0
    for article in paper.articles:
        if 'html' in article.url:
            i += 1
            print(article.url)
            print(i)
            if i != max_articles:
                url = article.url
                article_extr = g.extract(url=url)
                with open('title-{}.txt'.format(i), 'w') as file:
                    file.write(article_extr.title)
                with open('article-{}.txt'.format(i), 'w') as file:
                    file.write(article_extr.cleaned_text)
                with open('topic-{}.txt'.format(i), 'w') as file:
                    file.write(article_extr.domain)
            else:
                break
        else:
            continue
Example #20
def get_text():
    g = Goose()
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.cleaned_text)
    return article.cleaned_text
Example #21
File: scraper.py  Project: radumazilu/lys
def goose_scraper(link):
    '''
    Returns cleaned text using the python goose3 api
    '''
    g = Goose()
    article = g.extract(link)
    return article.cleaned_text
Example #22
	def parse_item(self, response):
	
		self.iter_count += 1
		
		html = response.body
		
		# Goose object to extract data from the page
		goose_extractor = Goose()
		article = goose_extractor.extract(raw_html=html)
		
		# Check that the page contains (at least) one h2 header with the word 'Examples', to tell whether it is a trope or not
		if(response.css('h2').re('.Examples:.')):
			self.trope_count+=1
			follow = True
			json_file = self.generate_json(article)
			self.create_files(json_file, 'tropo')
			
			# File used to keep track of the indexed tropes
			#with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
			#	fp.write(response.url+'\n')
			
		else:
			self.non_trope_count += 1
			if('Laconic' in response.url):
				print('Encontrado un Laconic!')
				self.laconic_count += 1
				json_file = self.generate_json(article)
				self.create_files(json_file, 'laconic')
			else:
				print('Enlace ignorado! (no era un tropo)')
			follow = False
		
		# Close the goose object
		goose_extractor.close()
Example #23
class AuthorExtractor(object):
    def __init__(self, html):
        self._html = html
        self._author = ''
        self._g = Goose()

    def get_author_method1(self):
        self._article = self._g.extract(raw_html=self._html)
        authors = self._article.authors
        if authors:
            self._author = authors[0]

    def get_author_method2(self):
        self._author = get_info(self._html, AUTHOR_REGEXS_TEXT, fetch_one=True)
        if not self._author:  # no match; strip the tags and retry, since some pages put tags between the author label and the name
            self._author = get_info(replace_str(self._html, '<(.|\n)*?>', ' '),
                                    AUTHOR_REGEXS_TEXT,
                                    fetch_one=True)

        if not self._author:  # still no match; fall back to matching the author tag in the HTML
            self._author = get_info(self._html,
                                    AUTHOR_REGEX_TAG,
                                    fetch_one=True)

    def get_author(self):
        self.get_author_method1()
        if not self._author:
            self.get_author_method2()
        return self._author
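A minimal usage sketch for the class above, assuming get_info, replace_str, AUTHOR_REGEXS_TEXT and AUTHOR_REGEX_TAG are supplied by the surrounding module; the HTML literal is a stand-in:

# hypothetical usage; real HTML would come from a downloaded page
html = ('<html><head><meta name="author" content="Jane Doe"/></head>'
        '<body><p>article body ...</p></body></html>')
extractor = AuthorExtractor(html)
print(extractor.get_author())  # falls back through the two methods until an author is found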
Example #24
def textExtractor(urlList):
    """
    Extract text from tweet URLs; returns the tweet ID (tid) together with the extracted text
    :param urlList: filtered url list
    :return: a list containing twitter IDs with all text extracted from the url links
    """
    # urlList: list of urls with tid
    print('start text extraction from url')
    g = Goose()
    textList = []
    if urlList:
        time_out = time.process_time() + 5

        while time.process_time() <= time_out:
            for url in urlList:
                print(url[0])
                try:  # timeout guard, in case a url is not working properly or takes too long
                    article = g.extract(url=url[1])
                    text = article.cleaned_text
                    textList.append((url[0], text))
                    # with open(
                    #         r"C:\\Users\\no281\\Documents\\harVeyTwitter\\articalExtracted\\test\\" + str(
                    #             url[0]) + ".txt", 'w') as outfile:
                    #     outfile.write(text)
                    # outfile.close()
                except:
                    print('url break, continue')
    return textList
Example #25
def content_extractor():
    if request.method == 'GET':
        return "<h1>Yes, the server's running</h1>"
    if request.method == 'POST':
        # to handle the absurd CORS problems - figure out how to do JSON
        data = str(request.data, encoding='utf-8')

        # actual content extraction
        url = data
        g = Goose(config={'enable_image_fetching': True})
        article = g.extract(url=url)
        # when you have it in extension form, `data` will be the target site URL's
        # raw html. Hence you'll have the following commands:
            # raw_html = data
            # article = g.extract(raw_html=raw_html)
        # Right now, though, goose handles getting the html

        # if image available send that also
        img_src = ""
        if article.top_image:
            img_src = article.top_image.src

        res_dict = {
            'title': article.title,
            'img_src': img_src,
            'content': article.cleaned_text
        }
        response = jsonify(res_dict)
        return response
Example #26
def fullNews(link, feed):

    g = Goose()
    try:
        article = g.extract(url=link)
        createfiles(feed, article.title, article.cleaned_text)
    except:
        print('error')
Example #27
def body(url):
    g = Goose()
    article = g.extract(url=url)
    article = str(article.cleaned_text)
    article = article.replace('"', '')
    article = " ".join(article.split())
    return article
Example #28
def get_reading():
    global body
    try:
        g = Goose({'browser_user_agent': useragent_generator()})
        reading = g.extract(url=BASE_URL)
        body = reading.cleaned_text
    except:
        body = 'None'
Example #29
def body(url):
	g = Goose()
	article = g.extract(url=url)
	article = str(article.cleaned_text)
	article = article.replace('"','')
	article = " ".join(article.split())
	return article
Example #30
 def _extract_content(self, html):
     ContentExtractor.calculate_best_node = calculate_best_node
     ContentExtractor.post_cleanup = post_cleanup
     g = Goose({'enable_image_fetching': False})
     article = g.extract(raw_html=html)
     ContentExtractor.calculate_best_node = f1
     ContentExtractor.post_cleanup = f2
     return article.cleaned_text
Example #31
	def extract_article(self):
		'''
		returns a goose article object
		'''

		gooser = Goose()
		article = gooser.extract(url=self.url)
		return article
Example #32
    schneier = pickle.load(inf)

g = Goose()


def get_headline(article):
    '''example input => '<a name="12">Comments from Readers</a>'''
    headline = article.split("</h4>")[0]
    headline = BeautifulSoup(headline, 'html.parser').get_text()
    return headline  # e.g. Comments from Readers


def get_pubdate(url_page):
    '''example input => /crypto-gram/archives/2007/0315.html'''
    yyyy, mody = url_page.replace(".html", "").split("/")[-2:]
    mo = mody[0:2]
    dy = mody[2:4]
    return "{}-{}-{}".format(yyyy, mo, dy)


with open("out.json", "w") as of:
    for page in schneier:
        for article in schneier[page].split('<h4>')[1:]:
            out = defaultdict()
            body = article.split("</h4>")[1]
            out["pubdate"] = get_pubdate(page)
            out["headline"] = get_headline(article)
            out["text"] = g.extract(raw_html=body).cleaned_text
            out["url"] = "https://www.schneier.com/" + page
            json.dump(out, of)
            of.write("\n")
Example #33
# coding: utf-8

# In[3]:


from goose3 import Goose
from goose3.text import StopWordsChinese
# initialize Goose with Chinese word segmentation
g = Goose({'stopwords_class': StopWordsChinese})
# article URL
url = 'https://mp.weixin.qq.com/s/zflbcF5PS06QC5YJXpiviQ'
# extract the article
article = g.extract(url=url)
# title
print('标题:', article.title)
# print the body text
print(article.cleaned_text)


# In[6]:


url = 'http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
g = Goose({
    'browser_user_agent': 'Version/5.1.2 Safari/534.52.7',
    'http_timeout': 15
})
article = g.extract(url=url)
print(article.meta_description)
print(article.meta_keywords)