def getArticle(self, config_=None):
    """\
    """
    # load test case data
    self.loadData()
    self.loadHtml()

    # basic configuration
    # no image fetching
    config = self.getConfig()
    if config is not None:
        if isinstance(config_, dict):
            for k, v in list(config_.items()):
                if hasattr(config, k):
                    setattr(config, k, v)
    self.parser = config.get_parser()

    # target language
    # needed for non english language most of the time
    target_language = self.data.get('target_language')
    if target_language:
        config.target_language = target_language
        config.use_meta_language = False

    # read in the basic image...
    with open('{}/data/images/50850547cc7310bc53e30e802c6318f1'.format(CURRENT_PATH), 'rb') as fobj:
        img_content = fobj.read()

    # read in another, blank image
    with open('{}/data/images/blank.jpeg'.format(CURRENT_PATH), 'rb') as fobj:
        blank_img = fobj.read()

    # run goose
    g = Goose(config=config)
    with requests_mock.Mocker(real_http=False) as m:
        # load images for those tests
        m.get('http://go.com/images/465395/', content=blank_img)
        m.get('http://bla.com/images/465395/', content=blank_img)
        m.get('http://md0.libe.com/photo/465395/?modified_at=1351411813&ratio_x=03&ratio_y=02&width=476',
              content=img_content)
        # if the url is not given in the result json, use the raw_html parameter
        if "url" in self.data:
            m.get(self.data['url'], text=self.html)
            return g.extract(url=self.data['url'])
        else:
            return g.extract(raw_html=self.html)

def parse_detail(self, response):
    # Academic lecture page: http://www.cqupt.edu.cn/cqupt/news_detail.shtml?id=155176964575282691
    # List API: http://www.cqupt.edu.cn/getPublicPage.do (extra parameters and a cookie required)
    # The detail page is loaded dynamically via JS; detail API: http://www.cqupt.edu.cn/getPublicNotic.do?id=155176964575282691
    item_loader = CquptSpiderItemLoader(item=CquptSpiderItem(), response=response)
    g = Goose({'stopwords_class': StopWordsChinese})
    content = g.extract(raw_html=response.text)

    item_loader.add_value('url', response.url)
    item_loader.add_value('url_obj_id', response.url)
    item_loader.add_xpath('html_title', '/html/head/title/text()')
    item_loader.add_value('crawl_time', datetime.datetime.now())

    if len(content.cleaned_text) < self.main_content_min_length:
        # The body text is too short, so treat the page as a navigation or list page
        # and fall back to the SEO meta information instead
        item_loader.add_xpath('meta_description',
                              "/html/head/meta[@name='description']/@content")
        item_loader.add_xpath('meta_keywords',
                              "/html/head/meta[@name='keywords']/@content | "
                              "/html/head/meta[@name='Keywords']/@content")
        item_loader.add_value('tags', content.title)
    else:
        item_loader.add_value('meta_keywords', content.meta_keywords)
        item_loader.add_value('meta_description', content.meta_description)
        item_loader.add_value('title', content.title)
        item_loader.add_value('create_date', content.publish_date)
        item_loader.add_value('authors', content.authors)
        item_loader.add_value('top_image', content.top_image)
        item_loader.add_value('tags', content.tags)
        item_loader.add_value('content', content.cleaned_text)

    item = item_loader.load_item()
    return item

def gooseChineseExample(self):
    data_list = []
    num = 0
    # iterate over the article URLs
    for url in self.Baiduurl:
        # initialize Goose with the Chinese stop-words tokenizer
        g = Goose({'stopwords_class': StopWordsChinese})
        # extract the article content
        article = g.extract(url=url)
        # title
        title = article.title
        data_list.append('Title: ' + title)
        # source
        source = self.getSource()
        data_list.append('Source: ' + str(source[num]))
        # publish time
        Time = self.getTime()
        data_list.append('Publish time: ' + str(Time[num]))
        # body text
        text = article.cleaned_text
        data_list.append('Text: ' + text)
        data_list.append('=============================================================================')
        num += 1
    data_list = '\n'.join(data_list)
    print(data_list)

def get_news_result_cnt(self, news_url):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese

    g = Goose(config)
    article = g.extract(news_url)
    try:
        published_time = (int(parse(article.publish_date).timestamp() * 1000)
                          if article.publish_date else None)
    except Exception:
        # fall back to dates formatted like "2020年06月28日"
        published_time = (int(time.mktime(time.strptime(article.publish_date, "%Y年%m月%d日")) * 1000)
                          if article.publish_date else None)

    news_post = dict(doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
                     keyword='',
                     url=article.final_url,
                     title=article.title,
                     platform='news',
                     content=article.cleaned_text,
                     author=article.authors,
                     source=self.source,
                     published_time=published_time,
                     spi_time=int(time.time() * 1000))
    return news_post

def gooseChineseExample():
    g = Goose({'stopwords_class': StopWordsChinese})
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text[:150])

def get_content(link):
    g = Goose({
        'use_meta_language': False,
        'target_language': 'id',
        'enable_image_fetching': True,
    })
    extract = g.extract(url=link)
    content = extract.cleaned_text
    content = GetContent.remove_publisher(content)
    content = content.replace('."', '. ')
    # normalise whitespace and drop stray quotes and leading dashes
    content = (content.replace('\n', ' ')
                      .replace('  ', ' ')
                      .replace('  ', ' ')
                      .replace("\'", "")
                      .strip('-')
                      .strip())
    # drop non-ASCII characters
    content = re.sub(r'[^\x00-\x7F]+', '', content)
    content = content.replace(' ...', '.').replace('.. .', '. ')
    content = GetContent.brut_split(content)
    content = (content.replace('.CO', '')
                      .replace('.COM', '')
                      .replace('. CO', '')
                      .replace('. COM', ''))
    content = content.strip('.').strip() + '.'
    content = GetContent.remove_baca(content)
    spoiler = content[:150] + '...'
    try:
        image = extract.top_image
        image_src = image.src
    except AttributeError:
        image_src = ''
    if len(content) <= 500:
        return "Not Valid"
    return content, spoiler, image_src

def get_article_content(url):
    try:
        logger.info("Getting article content of " + url + " with Goose")
        goose_config = {
            'browser_user_agent': 'Mozilla',
            'parser_class': 'lxml',  # 'soup' or 'lxml' for parsing XML and HTML
            # 'enable_image_fetching': True,
            'http_timeout': browser_timeout
        }
        if config["proxy"]["enabled"].lower() == "true":
            goose_config["http_proxy"] = config["proxy"]["http_ip_port"]
            goose_config["https_proxy"] = config["proxy"]["https_ip_port"]
        g = Goose(goose_config)
        logger.debug("Goose current parser is {}".format(g.config.get_parser()))
        article = g.extract(url=url)
        logger.debug("Extracted content of article from {}".format(url))
        content = article.cleaned_text.replace("\n", " ")
        paragraphs_list = article.cleaned_text.split('\n')
        logger.debug(content)
        return {"content": content, "paragraphs_list": paragraphs_list}
    except Exception:
        logger.exception("Error getting article's content from {}".format(url))
        erroneous_urls.append({"url": url, "error": "Unable to get content"})
        return {"content": "", "paragraphs_list": list()}

class HTMLExtractor:
    """Extract information from HTML.

    Currently it can extract the title and links using bs4, and the main
    content using goose3.
    """

    def __init__(self, html):
        """
        @param html: str
        """
        self.html = html
        self.soup = BeautifulSoup(html, 'lxml')
        self.goose = Goose({'enable_image_fetching': False})

    def extract_title(self):
        """Extract the title from the HTML.

        @return title: str
        """
        return self.soup.title.get_text()

    def extract_links(self):
        """Extract links from the HTML.

        @return links: list
        """
        # only consider anchors that actually carry an href attribute
        link_elements = self.soup.find_all('a', href=True)
        return [link_element['href'] for link_element in link_elements]

    def extract_content(self):
        """Extract the plain-text content from the HTML.

        @return content: str
        """
        return self.goose.extract(raw_html=self.html).cleaned_text

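# A minimal usage sketch for the HTMLExtractor class above. The sample HTML
# string is hypothetical and only illustrates the expected call pattern.
sample_html = """
<html>
  <head><title>Example page</title></head>
  <body><p>Some article text.</p><a href="https://example.com/next">next</a></body>
</html>
"""
extractor = HTMLExtractor(sample_html)
print(extractor.extract_title())    # "Example page"
print(extractor.extract_links())    # ["https://example.com/next"]
print(extractor.extract_content())  # goose3's cleaned_text for the page
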
def extract_article_information_from_html(html):
    """
    Extracts the article information from the HTML of a website.
    :param html: the page HTML as a string (e.g. response.text from requests)
    :return: a dict with the article information
    """
    article_information = {}

    # run with newspaper
    article_newspaper = Article('')
    article_newspaper.set_html(html)
    article_newspaper.parse()
    article_information["summary"] = article_newspaper.summary
    article_information["author"] = str(article_newspaper.authors).strip('[]')
    article_information["tags"] = article_newspaper.tags
    article_information["title"] = article_newspaper.title
    newspaper_text = article_newspaper.text

    # run with news-please
    # article_newsplease = NewsPlease.from_html(html)
    # newsplease_text = article_newsplease.cleaned_text

    # run with goose
    goose_extractor = Goose()
    goose_article = goose_extractor.extract(raw_html=html)
    article_goose = goose_article.cleaned_text

    # keep whichever extractor produced the longer text
    if len(newspaper_text.split(" ")) > len(article_goose.split(" ")):
        article_information["text"] = newspaper_text
    else:
        article_information["text"] = article_goose
    return article_information

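# A small usage sketch for extract_article_information_from_html; the URL is
# purely illustrative and requests is assumed to be available alongside the
# newspaper and goose3 imports used above.
import requests

html = requests.get("https://example.com/some-article", timeout=10).text
info = extract_article_information_from_html(html)
print(info["title"])
print(info["text"][:200])
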
def gather_informations():
    url = request.args['url']
    g = Goose(config={
        'local_storage_path': './data/' if os.environ.get('BUCKET_HOST') is not None else '.',
        'enable_image_fetching': True
    })
    try:
        goose_response = g.extract(url=url)
        print(goose_response.top_image)
    except Exception as e:
        print('error')
        return jsonify({'error': True, 'message': str(e)}), 500

    response_img = ''
    if goose_response.top_image is not None:
        response_img = goose_response.top_image.src

    return jsonify({
        'title': goose_response.title,
        'urlRequested': url,
        'text': goose_response.cleaned_text[:200],
        'mainImage': response_img
    }), 200

class PageReaderGoose(PageReaderBase):
    def __init__(self, url, lang="en"):
        PageReaderBase.__init__(self, url=url, lang=lang)
        self.text_property = "cleaned_text"
        self.title_property = "title"
        self.authors_property = "authors"
        self.publish_date_property = "publish_date"
        self.html_property = "raw_html"
        self.dom_property = "doc"

        if lang is None:
            self.g = Goose()
        else:
            lang = lang[:2].lower()
            if lang == "en":
                self.g = Goose()
            elif lang == "zh":
                self.g = Goose({'stopwords_class': StopWordsChinese})
            elif lang == "ko":
                self.g = Goose({'stopwords_class': StopWordsKorean})
            elif lang == "ar":
                self.g = Goose({'stopwords_class': StopWordsArabic})
            else:
                # fall back to the default (English) stop words for any other language
                self.g = Goose()

    def _read(self):
        if self.article is None:
            try:
                self.article = self.g.extract(url=self.url)
            except Exception:
                logger.info("failed when loading article content for {}\nError: {}".format(
                    self.url, traceback.format_exc()))
        return self.article

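# A minimal usage sketch for PageReaderGoose. It assumes PageReaderBase (not
# shown here) sets self.url and initialises self.article to None; the URL is
# purely illustrative.
reader = PageReaderGoose("https://example.com/some-article", lang="zh")
article = reader._read()  # goose3 Article object, or None if extraction failed
if article is not None:
    print(getattr(article, reader.title_property))
    print(getattr(article, reader.text_property)[:200])
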
def get_paragrams(search_res):
    """
    Args:
        search_res: a list of search results, each a (title, link) pair
    Returns:
        paras: the body paragraphs of every reachable link
    Raises:
        e: exception raised while splitting an article into paragraphs
    """
    paras = []
    # configure goose for Chinese content
    goose = Goose({'browser_user_agent': 'Mozilla',
                   'parser_class': 'soup',
                   'stopwords_class': StopWordsChinese})
    for ind, res_elem in enumerate(search_res):
        try:
            res_herf = res_elem[1]
            if get_access_result(target_url=res_herf) is None:
                # check whether the site is reachable first
                print('Can\'t access website: ' + res_herf)
                continue
            article = goose.extract(url=res_herf)  # extract the body text
            paras.extend(list(article.cleaned_text.split()))  # split into segments
        except Exception as e:
            print("Failed to split paragraphs in", res_elem[1], end=' ')
            print(e)
            continue
    return paras

def get_links(website_url: str) -> list:
    '''
    Get the links contained in a web page.
    :param website_url: the URL of the website to extract links from
    :return: a list of links, after a rough selection
    '''
    links = set()
    # create goose and bs4 instances
    g = Goose()
    try:
        main_page = g.extract(url=website_url)
        soup = BeautifulSoup(main_page.raw_html, 'lxml')
        # collect the links
        for line in soup.find_all('a'):
            link = line.get('href')
            if link is not None:
                # a few links start with whitespace, so strip it
                link = link.strip(' ')
                links.add(link)
        print('Extracted: ', website_url)
    except Exception as e:
        # print the error message if the extraction failed
        print('Fail to extract: ', website_url, ' Error:', str(e))
    if len(links) == 0:
        print('Warning! Function get_links() returned an empty list when extracting ', website_url)
    return list(links)

def get_news(self):
    # Visit each URL and read the articles; data is fetched for the first
    # chosen category only. categories holds the numbers 1, 2, 3, ...
    # (several are possible).
    print('Starting article extraction')
    for url in self.urls:
        try:
            category = self.categories[self.choose_category - 1]
            g = Goose({'stopwords_class': StopWordsKorean})
            article = g.extract(url=url)
            title = article.title
            # print(title)
            content = self.read_article_contents(url)
            if content == "":
                continue
            print(content)
            self.article_info["category"] = category
            self.article_info["contents"] = content
            self.article_info["title"] = title
            self.article_info["url"] = url
            # append a copy so later iterations don't overwrite entries already stored
            self.articles.append(dict(self.article_info))
            self.num_article += 1
        except Exception:
            continue
    return self.articles

def getArticle(self):
    """\
    """
    # load test case data
    self.loadData()
    self.loadHtml()

    # basic configuration
    # no image fetching
    config = self.getConfig()
    self.parser = config.get_parser()

    # target language
    # needed for non english language most of the time
    target_language = self.data.get('target_language')
    if target_language:
        config.target_language = target_language
        config.use_meta_language = False

    with requests_mock.Mocker(real_http=True) as m:
        m.get(self.data['url'], text=self.html)
        # run goose
        g = Goose(config=config)
        return g.extract(url=self.data['url'])

def summarize(url):
    g = Goose()
    article = g.extract(url=url)
    clean = article.cleaned_text

    stopword_set = set(stopwords.words("english"))
    sentence_list = nltk.sent_tokenize(clean)

    # word frequencies over the lower-cased text (so they match the
    # lower-cased tokens used for sentence scoring below), ignoring stop words
    word_frequencies = {}
    for word in nltk.word_tokenize(clean.lower()):
        if word not in stopword_set:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    # normalise the counts by the most frequent word
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # score sentences shorter than 30 words by the frequencies of their words
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    # the four highest-scoring sentences form the summary
    summary_sentences = heapq.nlargest(4, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary

def read_articles(self, headlines=None, save_continuously=False, save_dir=""):
    if headlines is None:
        headlines = self.headlines
    extractor = Goose()
    for date, daily_news in headlines.items():
        # Shuffle: if there are too many headlines some will be ignored,
        # and we want the ignored ones to be deselected at random
        shuffle(daily_news)
        news_read = []
        for new in daily_news:
            try:
                body = extractor.extract(url=new["link"]).cleaned_text
                news_read.append({**new, "body": body})
                # stop once the per-day limit is reached
                if len(news_read) == settings["max_news_per_day"]:
                    break
            except NetworkError:
                logger.error("Page not found in {}".format(new["link"]))
            except MissingSchema:
                logger.warning("Couldn't read link {}".format(new["link"]))
                logger.warning("    Reason: string 'http://' might be missing")
            except Exception as e:
                logger.warning("Unknown exception while trying to read {}".format(new["link"]))
                logger.warning("    {}".format(e))
        if len(news_read) > 0:
            self.news[date] = news_read
            if save_continuously:
                if save_dir == "":
                    logger.warning("Please provide a save directory")
                else:
                    self.save_news(save_dir, {date: news_read})
    logger.info("From {} headlines, {} of their articles were correctly downloaded".format(
        sum([len(headers) for headers in self.headlines.values()]),
        sum([len(day_news) for day_news in self.news.values()])))
    return self.news

def get_news_result_cnt(self, news_url, keyword=''):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese

    g = Goose(config)
    article = g.extract(news_url)
    text_html = article.raw_html
    text_tree = etree.HTML(text_html)
    if article.cleaned_text:
        cont = article.cleaned_text
    else:
        # fall back to an xpath over the raw HTML when goose finds no body text
        cont = ''.join(text_tree.xpath(
            '//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()')).replace('\xa0', '')
    art_title = article.title

    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=art_title,
        platform='news',
        content=cont,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000) if article.publish_date else None,
        spi_time=int(time.time() * 1000)
    )
    return news_post

def get_articles(path, news_website='https://www.yahoo.com/news/', max_articles=150):
    # Articles are saved as files under the given path (the /articles folder).
    # See the Goose and newspaper3k documentation for how to use these packages
    # (tried Beautiful Soup for this, but it was frustratingly difficult since
    # the news pages only seem to load fully while someone is actually on the
    # site, instead of loading everything automatically).
    # https://github.com/goose3/goose3 for the goose documentation
    os.chdir(path)
    paper = newspaper.build(news_website)
    g = Goose()
    i = 0
    for article in paper.articles:
        if 'html' not in article.url:
            continue
        i += 1
        print(article.url)
        print(i)
        if i == max_articles:
            break
        article_extr = g.extract(url=article.url)
        with open('title-{}.txt'.format(i), 'w') as file:
            file.write(article_extr.title)
        with open('article-{}.txt'.format(i), 'w') as file:
            file.write(article_extr.cleaned_text)
        with open('topic-{}.txt'.format(i), 'w') as file:
            file.write(article_extr.domain)

def get_text():
    g = Goose()
    url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    article = g.extract(url=url)
    print(article.title)
    print(article.cleaned_text)
    return article.cleaned_text

def goose_scraper(link):
    '''
    Returns cleaned text using the python goose3 API
    '''
    g = Goose()
    article = g.extract(link)
    return article.cleaned_text

def parse_item(self, response):
    self.iter_count += 1
    html = response.body

    # Goose object used to extract data from the page
    goose_extractor = Goose()
    article = goose_extractor.extract(raw_html=html)

    # Check that the page contains at least one h2 header with the word
    # 'Examples', to decide whether it is a trope or not
    if response.css('h2').re('.Examples:.'):
        self.trope_count += 1
        follow = True
        json_file = self.generate_json(article)
        self.create_files(json_file, 'tropo')
        # File used to keep track of the indexed tropes
        # with open(self.final_directory + 'trope_list.txt', 'a+', encoding='utf-8') as fp:
        #     fp.write(response.url + '\n')
    else:
        self.non_trope_count += 1
        if 'Laconic' in response.url:
            print('Found a Laconic!')
            self.laconic_count += 1
            json_file = self.generate_json(article)
            self.create_files(json_file, 'laconic')
        else:
            print('Link ignored! (it was not a trope)')
            follow = False

    # Close the goose object
    goose_extractor.close()

class AuthorExtractor(object):

    def __init__(self, html):
        self._html = html
        self._author = ''
        self._g = Goose()

    def get_author_method1(self):
        self._article = self._g.extract(raw_html=self._html)
        authors = self._article.authors
        if authors:
            self._author = authors[0]

    def get_author_method2(self):
        self._author = get_info(self._html, AUTHOR_REGEXS_TEXT, fetch_one=True)
        if not self._author:
            # No match yet; strip the tags and try again, since some pages put
            # tags between the label and the author name
            self._author = get_info(replace_str(self._html, '<(.|\n)*?>', ' '),
                                    AUTHOR_REGEXS_TEXT, fetch_one=True)
        if not self._author:
            # Still no match; fall back to the author meta tag in the HTML
            self._author = get_info(self._html, AUTHOR_REGEX_TAG, fetch_one=True)

    def get_author(self):
        self.get_author_method1()
        if not self._author:
            self.get_author_method2()
        return self._author

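# A minimal usage sketch for AuthorExtractor; the HTML snippet is hypothetical,
# and get_info / replace_str / AUTHOR_REGEXS_TEXT / AUTHOR_REGEX_TAG are assumed
# to be the project's own helpers imported elsewhere.
page_html = '<html><head><meta name="author" content="Jane Doe"></head><body>...</body></html>'
author = AuthorExtractor(page_html).get_author()
print(author)  # goose3's authors list is tried first, then the regex fallbacks
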
def textExtractor(urlList):
    """
    Extract text from the URLs found in tweets.
    :param urlList: filtered url list; each entry is a (tweet id, url) pair
    :return: a list of (tweet id, text) pairs with the text extracted from the url links
    """
    print('start text extraction from url')
    g = Goose()
    if urlList:
        textList = []
        # crude timeout guard, in case a url is not working properly or takes too long
        time_out = time.process_time() + 5
        while time.process_time() <= time_out:
            for url in urlList:
                print(url[0])
                try:
                    article = g.extract(url=url[1])
                    text = article.cleaned_text
                    textList.append((url[0], text))
                    # with open(r"C:\\Users\\no281\\Documents\\harVeyTwitter\\articalExtracted\\test\\"
                    #           + str(url[0]) + ".txt", 'w') as outfile:
                    #     outfile.write(text)
                except Exception:
                    print('url break, continue')
            return textList

def content_extractor():
    if request.method == 'GET':
        return "<h1>Yes, the server's running</h1>"
    if request.method == 'POST':
        # to handle the absurd CORS problems - figure out how to do JSON
        data = str(request.data, encoding='utf-8')

        # actual content extraction
        url = data
        g = Goose(config={'enable_image_fetching': True})
        article = g.extract(url=url)

        # In the extension form, `data` will be the target site's raw HTML,
        # in which case the commands become:
        #     raw_html = data
        #     article = g.extract(raw_html=raw_html)
        # Right now, though, goose handles fetching the HTML itself.

        # if an image is available, send that too
        img_src = ""
        if article.top_image:
            img_src = article.top_image.src

        res_dict = {
            'title': article.title,
            'img_src': img_src,
            'content': article.cleaned_text
        }
        response = jsonify(res_dict)
        return response

def fullNews(link, feed):
    g = Goose()
    try:
        article = g.extract(url=link)
        createfiles(feed, article.title, article.cleaned_text)
    except Exception:
        print('error')

def body(url):
    g = Goose()
    article = g.extract(url=url)
    article = str(article.cleaned_text)
    article = article.replace('"', '')
    # collapse runs of whitespace into single spaces
    article = " ".join(article.split())
    return article

def get_reading():
    global body
    try:
        g = Goose({'browser_user_agent': useragent_generator()})
        reading = g.extract(url=BASE_URL)
        body = reading.cleaned_text
    except Exception:
        body = 'None'

def _extract_content(self, html):
    # temporarily swap in the custom calculate_best_node / post_cleanup
    # implementations on goose3's ContentExtractor
    ContentExtractor.calculate_best_node = calculate_best_node
    ContentExtractor.post_cleanup = post_cleanup
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    # swap the previously saved methods (f1 / f2) back in
    ContentExtractor.calculate_best_node = f1
    ContentExtractor.post_cleanup = f2
    return article.cleaned_text

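# A sketch of how the module-level names used in _extract_content above might
# be set up. Treating f1 / f2 as the saved original ContentExtractor methods is
# an assumption, since that code is not shown in this snippet, and the custom
# implementations below are hypothetical placeholders.
from goose3.extractors.content import ContentExtractor

f1 = ContentExtractor.calculate_best_node  # keep the originals so the
f2 = ContentExtractor.post_cleanup         # monkey-patch can be undone later

def calculate_best_node(self, *args, **kwargs):
    # a custom best-node selection strategy would go here
    return f1(self, *args, **kwargs)

def post_cleanup(self, *args, **kwargs):
    # custom post-extraction cleanup would go here
    return f2(self, *args, **kwargs)
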
def extract_article(self):
    '''
    Returns a goose article object
    '''
    gooser = Goose()
    article = gooser.extract(url=self.url)
    return article

schneier = pickle.load(inf)
g = Goose()

def get_headline(article):
    '''example input => '<a name="12">Comments from Readers</a>'''
    headline = article.split("</h4>")[0]
    headline = BeautifulSoup(headline, 'html.parser').get_text()
    return headline  # e.g. Comments from Readers

def get_pubdate(url_page):
    '''example input => /crypto-gram/archives/2007/0315.html'''
    yyyy, mody = url_page.replace(".html", "").split("/")[-2:]
    mo = mody[0:2]
    dy = mody[2:4]
    return "{}-{}-{}".format(yyyy, mo, dy)

with open("out.json", "w") as of:
    for page in schneier:
        for article in schneier[page].split('<h4>')[1:]:
            out = {}
            body = article.split("</h4>")[1]
            out["pubdate"] = get_pubdate(page)
            out["headline"] = get_headline(article)
            out["text"] = g.extract(raw_html=body).cleaned_text
            out["url"] = "https://www.schneier.com/" + page
            json.dump(out, of)
            of.write("\n")

# coding: utf-8

# In[3]:

from goose3 import Goose
from goose3.text import StopWordsChinese

# initialize Goose with the Chinese stop-words tokenizer
g = Goose({'stopwords_class': StopWordsChinese})
# article URL
url = 'https://mp.weixin.qq.com/s/zflbcF5PS06QC5YJXpiviQ'
# extract the article content
article = g.extract(url=url)
# title
print('Title:', article.title)
# body text
print(article.cleaned_text)

# In[6]:

url = 'http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
g = Goose({
    'browser_user_agent': 'Version/5.1.2 Safari/534.52.7',
    'http_timeout': 15
})
article = g.extract(url=url)
print(article.meta_description)
print(article.meta_keywords)