def story_top_image(stories_id):
    story = mc.story(stories_id)
    # use the tool key so anyone can see these images
    story_html = apicache.story_raw_1st_download(TOOL_API_KEY, stories_id)
    article = newspaper.Article(url=story['url'])
    article.set_html(story_html)
    article.parse()
    return jsonify({
        'top': article.top_image,
        'all': list(article.images),
    })

def scrap_article(article_link):
    article = newspaper.Article(article_link)
    article.download()
    article.parse()
    raw_paragraph = article.text
    paragraph_list = [p for p in raw_paragraph.split("\n") if p]
    return {
        "title": article.title,
        "paragraph_list": paragraph_list
    }

def all_the_content(content, article_database_ref, reload_pundits=False):
    """
    :param content: this is the mongo object containing our content up to now
    :param reload_pundits: if true, pundits are re-scraped every time
    :return: returns keywords, entities, and newpundits, as well as storing them
        in the mongo object for the article
    """
    reload_pundits = True
    article = newspaper.Article(content['url'])
    article.download()
    article.parse()
    article.nlp()
    print("HERE ARE THE NEWSPAPER KEYWORDS", article.keywords)

    content['keywords'] = ""
    content['entities'] = ""

    # if not 'keywords' in content:
    #     content['keywords'] = [x for x in get_keywords(content['text'])
    #                            if x['count'] > 2]
    #     _content.update({'_id': bson.ObjectId(content['id'])},
    #                     {'$set': {'keywords': content['keywords']}})
    #
    # if not 'entities' in content:
    #     content['entities'] = get_entities(content['text'])
    #     _content.update({'_id': bson.ObjectId(content['id'])},
    #                     {'$set': {'entities': content['entities']}})

    if 'newpundits' not in content or reload_pundits:
        content['newpundits'] = []
        dupe_list = []
        snippets, ratios = pundits.keyword_match(article_database_ref, article.keywords)
        content['newpundits'] = snippets
        _content.update({'_id': bson.ObjectId(content['id'])},
                        {'$set': {'newpundits': content['newpundits']}})

    if not len(content['newpundits']):
        print("nothing to see here!")
        failed_snippet = {}
        failed_snippet['name'] = "#shambles"
        failed_snippet['text'] = "we can't seem to find anything."
        content['newpundits'] = [[failed_snippet]]
    else:
        print("HERE ARE NEW PUNDITS:", content['newpundits'])

    return content['keywords'], content['entities'], content['newpundits']

def get_article(url):
    article = newspaper.Article(url, language=u'zh')
    try:
        article.download()
        article.parse()
    except Exception as e:
        print(u"Something went wrong... Cannot download it...")
    article_title = article.title
    article_text = article.text
    return article_title, article_text

def getKeywords(self):
    # UGLY HACK WARNING
    # If a site has a specific scraper written for it, Newspaper is never involved -
    # but Newspaper's keyword functionality is really good and I don't want to write
    # my own function for it. So I'm creating a newspaper.Article object and forcibly
    # setting attributes to allow the natural language processing to work and give me keywords.
    a = newspaper.Article(self.url)
    a.text = self.text
    a.title = self.title
    # nlp() uses direct comparisons to check the download state,
    # so I'm getting away with setting it to something arbitrary
    a.download_state = 2
    a.is_parsed = True
    a.nlp()
    return a.keywords

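# A minimal, standalone sketch of the same trick, assuming newspaper3k: feed
# pre-extracted title/text into an Article and mark it as downloaded and parsed
# so that nlp() will run. ArticleDownloadState.SUCCESS is newspaper3k's name for
# the magic value 2 used above; keywords_from_text is a hypothetical helper name.
import newspaper
from newspaper.article import ArticleDownloadState

def keywords_from_text(url, title, text):
    a = newspaper.Article(url)
    a.title = title
    a.text = text
    a.download_state = ArticleDownloadState.SUCCESS  # pretend the download happened
    a.is_parsed = True                               # pretend the parse happened
    a.nlp()                                          # keyword/summary extraction
    return a.keywords
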
def get_text_from_url(url, session, cleanwriter, errorwriter,
                      allow_redirects=False, verify=True):
    url_idx = url[0]
    url_str = url[1]
    try:
        response = session.get(url_str, allow_redirects=allow_redirects, verify=verify)
        response.close()
    except (ConnectionError, InvalidSchema) as e:
        errorwriter.writerow([url_str, e.__class__.__name__])
        response = None
        print(("#%s:" % url_idx), e.__class__.__name__, url_str)
    except (MissingSchema, TooManyRedirects, RetryError) as e:
        errorwriter.writerow([url_str, e.__class__.__name__])
        response = None
        print(("#%s:" % url_idx), e.__class__.__name__, url_str)

    if response is not None:
        if response.ok:
            article = newspaper.Article(url_str)
            article.download()
            # See https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L31
            if article.download_state == 2:
                article.parse()
                article.nlp()
                date, time = get_date_time(article.html)
                cleanwriter.writerow([
                    article.text, article.title, article.keywords, url_str,
                    article.tags, article.meta_keywords, date, time
                ])
        else:
            errorwriter.writerow([url_str, response.status_code])
            print("#%s: Error with status code %s for URL: %s" %
                  (url_idx, response.status_code, url_str))
    else:
        print("%s is not a valid URL" % url_str)

def get_article(self, url):
    article = newspaper.Article(url, language=u'zh')
    try:
        article.download()
        article.parse()
    except Exception as e:
        self.log.info(
            u"Something went wrong:\n{}\nCannot download it...".format(e))
    article_title = article.title
    article_text = article.text
    return article_title, article_text

def save_raw_html_files(graph, page_id, num_posts, html_save_loc):
    # Unrelated to the project: saving raw html files for TT's work
    page_posts = graph.get_connections(page_id, "posts", limit=num_posts)
    page_posts_data = page_posts['data']
    article_index = 0
    success = 0
    driver = webdriver.Firefox()
    driver.set_page_load_timeout(30)
    for post in page_posts_data:
        try:
            print("Examining article {n}".format(n=article_index))
            attachments = graph.get_connections(post['id'], "attachments")
            link_in_post = attachments['data'][0]['url']
            driver.get(link_in_post)
            redirected_url = driver.current_url  # follow the JavaScript redirect
            if "facebook.com" in redirected_url:
                print("Redirect failed - probably not an article")
                continue
            article = newspaper.Article(redirected_url, "en")
            article.download()
            html_file = article.html
            # final_save_loc = html_save_loc + str(article_index) + ".html"
            final_save_loc = html_save_loc + post['id'] + ".html"
            f = open(final_save_loc, 'w')
            f.write(html_file)
            f.close()
            article_index = article_index + 1
            success = success + 1
        except KeyError:
            print("This article has no attachments or url on facebook")
            continue
        except TimeoutException:
            print("Timeout exception thrown. Moving on to the next article, "
                  "since we don't care about any particular article.")
            continue
        except UnicodeEncodeError:
            print("UnicodeEncodeError thrown. Skipping to the next one; "
                  "there are too many possible causes.")
            continue
    print("Number of successful html files downloaded: {n}".format(n=success))
    driver.close()
    return

def extract_headline(self):
    try:
        self.article = newspaper.Article(self.news_url)
        self.article.download()
        self.article.parse()
    except newspaper.article.ArticleException:
        # Report the likely causes when the article cannot be downloaded or parsed
        print("\nCONNECTION/URL ERROR: There may be a problem with your connection "
              "or the URL entered may be invalid")
        self.article.title = "Invalid URL/Could not extract title"
    return self.article.title.strip()

def parse(self, response):
    newsItem = newsPaperItem()
    article = newspaper.Article(response.url)
    article.download()
    article.parse()
    nltk.download('punkt')
    article.nlp()
    newsItem["Author_Name"] = article.authors
    newsItem['Publication_Date'] = article.publish_date
    newsItem['Keywords'] = article.keywords
    newsItem['Article_text'] = article.text
    yield newsItem

def link_pull(self, url):
    parser = newspaper.Article(url, request_timeout=10)
    try:
        parser.download()
        parser.parse()
    except (newspaper.article.ArticleException, ValueError):
        return (None, None, [])
    article = parser.text
    title = parser.title
    img = parser.top_image
    return (article, title, [img])

def findArticle(links):
    articles = []
    for l in links:
        url = l.strip()
        a = newspaper.Article(url, language='en')
        a.download()
        try:
            a.parse()
        except newspaper.article.ArticleException:
            print("Article not found")
            continue
        articles.append(a)  # keep only the articles that parsed successfully
    return articles, links

def scrape(self):
    for source in [self.queue[i].get_urls() for i in range(len(self.queue))]:
        if source:
            for date in source:
                for url in date:
                    entry = {}
                    article = nw.Article(url)
                    article.download()
                    article.parse()
                    src = re.search('www.(.*).co', url).group(0).replace('.', '')
                    entry.update(title=article.title, source=src, text=article.text)
                    self.results = self.results.append(entry, ignore_index=True)
    self.save()

def extract_article(url):
    """Take the url string of a news article and return the title and text
    of the article as a Python dictionary.

    Built on top of Newspaper's article scraping & curation library."""
    link = newspaper.Article(url)
    link.download()
    link.parse()
    article = {}
    article["title"] = link.title
    article["text"] = link.text
    return article

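# Hypothetical usage of extract_article() above; the URL is only a placeholder.
info = extract_article("https://example.com/2024/01/some-story.html")
print(info["title"])
print(info["text"][:200])
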
def extract(self, url, html_text: str):
    doc = newspaper.Article(url)
    doc.download(input_html=html_text)
    doc.parse()
    self.content = {
        'url': url,
        'text': doc.text,
        'title': doc.title,
        'publish_date': doc.publish_date,
        'top_image_url': doc.top_image,
        'authors': doc.authors,
        'extraction_method': METHOD_NEWSPAPER_3k,
    }

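# A minimal standalone sketch of the same idea, using newspaper3k's
# download(input_html=...): fetch the HTML yourself (here with requests, an
# assumption) and let newspaper parse the pre-downloaded markup instead of
# refetching the URL. extract_from_fetched_html is a hypothetical helper name.
import newspaper
import requests

def extract_from_fetched_html(url):
    html = requests.get(url, timeout=10).text   # fetch once, up front
    doc = newspaper.Article(url)
    doc.download(input_html=html)               # no second network request
    doc.parse()
    return {'title': doc.title, 'text': doc.text, 'publish_date': doc.publish_date}
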
def xrun(suburls):
    sh = StructHtml(feilds, 'ex001.xlsx')
    for v in suburls:
        # print type(v)
        url = getInnerPageURLs(v)[0]
        url = "%s%s" % (mainHTTP, url)
        art = newspaper.Article(url, language='zh')
        art.download()
        art.parse()
        sh.getSruct(art.text, art.html)
        yield (v, sh.format_txt)

def retrieveContent(self, link):
    try:
        a = newspaper.Article(link)
        a.download()
        a.parse()
        text = a.text
        a.nlp()
        self.keywords = a.keywords
        self.retrieved = str(datetime.datetime.utcnow())
        return a.text
    except Exception as e:
        log.error("Exception retrieving %s" % (link))
        log.exception(e)

def set_article(self):
    try:
        self.article = newspaper.Article(self.url, keep_article_html=True)
    except Exception as e:
        print(e)
    self.article.download()
    if not self.article.is_downloaded:
        time.sleep(1)
    self.article.parse()
    if not self.article.is_parsed:
        time.sleep(1)
    self.article.nlp()

def getArticle(url):
    import dateutil.parser
    article = newspaper.Article(url, keep_article_html=True)
    try:
        article.download()
        article.parse()
    except Exception as e:
        raise e
    date = article.publish_date or dateutil.parser.parse(
        extractDate(article.html)).strftime('%Y-%m-%d %H:%M:%S')
    if not date:
        raise Exception("Cannot find date")
    return (url, article.title, ','.join(article.authors), str(date),
            article.text.replace('\n', ' '))

def getText(url):
    try:
        time.sleep(1 / 5)
        a = newspaper.Article(url, language='zh', memoize_articles=False)
        a.download()
        a.parse()
        text = ' ' + a.text
    except Exception as e:
        time.sleep(random.randint(1, 10) / 10)
        print("Download failed: " + url)
        print(e)
        text = getText(url)
    return text.replace('\n\n', '\n\n ')

def getText(self, url):
    try:
        time.sleep(3)
        a = newspaper.Article(url, language='zh')
        a.download()
        a.parse()
        text = ' ' + a.text
    except Exception as e:
        time.sleep(random.randint(1, 10))
        print("Download failed: " + url)
        print(e)
        text = self.getText(url)
    return text.replace('\n\n', '\n\n ')

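# A bounded-retry variant of the two getText() helpers above (a sketch, not the
# original code): their recursive retry never gives up, so a permanently broken
# URL can recurse without limit. Capping the attempts and returning an empty
# string keeps the same interface while bounding the work.
import random
import time

import newspaper

def get_text_with_retries(url, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            a = newspaper.Article(url, language='zh')
            a.download()
            a.parse()
            return ' ' + a.text.replace('\n\n', '\n\n ')
        except Exception as e:
            print("Download failed (%d/%d): %s" % (attempt + 1, max_attempts, url))
            print(e)
            time.sleep(random.randint(1, 10))
    return ''  # give up after max_attempts failures
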
def build_article_object(article_url):
    '''Build a formatted string with the article title, summary, and url'''
    log.debug("Building article object for article {0}".format(article_url))
    article = newspaper.Article(article_url)
    log.debug("Downloading article {0}".format(article_url))
    article.download()
    log.debug("Finished downloading article {0}, parsing".format(article_url))
    article.parse()
    log.debug("Finished parsing {0}, running nlp".format(article_url))
    article.nlp()
    article_str = "{0} ({1})\n{2}\n".format(
        article.title.encode('ascii', 'ignore'), article_url, article.summary)
    output_strs.append(article_str)

def testurl(url, newsClassifier):
    a = newspaper.Article(url)
    a.download()
    a.parse()
    a.nlp()
    l1 = get_named_entities(a.text)
    author = "default"
    try:
        author = a.authors[0]
    except IndexError:
        print("Not found")
    art = add_article(a.title, a.summary, url, author, l1)
    test_keywords(art, newsClassifier)

def __init__(self, title=None, url=None, pubDate=None, rssFeed=None):
    super(Article, self).__init__()
    # self._title = title if title != None else ''
    # self._url = url if url != None else ''
    # self._pub_date = pubDate if pubDate != None else ''
    self._title = title
    self._url = url
    self._pub_date = pubDate
    self._rssFeed = rssFeed
    if self._title is not None and self._url is not None and self._pub_date is not None:
        self._articleParse = newspaper.Article(self.url)

def get_string_data():
    # Read the text data: fetch the article
    article = newspaper.Article('http://www.bjnews.com.cn/news/2019/07/02/598100.html',
                                language='zh')
    # Download the article
    article.download()
    # Parse the article
    article.parse()
    # Run NLP processing on the article
    article.nlp()
    # Join the keywords produced by the NLP step
    string_data = "".join(article.keywords)
    return string_data

def getArticle(self, url):
    article = newspaper.Article(url)
    for i in range(5):
        article.download()
        print("Retry:", i)
        html = article.html
        if html and len(html) > 0:
            break
    article.parse()
    article.nlp()
    return article

def calculate_article_word_count(url):
    config = newspaper.Config()
    config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) " \
                                "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                "Chrome/64.0.3282.186 Safari/537.36"
    article = newspaper.Article(url, config=config)
    article.download()
    article.parse()
    if len(article.text.split()) < 200:
        raise ValidationError('Could not find article')
    return len(article.text.split()) + len(article.title.split())

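# A small variation on the Config pattern above, assuming newspaper3k's
# Configuration also exposes request_timeout: one Config object (custom user
# agent plus timeout) shared across several Article instances. The URLs are
# placeholders.
import newspaper

config = newspaper.Config()
config.browser_user_agent = "Mozilla/5.0 (compatible; example-bot/1.0)"
config.request_timeout = 10

for url in ["https://example.com/a", "https://example.com/b"]:
    art = newspaper.Article(url, config=config)
    art.download()
    art.parse()
    print(url, len(art.text.split()))
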
def newspaper_read(article_link, continue_var=0):
    # continuation variables: each call returns a window of at most max_allowed characters
    max_allowed = 7999
    start_character = max_allowed * continue_var
    end_character = start_character + max_allowed
    text = ''
    n_entity = newspaper.Article(article_link)
    n_entity.download()
    n_entity.parse()
    # decode back to str so the replace/slice below operates on text, not bytes
    text += (n_entity.text.encode('ascii', errors='ignore').decode('ascii')
             .replace('\n', '').replace('\t', ''))[start_character:end_character]
    return text

def main(argv=None):
    urls = []
    if len(argv) < 2:
        raise FileNotFoundError(
            'Please input a file with urls as the first argument')
    with open(argv[1]) as f:
        for line in f.readlines():
            line = line.rstrip("\n")
            # url will be the last "column" in line;
            # this leaves room for user-created tags
            if line != '':
                line = line.split()[-1]
                urls.append(line)

    articles = []
    print("downloading")
    for url in urls:
        print(".", url)
        article = newspaper.Article(url, language='en')
        article.download()
        article.parse()
        articles.append([article.title, article.text])
    print("done downloading")

    data_matrix = get_data_matrix(articles)

    # write_matrix(data_matrix)
    # title_combinations = article_combinations(articles)
    # euclidean = get_similarity(title_combinations, data_matrix, euclidean_similarity)
    # cosine = get_similarity(title_combinations, data_matrix, cosine_similarity)
    # jaccard = get_similarity(title_combinations, data_matrix, jaccard_similarity)
    # similarity_matrix = get_similarity_matrix(title_combinations,
    #                                           euclidean,
    #                                           cosine,
    #                                           jaccard)
    # write_matrix(similarity_matrix)

    print("start euclidean")
    euclidean_clusters = k_means(data_matrix, 5, euclidean_similarity)
    print("euclidean", [i[1] for i in euclidean_clusters])
    print("-------\nstart cosine")
    cosine_clusters = k_means(data_matrix, 5, cosine_similarity)
    print("cosine", [i[1] for i in cosine_clusters])
    print("-------\nstart jaccard")
    jaccard_clusters = k_means(data_matrix, 5, jaccard_similarity)
    print("jaccard", [i[1] for i in jaccard_clusters])

    euclidean_sse = sse(data_matrix, euclidean_clusters)
    cosine_sse = sse(data_matrix, cosine_clusters)
    jaccard_sse = sse(data_matrix, jaccard_clusters)
    print(euclidean_sse)
    print(cosine_sse)
    print(jaccard_sse)

def parse_item(self, response):
    sel = Selector(response)
    try:
        # Body text
        new = newspaper.Article(url=response.url, language='zh')
        new.download()
        new.parse()
        content = re.sub(r'\s|\W', '', new.text)
        print(content)

        # Title
        if sel.xpath("//h1[@class='entry-title']/text()").extract_first():
            title = sel.xpath(
                "//h1[@class='entry-title']/text()").extract_first()
            # print(title)
        else:
            pass

        # Time
        if sel.xpath("//a[1]/time[@class='entry-date']").extract_first():
            time = sel.xpath(
                "//a[1]/time[@class='entry-date']//text()").extract_first()
            # print(time)
        else:
            pass

        # Image url
        if sel.xpath(
                "//img[@class='aligncenter size-full wp-image-1142']/@src"
        ).extract_first():
            img_url = sel.xpath(
                "//img[@class='aligncenter size-full wp-image-1142']/@src"
            ).extract()
            for url in img_url:
                print(url)
        else:
            pass

        # Source
        if sel.xpath("//div[@class='entry-content']/p/text()").extract():
            # The content and the source line come together
            content = sel.xpath(
                "//div[@class='entry-content']/p/text()").extract()
            if '来源' in content[-1]:  # '来源' means "source"
                print(content[-1])
                # print(content, response.url)
            else:
                pass
        else:
            pass

        print(response.url)
    except:
        pass