def parse_article(url, min_words_count=jg.MIN_WORDS_TO_SCRAPE):
    """
    We download the article ourselves so that the request goes through the Tor
    network with a random user agent (don't let Newspaper do it!). Then we fool
    Newspaper into thinking it performed the download so we can parse the page
    and return the article.
    Returns None if the article is shorter than min_words_count words.
    """
    try:
        response = get_page(url)
    except Exception as err:
        update_log.error('Error in get_page()')
        update_log.error(err)
        return None

    if response is not None:
        article = ArticleParser(url="http://something")
        article.html = response.content
        # Newspaper only parses articles whose download_state is 2 (SUCCESS)
        article.download_state = 2
        try:
            article.parse()
        except Exception as err:
            update_log.error('Error in article.parse()')
            update_log.error(err)
            return None
    else:
        add_url_to_blacklist(url)
        return None  # no response, nothing to parse

    if len(article.text.split(' ')) >= min_words_count:
        return article
    return None
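# The trick used throughout these snippets is the same: fetch the HTML yourself,
# assign it to Article.html, and set download_state to 2 (ArticleDownloadState.SUCCESS)
# so that parse() accepts the pre-fetched content. A minimal, self-contained sketch;
# the URL and the plain requests call are placeholders, not part of the snippet above:
import requests
from newspaper import Article

html = requests.get('https://www.example.org/test/', timeout=10).text

article = Article(url='https://www.example.org/test/')
article.html = html           # hand Newspaper the pre-fetched HTML
article.download_state = 2    # 2 == ArticleDownloadState.SUCCESS, so parse() accepts it
article.parse()
print(article.title)
print(article.text[:200])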
def _get_content_from_url(self, url):
    """Take a single url and return the article content and title."""
    try:
        r = requests.get(url, timeout=6)
    except requests.exceptions.Timeout as e:
        # Maybe set up for a retry
        print(e)
        return ' ', ' '
    except requests.exceptions.RequestException as e:
        print(e)
        return ' ', ' '

    # save the raw HTML to file
    with open('file.html', 'wb') as fh:
        fh.write(r.content)

    a = Article(url)
    # set html manually from the saved file
    with open('file.html', 'rb') as fh:
        a.html = fh.read()

    # need to set download_state to 2 for this to work
    a.download_state = 2
    a.parse()

    title = a.title
    content = re.sub("\n\n", " ", a.text)
    # Now the article should be populated
    return content, title
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    ## does not work!
    myarticle = Article('https://www.example.org/test/')
    myarticle.html = htmlstring
    myarticle.download_state = ArticleDownloadState.SUCCESS
    myarticle.parse()
    if myarticle.publish_date is None:
        return None
    date = convert_date(myarticle.publish_date, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return date
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    # throws error on the eval_default dataset
    try:
        myarticle = Article(htmlstring)
    except (TypeError, UnicodeDecodeError):
        return None
    myarticle.html = htmlstring
    myarticle.download_state = ArticleDownloadState.SUCCESS
    myarticle.parse()
    if myarticle.publish_date is None or myarticle.publish_date == '':
        return None
    return convert_date(myarticle.publish_date, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
def extract_data(url, bert_summary):
    article = Article(url)
    print("article object created")
    article.download()
    if article.download_state != ArticleDownloadState.SUCCESS:
        # Hacking the library: fetch the HTML ourselves and mark the download as done
        article.html = urllib.request.urlopen(url).read()
        article.download_state = ArticleDownloadState.SUCCESS
    print("download completed")
    article.parse()
    print("parsing completed")
    top_image = article.top_image
    title = article.title
    if bert_summary:
        print("extracting bert summary")
        summary = extract_bert_summary(article.text)
    else:
        print("extracting short summary")
        summary = extract_short_summary(article)
    return summary, top_image, title
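# The extract_short_summary() helper called above is not defined in this snippet.
# A hypothetical minimal version, assuming Newspaper's built-in nlp() summary is
# acceptable (the real helper may do something different):
def extract_short_summary(article):
    article.nlp()            # populates article.summary (and article.keywords)
    return article.summary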
def parse_article(self, response):
    news_id = 19684  # response.meta.get('news_id')

    # save to file
    with open(str(news_id) + '.html', 'wb') as fh:
        fh.write(response.body)

    article = Article(response.url)
    # set html manually
    with open(str(news_id) + '.html', 'rb') as fh:
        article.html = fh.read()
    os.remove(str(news_id) + '.html')

    # need to set download_state to 2 for this to work
    article.download_state = 2
    article.parse()
    article.nlp()

    date = article.publish_date
    keywords = str([x.replace("'", "''") for x in article.keywords]).replace('"', '\'')
    content = article.text.replace("'", "''")
    summary = article.summary.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''") for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''") for x in article.meta_keywords]).replace('"', '\'')

    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", ' +
        '"Summary", "Authors", "Tags", "Title") ' +
        'VALUES (' + str(news_id) + ', ' + str(date) + ', \'' + content +
        '\', ARRAY ' + str(keywords) + '::text[], \'' + summary +
        '\', ARRAY ' + str(authors) + '::text[], ARRAY ' + str(tags) +
        '::text[], \'' + title + '\')')

    # get main article without comments
    content = extract_content(response.text).replace("'", "''")
    # get article and comments
    content_comments = '[\'' + extract_content_and_comments(
        response.text).replace("'", "''") + '\']'

    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") ' +
        'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
        str(content_comments) + '::text[])')

    date = articleDateExtractor.extractArticlePublishedDate(
        articleLink=response.url, html=response.text)
    if date is not None:
        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
            'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

    g = Goose()
    article = g.extract(raw_html=response.text)
    date = article.publish_datetime_utc
    keywords = str([x.replace("'", "''") for x in article.tags]).replace('"', '\'')
    content = article.cleaned_text.replace("'", "''")
    summary = article.meta_description.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''") for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''")
                for x in article.meta_keywords.split(",")]).replace('"', '\'')
    tweets = str([x.replace("'", "''") for x in article.tweets]).replace('"', '\'')

    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-goose"(' +
        '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
        '"Authors", "Tags", "Tweets",' + '"Title") VALUES (' +
        str(news_id) + ', ' + date + ', \'' + content + '\', ARRAY ' +
        str(keywords) + '::text[], \'' + str(summary) + '\', ARRAY ' +
        str(authors) + '::text[], ARRAY ' + str(tags) + '::text[], ARRAY ' +
        str(tweets) + '::text[], \'' + str(title) + '\')')
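# The string-concatenated SQL above relies on manual quote escaping and is easy to
# break (and injection-prone). A hedged alternative sketch for the first insert,
# assuming self.conn is a psycopg2 connection; the dbconnector.execute() helper is
# not shown, so its real API may differ. psycopg2 adapts Python lists to PostgreSQL
# arrays, so no escaping is needed:
with self.conn.cursor() as cur:
    cur.execute(
        'INSERT INTO "ParsedNews-newspaper"'
        '("IDNews", "Date", "Content", "Keywords", "Summary", "Authors", "Tags", "Title") '
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
        (news_id, article.publish_date, article.text, list(article.keywords),
         article.summary, list(article.authors), list(article.meta_keywords),
         article.title))
self.conn.commit()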
def fetch_main_content(html: str) -> Article:
    a = Article(url='')
    a.html = html
    a.download_state = 2
    a.parse()
    return a
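# Hypothetical usage of fetch_main_content() on HTML that was saved to disk earlier;
# the file name is a placeholder, not part of the original snippet:
with open('saved_page.html', 'r', encoding='utf-8') as fh:
    article = fetch_main_content(fh.read())
print(article.title)
print(article.text[:300])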
source_list = ast.literal_eval(e['source_list'])
# find the position of o_url in the list (needed to retrieve the correct .html)
o_idx = source_list.index(o_url)

a = Article(o_url)

# find the html file
article_alias = a_url.rstrip("/").split("/")[-1]
article_folder = html_folder + "/" + article_alias
o_html_filename = article_folder + "/" + str(o_idx) + ".html"

# set html manually
with open(o_html_filename, 'rb') as fh:
    a.html = fh.read()

# need to set download_state to 2 for this to work
a.download_state = 2
a.parse()

# Now the article should be populated
print(a.text)

gold_df.to_csv(cwd + "/datasetVeritas3.csv", index=False)

print("average number of annotations per doc:", sum(lenlen) / len(lenlen))
lenlen.sort(reverse=True)
print(lenlen[:200])
print("max num of annotations on the same source")
print(max(lenlen))
print("NEW")
print(count_array.shape)
print(b_array.shape)
def scrape(url):
    """
    Scrapes an article from 'url' and extracts metadata using the Newspaper3k package.

    Parameters:
    --------
    url : str, url to scrape

    Returns:
    --------
    doc : dict, {
        'url' : url,
        'datetime' : article publish_date,
        'title' : article title,
        'text' : article text with newlines/tabs collapsed,
        'keywords' : article keywords,
        'summary' : article summary
    }
    False : bool, if the GET request fails, parsing raises,
        or the extracted text is 200 words or fewer
    """
    from newspaper import Article, Config
    import re

    logger.info(f"SCRAPE: trying {url}")

    config = Config()
    config.memoize_articles = False
    config.fetch_images = False
    config.language = 'en'
    config.browser_user_agent = get_ua()
    config.request_timeout = 5
    config.number_threads = 8

    response = get_html_from_url(url)

    if response['status_code'] and response['html']:
        try:
            article = Article(url=url, config=config)
            article.download_state = 2
            article.html = response['html']
            article.parse()
            article.nlp()

            words_count = len(article.text.split())
            if words_count > 200:
                logger.info(
                    f'SCRAPE: Extracted TEXT from URL: {url}\n Title: "{article.title}"')
                return {
                    'url': url,
                    'datetime': article.publish_date,
                    'title': article.title,
                    'text': " ".join(re.split(r'[\n\t]+', article.text)),
                    'keywords': article.keywords,
                    'summary': article.summary
                }
            else:
                logger.info(f'''SCRAPE: Could not extract TEXT from {url}\n
                    Article too short: {words_count} words''')
        except Exception as e:
            logger.info(f'SCRAPE: Could not extract TEXT from {url}\n Error: {e}')
    else:
        logger.info(f'SCRAPE: Could not extract TEXT from {url}')
    return False
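# scrape() depends on a get_html_from_url() helper that is not shown above.
# A hypothetical sketch, assuming it returns a dict with 'status_code' and 'html'
# keys as scrape() expects (the real helper may differ, e.g. rotate proxies):
import requests

def get_html_from_url(url):
    try:
        r = requests.get(url, headers={'User-Agent': get_ua()}, timeout=5)
        if r.ok:
            return {'status_code': r.status_code, 'html': r.text}
        return {'status_code': None, 'html': None}
    except requests.exceptions.RequestException:
        return {'status_code': None, 'html': None}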