def parse(self, response):
    # print(type(response))
    article = None
    try:
        article = NewsPlease.from_html(response.body.decode("utf-8"))
    except Exception:
        # Fall back to latin-1 for pages that are not valid UTF-8
        article = NewsPlease.from_html(response.body.decode("latin-1"))
        print("EXCEPTION OCCURRED")
    print(article.date_publish)
    # print(article.text)
    # Parse the same page with newspaper's Article for the Spanish full text
    article2 = Article(url="", language="es")
    article2.set_html(response.body)
    article2.parse()
    print(response.url)
    self.db.articles_es.insert({
        "title": article.title,
        "pub_date": article.date_publish,
        "url": response.url,
        "content": article2.text,
        "raw_html": response.body
    })
    # Follow every extracted link with this same callback
    links = self.linkExtractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse)
def custom_crawl_articles():
    urls = []
    counter = 0
    with open('crawled_urlIndex/manual_urls.txt', 'r') as url_file:
        for line in url_file.readlines():
            counter += 1
            print(counter)
            try:
                cdata = {}
                article = NewsPlease.from_url(line.strip())
                purl = urlparse(line)
                cdata['src'] = str(purl.netloc.replace("www.", "").replace(".com", ""))
                cdata['url'] = line.strip()
                # keep the fields as str so jsonify/json_to_csv can serialize them
                cdata['title'] = article.title.strip()
                cdata['og_title'] = article.title.strip()
                cdata['content'] = article.text.strip()
                cdata['lang'] = article.language.strip()
                adate = article.date_publish
                if adate is None:
                    continue
                dateObj = datetime.datetime.strptime(str(adate), "%Y-%m-%d %H:%M:%S")
                publishedDate = dateObj.strftime("%d %b %Y")
                cdata['dateObj'] = dateObj
                cdata['publishedDate'] = publishedDate
                urls.append(cdata)
            except Exception:
                continue
    json_to_csv(urls)
    return jsonify(data=urls)
def get_data(path, destination):
    links_list = set()
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            link = line.split()[0]
            if len(link) < 10:
                continue
            links_list.add(link)
    links_list = list(links_list)
    final_outputs = {}
    important_keys = [
        'authors', 'date_publish', 'description', 'image_url', 'language',
        'title', 'maintext'
    ]
    multiple_index = 200  # batch size passed to NewsPlease.from_urls
    for i in tqdm(range(len(links_list) // multiple_index)):
        keys = links_list[i * multiple_index:(i + 1) * multiple_index]
        values = NewsPlease.from_urls(keys, timeout=6)
        for key, value in values.items():
            paper_data = {}
            for im_key in important_keys:
                # index by the field name, not the URL key
                paper_data[im_key] = value.__dict__[im_key]
            final_outputs[key] = paper_data
    with open(destination, 'wb') as out_file:
        pickle.dump(final_outputs, out_file)
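# Usage sketch (not part of the original snippet): crawl the URLs listed in a
# text file and inspect a few results from the pickled output. The file names
# "links.txt" and "articles.pkl" are illustrative placeholders.
if __name__ == '__main__':
    get_data('links.txt', 'articles.pkl')
    with open('articles.pkl', 'rb') as handle:
        crawled = pickle.load(handle)
    for url, fields in list(crawled.items())[:3]:
        print(url, '->', fields['title'])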
def read(self):
    url_file = open('news_urls.txt', 'r')
    articles = []
    i = 0
    for line in url_file:
        try:
            # strip the trailing newline from the URL
            article = NewsPlease.from_url(line.strip(), timeout=3)
            print(article.title, file=self.title_file)
            article.text = article.text.replace('\n', '')
            print(article.text, file=self.content_file)
            articles.append(article.text)
            sleep(1)
            i += 1
            if i % 10 == 0 and i != 0:
                print(i, 'articles collected so far')
        except Exception:
            print('failure, next article')
    # clean up the workspace
    print(i, 'news articles collected on', self.term, '\n')
    url_file.close()
    os.remove('news_urls.txt')
    return articles
def crawl_newsapi_fulltext(self):
    """
    crawl_newsapi_fulltext enriches existing rows in the Article table that do
    not have fulltext by going to the associated URL, scraping the site, then
    obtaining the fulltext of the article and saving it to the database
    """
    # For article filtering against None (NULL in the database), "is" and "is not" do not work
    articles = Article.query.filter(
        and_(Article.article_url != None,
             Article.article_fulltext == None)).all()
    n = 1
    nmax = 4000  # number of articles to be processed at a time
    for article in articles:
        with suppress(Exception):
            newsplease_article = NewsPlease.from_url(article.article_url)
            article.article_fulltext = newsplease_article.text
            article.article_wordcount = len(newsplease_article.text.split(" "))
            print(n)
            print(article.article_url)
            print(newsplease_article.title)
            print(newsplease_article.text)
            print('-----------------')
            db.session.flush()
            n = n + 1
        if n > nmax:
            break
    db.session.commit()
def handle(self, *args, **options):
    news = RSSNews(RSS_Links)
    telegraph = Telegraph(access_token=os.getenv('TELEGRAPH_ACCESS_TOKEN'))
    if news.urls:
        for url, date in news.urls.items():
            article = NewsPlease.from_url(url)
            a = Article(author=', '.join(article.authors) or 'Anonymous',
                        title=article.title,
                        short_text=article.description,
                        content=article.maintext,
                        date=date,
                        source_link=url,
                        img=article.image_url)
            a.save()
            response = telegraph.create_page(title=a.title,
                                             html_content=a.content)
            TelegraphArticle(title=a.title, link=response['url']).save()
            bot.send_telegraph_msg(response['url'])
    self.stdout.write(self.style.SUCCESS('Success'))
def detectURL():
    data = request.json
    url = data['url']
    try:
        article = NewsPlease.from_url(url)
    except Exception:
        return jsonify([None, None])
    statement = article.title
    justification = article.description
    try:
        subject = data['subject'] if data['subject'] != "" else fnd.get_subject(statement)[0]
    except Exception:
        result = (False, statement, justification)
        return jsonify(result)
    # print(subject)
    if justification is None:
        return jsonify([None, None])
    result = fnd.detect(statement, subject, justification)
    result = result + (statement, justification, subject)
    # print(jsonify(article))
    return jsonify(result)
def classify_texts():
    requestObject = request.get_json()
    theURL = requestObject['urlOfContent']
    articleTitle = ""
    articleMaintext = ""
    try:
        article = NewsPlease.from_url(theURL, timeout=20)
        articleTitle = article.title
        articleMaintext = article.maintext
    except Exception:
        articleTitle = ""
        articleMaintext = ""
    HF_Rating = ""
    try:
        # fakeRating = requests.get('https://huggingface.co/openai-detector/?' + articleMaintext)
        # HF_Rating = fakeRating.json()['real_probability']
        HF_Rating = inferWithHuggingFace(articleMaintext)
    except Exception:
        HF_Rating = ""
    CML_Rating = getMLClassification(inputStr=requestObject['titleOfContent'])
    data_set = {
        "real": HF_Rating,
        "fullText": "" + articleMaintext,
        "real_CML": CML_Rating
    }
    outputJson = json.dumps(data_set)
    resp = make_response(outputJson)
    resp.headers['Access-Control-Allow-Origin'] = '*'
    resp.headers['Access-Control-Allow-Methods'] = 'DELETE, POST, GET, OPTIONS'
    resp.headers['Access-Control-Allow-Headers'] = (
        'Content-Type, Access-Control-Allow-Headers, Authorization, X-Requested-With')
    return resp
def parse(self, response):
    try:
        article = NewsPlease.from_html(response.body, response.url)
        text = article.maintext
        if any(x in text.lower() for x in self.keywords):
            item = ArticleItem()
            item['title'] = article.title
            item['text'] = text
            item['url'] = response.url
            print('Saved', response.url)
            yield item
    except Exception:
        pass
    # Get all the <a> tags
    a_selectors = response.xpath("//a")
    # print('SELECTORS', a_selectors)
    # Loop over each tag
    for selector in a_selectors:
        text = selector.xpath("text()").extract_first()
        link = selector.xpath("@href").extract_first()
        if link is not None:
            if 'https://' not in link:
                link = 'https://news.dartmouth.edu%s' % link
            # print(link)
            request = response.follow(link, callback=self.parse)
            # Yield the follow-up request from the generator
            yield request
def extract_news(link):
    """Extract a news article from the given link.

    Arguments:
        link {string} -- Link of the news article.

    Raises:
        ValueError: Raised if the link is not for ekantipur/onlinekhabar.

    Returns:
        tuple(title, sample_text) -- title: title of the news,
        sample_text: news article text extracted from the given link.
    """
    if 'onlinekhabar.com' in link:
        sample_text = get_content_onlinekhabar(link)
    elif 'ekantipur.com' in link:
        sample_text = get_content_ekantipur(link)
    else:
        raise ValueError(
            'Currently we work with onlinekhabar and ekantipur only. '
            'Other sites will be added soon.')
    article = NewsPlease.from_url(link)
    title = article.title
    return (title, sample_text)
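# Usage sketch (assumed, not from the original source): extract_news only
# supports onlinekhabar and ekantipur links; the URL below is an illustrative
# placeholder.
title, body = extract_news('https://www.onlinekhabar.com/some-article')
print(title)
print(body[:200])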
def extract_content(self, URL):
    """
    This method returns the main content from the given URL
    """
    try:
        # Extract content with Goose
        content = Goose().extract(URL).cleaned_text
        # If the returned content is empty, raise an exception to change the crawler
        if len(content) == 0:
            raise Exception
    except Exception as exception:
        highlight_back(
            "[ContentCrawler] Crawler migrated from Goose to News-Please due to an exception: {}"
            .format(exception), 'G')
        try:
            # Extract content using News-Please
            content = NewsPlease.from_url(URL).text
        except Exception as exception:
            highlight_back(
                "[ContentCrawler] An exception has occurred in News-Please and Lassie method content is empty: {}"
                .format(exception), 'R')
            # content is now empty
            content = ""
    return content
def rss_view(request):
    context = {}
    blog_posts = []
    import feedparser
    from newsplease import NewsPlease

    # Get a list of feed URLs
    with open('feeds.txt') as f:
        rss_urls = list(f)

    for url in rss_urls:
        NewsFeed = feedparser.parse(url)
        for entry in NewsFeed.entries:
            blog = {}
            blog['title'] = entry.title
            blog['link'] = entry.link
            # get content
            article = NewsPlease.from_url(entry.link)
            blog['content'] = article.maintext
            blog['image'] = article.image_url
            blog['date_published'] = article.date_publish
            blog['author'] = ''
            blog['description'] = article.description
            # print("date_publish: ", article.date_publish)
            blog_posts.append(blog)

    context['blog_posts'] = blog_posts
    return render(request, 'personal/rss.html', context)
def scrape(string):
    url = 'https://economictimes.indiatimes.com/topic/' + string
    # Connect to the URL
    response = requests.get(url)
    # Parse HTML and save to a BeautifulSoup object
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    x = soup.findAll('a')
    # To download the whole data set, loop through all 'a' tags (links)
    for i in range(len(x)):
        one_a_tag = x[i]
        if one_a_tag.has_attr('href'):
            link = one_a_tag['href']
            if not link.startswith('http'):
                if link.startswith('/markets'):
                    download_url = 'https://economictimes.indiatimes.com' + link
                    print(download_url)
                    articles.append(NewsPlease.from_url(download_url))
    sentiments = list()
    for article in articles:
        string = article.title + '\n' + article.text
        sentiments.append(sentiment(string))
    return sentiments
def getcontent(self):
    articles = []
    urls5 = self.geturls()
    for i in urls5:
        article = NewsPlease.from_url(i)
        articles.append(article)
    return articles
def get_data(url):
    """
    Extract the data from a specific URL of a news article.

    url : the url of the article that we want to extract information from

    Returns:
        article.title : the title of the article
        article.text : the text block of the article
        article.date_publish : the date the article was published
        article.description : a short description of the article
        article.language : the language in which the article is written
        article.date_modify : the date the article was modified, if it was
        article.url : the url of the article (same as the input url)
    """
    # Try downloading the article
    try:
        article = NewsPlease.from_url(url)
    # Catch HTTPError and return empty values instead
    except urllib.error.HTTPError as err:
        # Print the error code
        print("HTTPError Found: ", err.code)
        return "", "", "", "", "", "", ""
    print("Data Extracted.")
    return (article.title, article.text, article.date_publish,
            article.description, article.language, article.date_modify,
            article.url)
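# Usage sketch (assumed, not from the original source): unpack the 7-tuple
# returned by get_data; the example URL is a placeholder.
title, text, date_publish, description, language, date_modify, url = get_data(
    'https://example.com/some-article')
if title:
    print(title, date_publish, language)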
def filter_record(self, warc_record, article=None):
    passed_filters, article = super().filter_record(warc_record, article)
    if not passed_filters:
        return False, article

    url = warc_record.rec_headers.get_header('WARC-Target-URI')

    def get_lang():
        # Lazily parse the WARC record only if the language is actually needed
        nonlocal article
        if article is None:
            article = NewsPlease.from_warc(warc_record)
        return article.language

    country = detect_country(url, get_lang)
    if not country or not is_european_cc(country):
        return False, article

    # Make sure the article is parsed before annotating it with the country
    if article is None:
        article = NewsPlease.from_warc(warc_record)
    article.country = country

    lang = article.language
    if not lang or not is_european_langcode(lang):
        return False, article

    # TODO: Find COVID-19 mention
    searcher = get_covid_searchers().get(lang)
    if searcher is None:
        return False, article

    def match(key):
        return searcher.match((getattr(article, key) or "").lower().encode("utf-8"))

    if match("title"):
        return True, article
    if match("maintext"):
        return True, article
    return True, article
def news_from_link(ref_link, news_from_globo):
    row = {
        'titulos': [],
        'links': [],
        'noticia': [],
        'image': [],
        'abstract': [],
        'date': []
    }
    article = NewsPlease.from_url(ref_link)
    if article is not None:
        # Data returned by NewsPlease
        row['titulos'].append(article.title)
        row['noticia'].append(article.text)
        row['abstract'].append(article.text)
        row['links'].append(article.url)
        if news_from_globo:
            # We need to get the date from the original URL; the date returned by NewsPlease is wrong
            page_time = urllib.request.urlopen(article.url)
            soup_date = BeautifulSoup(page_time, 'html.parser')
            time_tag = soup_date.find_all('time', attrs={'itemprop': 'datePublished'})
            public_date = time_tag[0].text
            formated_date = format_globo_date(public_date)
            row['date'].append(formated_date)
        else:
            formated_date = str(article.date_publish)
            row['date'].append(formated_date)
        path_image = article.image_url
        if path_image == '' or path_image is None:
            row['image'].append(0)
        else:
            row['image'].append(download_and_move_image(article.image_url))
        news = News(row['abstract'], row['noticia'], row['date'], row['links'],
                    row['titulos'], row['image'])
        try:
            print(row['titulos'])
            news_in_db = seguranca_table.check_news(news)
            print('news_in_db: ' + str(news_in_db))
            if not news_in_db:
                row = pd.DataFrame(row)
                df, categories = seguranca_lexical.lexical_corpus_and_title(row)
                print(categories)
                # DB categories and image
                if categories != [set()]:
                    news.set_categories(categories)
                    seguranca_table.save_news(news)
                    seguranca_post.post_news(df)
        except Exception:
            print('Empty News')
def getNews(link):
    """
    Function to get the news for a certain URL - using the news-please library
    :param link: the URL link for the news
    :return: the content of the news for the link provided
    """
    try:
        """
        first_article = Article(url=link)
        first_article.download()
        first_article.parse()
        text = first_article.text
        """
        article = NewsPlease.from_url(link)
        # we need to remove new lines and quotes, otherwise quilt will fail
        article_no_newlines = article.text.replace('\n', '')
        article_no_quotes = article_no_newlines.replace('"', "'")
        # article = NewsPlease.from_url(link)
        # return article.text
        return article_no_quotes
    except Exception:
        print("An exception occurred while scraping the news:", link)
        # traceback.print_exc()
    return None
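# Usage sketch (assumed): getNews returns the cleaned article text or None on
# failure, so callers should check before using it. The URL is a placeholder.
content = getNews('https://example.com/news/some-story')
if content is not None:
    print(content[:200])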
def url():
    news = []
    error = None
    if request.method == 'POST':
        # article = NewsPlease.from_url('https://economictimes.indiatimes.com/wealth/personal-finance-news/rbi-policy-why-repo-rate-cut-failed-to-cheer/articleshow/71451242.cms')
        data = request.get_json()
        print(data['url'])
        article = NewsPlease.from_url(data['url'])
        news.append({
            "authors": article.authors,
            "date_download": article.date_download,
            "date_modify": article.date_modify,
            "date_publish": article.date_publish,
            "description": article.description,
            "filename": article.filename,
            "image_url": article.image_url,
            "language": article.language,
            "localpath": article.localpath,
            "source_domain": article.source_domain,
            "text": article.text,
            "title": article.title,
            "title_page": article.title_page,
            "title_rss": article.title_rss,
            "url": article.url
        })
    return jsonify(news)
def apacitationforlist(multipleurls):
    mylist = multipleurls.split(",")
    length = len(mylist)
    message = ""
    for x in range(length):
        message = message + str(x + 1) + ". "
        myurl = mylist[x]
        request = requests.get(myurl)
        if request.status_code < 400:
            article = NewsPlease.from_url(mylist[x])
            if article.authors is None or article.title is None:
                message += "There is not enough information to make a citation."
                message += "\n"
            else:
                if len(article.authors) != 0:
                    message += apacitation(article.authors[0], article.title, myurl)
                    message += "\n"
                else:
                    message += "We could not find an author."
                    message += "\n"
        else:
            message += "The website you requested is not available or does not exist."
            message += "\n"
    print(message)
    return message
def url_Contents(url_article):
    article = NewsPlease.from_url(url_article)
    if article.text is None:
        print('None')
        # return an empty string so the function never returns an unbound name
        content = ''
    else:
        content = article.text
    return content
def main():
    article = NewsPlease.from_url(
        'https://www.foxnews.com/politics/house-democrat-subpoenas-mnuchin-irs-for-trumps-tax-returns'
    )
    doc = Document.from_newsplease(article)
    doc = extractor.parse(doc)
    answers = doc.get_top_answer('who').get_parts_as_text()
def extractorFunc(self):
    extract_list = []
    with open("./crawler_urls/" + self.__filename, "r") as f:
        load_list = json.load(f)
    num_news = len(load_list)
    for i in range(num_news):
        news_dict = {}
        if self.__media in load_list[i]['media']:
            extractor = NewsPlease.from_url(load_list[i]['url'])
            news_dict["title"] = load_list[i]['title']
            news_dict["media"] = load_list[i]['media']
            news_dict["date"] = load_list[i]['date']
            news_dict["url"] = load_list[i]['url']
            maintext = extractor.maintext
            if maintext and len(maintext) > 200:
                news_dict["text"] = maintext
            else:
                continue
            extract_list.append(news_dict)
        else:
            continue
        # Flush the collected items to JSON every 10 articles
        if (i + 1) % 10 == 0:
            self.toJson(extract_list)
            extract_list = []
    self.toJson(extract_list)
def filter_record(self, warc_record, article=None):
    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    url_parts = tldextract.extract(url)
    domain = url_parts.registered_domain
    if domain not in STATE_BROADCASTERS:
        return False, article
    country = STATE_BROADCASTERS[domain]

    passed_filters, article = super().filter_record(warc_record, article)
    if not passed_filters:
        return False, article

    if article is None:
        article = NewsPlease.from_warc(warc_record)
    article.country = country

    if not article.language or not is_european_langcode(article.language):
        return False, article

    searcher = get_covid_searchers().get(article.language)
    if searcher is None:
        return False, article

    def match(key):
        return searcher.match((getattr(article, key) or "").lower().encode("utf-8"))

    if not match("title") and not match("maintext"):
        return False, article
    return True, article
def article_generator_text(keyword_query, num_articles):
    text = ''
    for url in search_news(str(keyword_query), num=1, stop=num_articles):
        article = NewsPlease.from_url(str(url))
        if article.text is not None:
            if article.source_domain not in new_list:
                text = text + article.text
    return text
def crawl_page(self, response):
    self.crawl_other_links(response)
    article = NewsPlease.from_html(response.content, url=response.url)
    data = article.get_dict()
    data.pop('maintext')
    yield data
def run_newsplease(htmlstring):
    '''try with newsplease'''
    try:
        article = NewsPlease.from_html(htmlstring, url=None)
        return article.maintext  # sanitize(article.maintext)
    except Exception as err:
        # print('Newsplease exception:', err)
        return ''
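# Usage sketch (assumed): run_newsplease expects a raw HTML string, e.g. one
# read from disk; "page.html" is an illustrative file name.
with open('page.html', 'r', encoding='utf-8') as f:
    extracted = run_newsplease(f.read())
print(len(extracted), 'characters of main text extracted')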
def download_url(url: str) -> int:
    try:
        article = NewsPlease.from_url(urls[url], timeout=10)
        save_obj(article, str(url).zfill(5), text_output_folder)
        return 1
    except Exception as ex:
        print(url, ex)
        return 0
def extract_article(url):
    article = NewsPlease.from_url(url)
    date = article.date_publish
    author, text = extract_author(article.text)
    keywords = extract_keywords(url)
    metadata = author, date, keywords
    return text, metadata
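# Usage sketch (assumed): extract_article returns the article text plus an
# (author, date, keywords) tuple; the URL is a placeholder.
text, (author, date, keywords) = extract_article('https://example.com/article')
print(author, date, keywords)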
def filter_record(self, warc_record, article=None):
    passed_filters, article = super().filter_record(warc_record, article)
    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    canon_url = canonicalize_url(url)
    if canon_url not in all_urls:
        return False, article
    if article is None:
        article = NewsPlease.from_warc(warc_record)
    return True, article
    'Equifax breach': [
        "https://www.wsj.com/articles/equifax-earnings-drop-27-in-quarter-marred-by-cyberattack-1510268187",
        "https://www.bloomberg.com/news/articles/2017-11-14/how-much-will-equifax-pay",
        "https://gizmodo.com/equifax-seized-138-scammy-lookalike-domains-instead-of-1820450580"
    ]
}

for index, topic in enumerate(urls):
    for url in urls[topic]:
        dId = hashlib.sha224(url.encode('utf-8')).hexdigest()
        if not json_exist('data_raw', dId):
            # this is an object
            try:
                article = NewsPlease.from_url(url)
                # this is a dict
                article_dict = article.get_dict()
                # cluster with label and id
                article_dict['category_id'] = index
                article_dict['category'] = topic
                # enhancement for giveme5w
                article_dict['dId'] = dId
                # datetime-not-json-serializable bugfix
                article_dict['date_publish'] = article_dict['date_publish'].isoformat()
                write_json('data_raw', article_dict['dId'], article_dict)
for sentence in sentences:
    if str(sentence["sentence_id"]) == sent1_id:
        sent1 = sentence["sentence"]
    if str(sentence["sentence_id"]) == sent2_id:
        sent2 = sentence["sentence"]
print(sent1_id, ":", sent1)
print(sent2_id, ":", sent2)
print(events[0]['source'], events[0]['target'], events[0]['code'])
val = int(round(10 * similar(sent1, sent2)))
if val not in similar_count:
    similar_count[val] = 0
similar_count[val] = similar_count[val] + 1

from newsplease import NewsPlease
article = NewsPlease.from_url(events[0]['url'])
print(events[0]['url'])
print(article.text)
doc_count += 1
print(doc_count)
print(root_code_match)
print(event_match)
print(similar_count)