def html(self, fileids=None, categories=None): """ Returns the HTML content of each document, cleaning it using the readability-lxml library. """ for doc in self.docs(fileids, categories): try: yield Paper(doc).summary() except Unparseable as e: print("Could not parse HTML: {}".format(e)) continue
def html(self, fileids=None, categories=None): """ Возвращает содержимое HTML каждого документа, очищая его с помощью библиотеки readability-lxml. """ for doc in self.docs(fileids, categories): try: yield Paper(doc).summary() except Unparseable as e: print("Could not parse HTML: {}".format(e)) continue
def html(self, fileids=None, categories=None, readability=True):
    """
    Returns the HTML content from each JSON document for every file in
    the corpus, ensuring that it exists. Note that this simply returns
    the HTML strings; it doesn't perform any parsing of the HTML. If
    readability is True, cleaned HTML is returned.
    """
    # Returns a generator of documents.
    html = self.fields('content', fileids, categories)
    if readability:
        for doc in html:
            try:
                yield Paper(doc).summary()
            except Unparseable as e:
                print("Could not parse HTML: {}".format(e))
    else:
        for doc in html:
            yield doc
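# Minimal standalone sketch of the readability toggle used in html() above:
# given an iterable of raw HTML strings, yield either readability-cleaned HTML
# or the strings untouched. `Paper` and `Unparseable` are assumed to come from
# readability-lxml, as in the methods above; the sample documents are
# illustrative only.
from readability import Document as Paper
from readability.readability import Unparseable

def iter_html(raw_docs, readability=True):
    for doc in raw_docs:
        if not readability:
            yield doc
            continue
        try:
            yield Paper(doc).summary()
        except Unparseable as e:
            print("Could not parse HTML: {}".format(e))

docs = ["<html><body><p>First story.</p></body></html>",
        "<html><body><p>Second story.</p></body></html>"]
for cleaned in iter_html(docs):
    print(cleaned)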
def docs(self, fileids=None, categories=None):
    """
    Returns the events after POS-tagging them.
    """
    # Initialize the POS tagger if necessary - using spacy
    # if self.tagger is None:
    #     self.tagger = spacy.load('fr_core_news_sm')

    # Resolve the fileids and the categories
    fileids = self.resolve(fileids, categories)

    # Retrieve events
    for fileid in fileids:
        paras = []
        event = self.get_event(fileid)
        assert event is not None

        # Build paragraphs
        # Limitation: one paragraph for the description
        paras.append(event['name'])
        html = Paper(event['description']).summary()
        soup = bs4.BeautifulSoup(html, 'lxml')
        for element in soup.find_all(self.htmltags):
            paras.append(element.text)
        soup.decompose()

        # Return tags
        yield [
            [self.tagger.pos_tag(sent) for sent in sent_tokenize(para)]
            for para in paras
        ]
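# Illustrative sketch of the nested structure docs() yields above: a list of
# paragraphs, each a list of sentences, each a list of (token, tag) pairs.
# nltk's pos_tag stands in for the unspecified self.tagger; the nltk 'punkt'
# and tagger models must be downloaded beforehand. The paragraphs are made up.
from nltk import pos_tag, sent_tokenize, word_tokenize

paras = ["Concert at the town hall. Doors open at eight.",
         "Free entry for students."]
tagged = [
    [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(para)]
    for para in paras
]
print(tagged[0][0][:3])  # e.g. [('Concert', 'NN'), ('at', 'IN'), ('the', 'DT')]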
# Assumed imports and module state for the snippet below: `Paper` is taken to
# be readability-lxml's Document class and `done` a module-level cache of
# previously fetched URLs.
import datetime
import re

import pytz
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from readability import Document as Paper

done = {}


def textgetter(url):
    """Scrapes web news and returns the content.

    Parameters
    ----------
    url : str
        web address to news report

    Yields
    ------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str, isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None

    answer = {}
    # check that it is a url
    if s.search(url):
        # return the cached result if this url was already processed
        if url in done:
            yield done[url]
            return
        try:
            r = requests.get(url, verify=False, timeout=1)
        except Exception:
            # the url did not return data; yield empty values and stop
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
            return

        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url

        if len(r.content) > 500:
            article = Article(url)
            article.download(input_html=r.content)
            article.parse()
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                if isinstance(article.publish_date, datetime.datetime):
                    answer['published_date'] = article.publish_date.astimezone(
                        pytz.utc).isoformat()
                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            else:
                # fall back to readability + BeautifulSoup extraction
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
        yield answer
    else:
        answer['author'] = None
        answer['base'] = None  # the url did not match the scheme/host pattern
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        yield answer
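# Small sketch of the provider extraction used in textgetter() above (the `s`
# and `u` patterns): the first regex grabs the scheme plus host, the second
# strips an optional "www." prefix and the top-level domain to leave the
# provider name. The URLs are illustrative.
import re

scheme_host = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
provider_re = re.compile(r"(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")

for sample in ("https://www.example.com/story/123", "http://news.site.org/a"):
    base = scheme_host.search(sample).group()      # e.g. "https://www.example.com"
    provider = provider_re.search(base).group(3)   # e.g. "example"
    print(base, provider)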
# Extended variant of textgetter(): also extracts keywords and a summary via
# newspaper's nlp(), and handles both Python 2 and Python 3 download
# signatures. Relies on the same assumed imports and module-level `done`
# cache as above, plus the standard-library platform module.
import platform


def textgetter(url):
    """Scrapes web news and returns the content.

    Parameters
    ----------
    url : str
        web address to news report

    Yields
    ------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str, isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
            keywords (list) - extracted keywords of article
            summary (str) - extracted summary of article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None

    answer = {}
    # check that it is a url
    if s.search(url):
        # return the cached result if this url was already processed
        if url in done:
            yield done[url]
            return
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except Exception:
            # if the url does not return data, set to empty values and stop
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return

        # if the url does not return successfully, set to empty values
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None

        # if the url content is longer than 500 characters, fill the data
        if len(r.content) > 500:
            # set article url
            article = Article(url)
            # test for python version because of different download parameters
            if int(platform.python_version_tuple()[0]) == 3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0]) == 2:
                article.download(html=r.content)
            # parse the url
            article.parse()
            article.nlp()
            # if the parse pulled enough text, fill the rest of the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                answer['keywords'] = article.keywords
                answer['summary'] = article.summary
                # convert the date to isoformat; exception for naive dates
                if isinstance(article.publish_date, datetime.datetime):
                    try:
                        answer['published_date'] = article.publish_date.astimezone(
                            pytz.utc).isoformat()
                    except Exception:
                        answer['published_date'] = article.publish_date.isoformat()
                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            # if the previous approach didn't work, try the readability library
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])
                # as above, keep the text if it is longer than 200 characters
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
                # if nothing works above, fall back to a BeautifulSoup selector
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
        # if nothing works, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
        yield answer
    # the else clause to catch an invalid url being passed in
    else:
        answer['author'] = None
        answer['base'] = None  # the url did not match the scheme/host pattern
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords'] = None
        answer['summary'] = None
        yield answer
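# Hedged usage sketch for textgetter(): it is a generator, so iterate over it
# (or call next()) to obtain the result. The URL is illustrative; unreachable
# URLs simply yield the "Unable to reach website." placeholder values.
for result in textgetter("https://www.example.com/some-news-story"):
    if isinstance(result, dict):
        print(result["title"], result["url"])
    else:
        # cached entries in `done` may be plain strings rather than dicts
        print(result)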