def parse(self, response):
    # print(type(response))
    article = None
    try:
        article = NewsPlease.from_html(response.body.encode("utf-8"))
    except Exception:
        # Fall back to re-encoding pages that are not valid UTF-8.
        article = NewsPlease.from_html(
            response.body.decode('latin-1').encode("utf-8"))
        print("EXCEPTION OCCURRED")
    print(article.date_publish)
    # print(article.text)

    article2 = Article(url="", language="es")
    article2.set_html(response.body)
    article2.parse()

    print(response.url)
    self.db.articles_es.insert({
        "title": article.title,
        "pub_date": article.date_publish,
        "url": response.url,
        "content": article2.text,
        "raw_html": response.body
    })

    links = self.linkExtractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse)
def parse(self, response):
    try:
        article = NewsPlease.from_html(response.body, response.url)
        text = article.maintext
        if any(x in text.lower() for x in self.keywords):
            item = ArticleItem()
            item['title'] = article.title
            item['text'] = text
            item['url'] = response.url
            print('Saved', response.url)
            yield item
    except Exception:
        pass

    # Get all the <a> tags
    a_selectors = response.xpath("//a")
    # print('SELECTORS', a_selectors)
    # Loop on each tag
    for selector in a_selectors:
        text = selector.xpath("text()").extract_first()
        link = selector.xpath("@href").extract_first()
        if link is not None:
            if 'https://' not in link:
                link = 'https://news.dartmouth.edu%s' % link
            # print(link)
            request = response.follow(link, callback=self.parse)
            # Return it thanks to a generator
            yield request
def run_newsplease(htmlstring):
    '''try with newsplease'''
    try:
        article = NewsPlease.from_html(htmlstring, url=None)
        return article.maintext  # sanitize(article.maintext)
    except Exception as err:
        # print('Newsplease exception:', err)
        return ''
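# A minimal, hedged sketch of the `news-please` call these snippets share:
# NewsPlease.from_html() returns an article object whose title, maintext,
# date_publish and get_dict() are used in the surrounding code. The sample
# HTML and the function name `example_from_html` are illustrative only.
from newsplease import NewsPlease


def example_from_html():
    html = ('<html><head><title>Sample headline</title></head>'
            '<body><article><h1>Sample headline</h1>'
            '<p>First paragraph of the article body.</p></article></body></html>')
    article = NewsPlease.from_html(html, url=None)
    print(article.title)
    print(article.maintext)
    print(article.date_publish)  # may be None when no publish date is found
    print(sorted(article.get_dict().keys()))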
def crawl_page(self, response):
    self.crawl_other_links(response)

    article = NewsPlease.from_html(response.content, url=response.url)
    data = article.get_dict()
    data.pop('maintext')
    yield data
def run_newsplease(htmlstring):
    '''try with newsplease'''
    try:
        article = NewsPlease.from_html(htmlstring, url=None)
        if article.date_publish is None:
            return None
        date = convert_date(article.date_publish, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
        return date
    except Exception as err:
        print('Exception:', err)
        return None
import gzip
import json
from pathlib import Path

from newsplease import NewsPlease


def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        article = NewsPlease.from_html(html, url=None)
        output[item_id] = {'articleBody': article.maintext}
    (Path('output') / 'news_please.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
def get_paragraphs_newsplease(str_text, mode): """ using Newsplease """ try: text_det = NewsPlease.from_html(str_text.encode(), url=None).maintext if text_det is None: list_paragraphs = [""] else: list_paragraphs = re.split("\n", text_det) except: list_paragraphs = [""] return list_paragraphs
def extract(html, url):
    try:
        article = NewsPlease.from_html(html, url=None)
    except newspaper.article.ArticleException as e:
        logger_e.info("{} - {}".format(url, e))
        return {}
    except ValueError as e:
        logger_e.info("{} - {}".format(url, e))
        return {}
    except Exception:
        logger_e.info("{} - {}".format(url, "Unknown error"))
        return {}

    return {
        "title": article.title,
        "maintext": article.maintext,
        "language": article.language,
    }
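# A hedged usage sketch for extract() above: the `requests` download, the URL
# and the function name `example_extract` are assumptions for illustration,
# and `logger_e` is expected to be a module-level logger in the original code.
import requests


def example_extract():
    url = 'https://example.com/news/story'
    resp = requests.get(url, timeout=10)
    record = extract(resp.text, url)
    if record:
        print(record['title'], record['language'])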
def parse(self, response):
    now = datetime.datetime.now()
    article = NewsPlease.from_html(response.text, response.url)
    if article.date_publish is not None and article.text is not None:
        yield NewsEntry(
            full_url=response.url,
            source_domain=article.source_domain,
            date_publish=article.date_publish,
            date_download=str(now),
            title=article.title,
            description=article.description,
            text=article.text,
            # dont_filter=True
        )
    for link in LxmlLinkExtractor(allow=self.allowed_domains).extract_links(response):
        yield Request(link.url, self.parse)
def save_page(self, response):
    # ignore 404s
    if response.status == 404:
        return

    # # make the parent directory
    # url_parts = response.url.split('://')[1].split('/')
    # parent_directory = os.path.join(self.directory, *url_parts)
    # os.makedirs(parent_directory, exist_ok=True)

    # # construct the output filename
    # time = response.meta['wayback_machine_time']
    # if self.unix:
    #     filename = '{0}.snapshot'.format(time.timestamp())
    # else:
    #     filename = '{0}.snapshot'.format(time.strftime(WaybackMachineMiddleware.timestamp_format))
    # full_path = os.path.join(parent_directory, filename)

    # # write out the file
    # with open(full_path, 'wb') as f:
    #     f.write(response.body)

    try:
        # check to make sure I don't already have it
        if bool(db.articles.find_one({'url': response.url})):
            return
        # if I don't, insert
        article = NewsPlease.from_html(response.body, response.url, datetime.today()).__dict__
        if article['date_publish'] and article['title']:
            article['download_via'] = 'wayback'
            # insert to db
            db.articles.insert_one(article)
            print('inserted ' + article['url'])
    except Exception:
        traceback.print_exc()
        print(article['url'])