def scrape_unit(self, unit):
    date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
    hostname = urlparse(unit["url"]).hostname
    publisher = ".".join(hostname.split(".")[-2:])
    title = unit["titel"].strip() or "[No title]"
    article = Article(title=title, text=unit["bericht tekst"], url=unit["url"], date=date)
    article.set_property("author", unit["auteur"])
    article.set_property("publisher", publisher)
    return article
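
# Example unit as expected by scrape_unit above. The keys are taken from the code
# itself; the concrete values are made up for illustration:
#
#   unit = {
#       "datum": "2020-01-31T12:00:00",
#       "url": "https://www.example.nl/artikel/123",
#       "titel": "Voorbeeldtitel",
#       "bericht tekst": "Body text ...",
#       "auteur": "J. Jansen",
#   }
#
# "datum" is parsed as ISO 8601, and the publisher is derived from the last two
# labels of the URL's hostname (here: "example.nl").
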
def _parse_comment(self, comment, base_title, base_url):
    text = html2text(comment.cssselect("p"))
    article_id = comment.get("id")
    title = "{base_title}#{article_id}".format(**locals())
    url = "{base_url}#{article_id}".format(**locals())
    author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())
    article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
    article.set_property("author", author.strip())
    article.set_property("medium", "GeenStijl Comments")
    return article
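
# _parse_comment_footer is not defined here. A minimal sketch, assuming the footer
# holds an author name and a timestamp separated by "|"; only the
# (author, timestamp) return contract is taken from the calling code above.
from dateutil.parser import parse as _parse_dt  # assumption: any date parser will do


def _parse_comment_footer(footer_text):
    """Sketch: split a comment footer into (author, timestamp).

    Assumes a footer such as "username | 01-02-20 | 12:34"; the real layout
    on the site may differ.
    """
    author, _, rest = footer_text.partition("|")
    timestamp = _parse_dt(rest.replace("|", " "), dayfirst=True)
    return author.strip(), timestamp
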
def scrape_unit_meta(self, article_element):
    CONTEXT['unit'] = article_element
    article_html = article_element.get_attribute("outerHTML")
    article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)
    CONTEXT['doc'] = article_doc

    def get_byline_prop(prop):
        for meta_element in article_doc.cssselect(f".nd-article__{prop}"):
            prop_value = meta_element.text_content().strip()
            if prop_value:
                return prop_value
        raise ValueError("Article {} has no property '{}'.".format(title, prop))

    text_url = article_doc.cssselect("a.nd-article__headline-text")[0].get("href")
    url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
    title = article_doc.cssselect("a.nd-article__headline-text")[0].text_content().strip()
    publisher = get_byline_prop("source")

    # The date element's title attribute contains both the load date and the
    # publication date; the part after "Publicatiedatum:" is the publication date,
    # with a trailing ")" that is stripped before parsing.
    date_text = article_doc.cssselect(".nd-article__date")[0].get("title")
    date_parts = date_text.split("Publicatiedatum:")
    pub_date = dutch_strptime(date_parts[-1].strip()[:-1], "%d %b %Y %H:%M")

    article = Article(url=url, title=title, date=pub_date)
    article.set_property("publisher", publisher)
    article.set_property("text_url", text_url)
    # Crashes AmCAT API:
    #article.set_property("pubdate_date", pub_date)

    try:
        author = get_byline_prop("author")
        article.set_property("author", author)
    except ValueError:
        pass
    try:
        article.set_property("wordcount_int", int(get_byline_prop("word-count").split()[0].replace(",", "")))
    except ValueError:
        logging.warning("could not find word count")
    try:
        article.set_property("country", get_byline_prop("source_country"))
    except ValueError:
        pass

    return NewsdeskUnit(article_element, article)
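
# dutch_strptime is not shown here. A minimal sketch, assuming it maps Dutch month
# abbreviations to English before delegating to datetime.strptime; the name and
# the "%d %b %Y %H:%M" format come from the calling code, the mapping is an assumption.
import datetime

_DUTCH_MONTHS = {
    "jan": "Jan", "feb": "Feb", "mrt": "Mar", "apr": "Apr", "mei": "May", "jun": "Jun",
    "jul": "Jul", "aug": "Aug", "sep": "Sep", "okt": "Oct", "nov": "Nov", "dec": "Dec",
}


def dutch_strptime(text, fmt):
    """Sketch: parse a date string containing Dutch month abbreviations."""
    for dutch, english in _DUTCH_MONTHS.items():
        text = text.replace(dutch, english)
    return datetime.datetime.strptime(text, fmt)
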
def scrape_unit_meta(self, article_element):
    article_html = article_element.get_attribute("outerHTML")
    article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)

    def get_byline_prop(prop):
        for meta_element in article_doc.cssselect(".article_byline__element.{}".format(prop)):
            prop_value = meta_element.text_content().strip()
            if prop_value:
                return prop_value
        raise ValueError("Article {} has no property '{}'.".format(title, prop))

    text_url = article_doc.cssselect("a.article_headline")[0].get("href")
    url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
    title = article_doc.cssselect("a.article_headline")[0].text_content().strip()
    publisher = get_byline_prop("source")

    # The byline shows "<harvest date> (gepubliceerd: <publication date>)".
    date = get_byline_prop("harvest_date")
    date, pub_date = date.split("(gepubliceerd: ")
    date = dutch_strptime(date.strip(), "%d %b %Y %H:%M")
    pub_date = dutch_strptime(pub_date.strip()[:-1], "%d %b %Y %H:%M")

    article = Article(url=url, title=title, date=date)
    article.set_property("publisher", publisher)
    article.set_property("text_url", text_url)
    # Crashes AmCAT API:
    #article.set_property("pubdate_date", pub_date)

    try:
        article.set_property("author", get_byline_prop("author"))
    except ValueError:
        pass
    try:
        article.set_property("wordcount_int", int(get_byline_prop("word_count").split()[0]))
    except ValueError:
        pass
    try:
        article.set_property("country", get_byline_prop("source_country"))
    except ValueError:
        pass

    return NewsdeskUnit(article_element, article)
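
# get_newsdesk_article_id is referenced above but not defined here. A minimal
# sketch, assuming the id is the last path segment of the Newsdesk text URL;
# only the "newsdesk://{id}" scheme comes from the code above, the URL layout
# is an assumption.
from urllib.parse import urlparse


def get_newsdesk_article_id(text_url):
    """Sketch: take the last non-empty path segment as the article id (assumed layout)."""
    path = urlparse(text_url).path
    return path.rstrip("/").rsplit("/", 1)[-1]
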
def scrape_unit(self, date_and_article_url):
    date, article_url = date_and_article_url
    log.info("Fetching {}".format(article_url))
    article_doc = self.session.get_html(article_url)
    article_el = article_doc.cssselect("#content > article")

    if not article_el:
        log.error("Could not find article on {article_url}".format(**locals()))
        return None

    title = article_el[0].cssselect("h1")[0].text
    text = html2text(article_el[0].cssselect("p"))
    text = text.strip() or "."

    try:
        footer = article_el[0].cssselect("footer")[0]
    except IndexError as e:
        # Contains <embed> tag which is not closed gracefully :-(
        log.exception(e)
        return None

    author = footer.text.rsplit("|", 1)[0].strip()
    timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))

    if not title:
        return None

    children = self._get_comments(title, article_url, article_doc)

    article = Article(date=timestamp, title=title, text=text)
    article.set_property("author", author)
    article.set_property("url", article_url)
    article.set_property("medium", "GeenStijl")

    return ArticleTree(article, [ArticleTree(c, []) for c in children])
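
# html2text above takes a list of lxml elements and returns plain text. A minimal
# sketch, assuming it simply joins the elements' text content with blank lines;
# the real helper may do more cleanup.
def html2text(elements):
    """Sketch: flatten a list of lxml elements (e.g. <p> tags) into plain text."""
    return "\n\n".join(el.text_content().strip() for el in elements)
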
def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info
    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise
    try:
        text_doc = self.session.get_html(text_url)
    except HTTPError as e:
        if e.response.status_code == 404:
            logging.warning(f"{url} returned 404, skipping")
            return None
        else:
            raise

    # Drop inline images before extracting the body text.
    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # No headline element, or a headline without text.
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"
        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article
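
# strip_query is assumed to drop the query string (and fragment) from the
# redirected URL. A minimal sketch using urllib.parse; the name comes from the
# calling code, the implementation is an assumption.
from urllib.parse import urlsplit, urlunsplit


def strip_query(url):
    """Sketch: return the URL without its query string or fragment."""
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))
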
def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info
    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    text_doc = self.session.get_html(text_url)

    # Drop inline images before extracting the body text.
    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # No headline element, or a headline without text.
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"
        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article
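
# RedirectError is a scraper-specific exception; only the status_code attribute
# is visible from the handlers above. A minimal sketch of what it could look like;
# the constructor signature is an assumption.
class RedirectError(Exception):
    """Sketch: raised when resolving a redirected URL fails; carries the HTTP status code."""

    def __init__(self, status_code, message=""):
        super().__init__(message or "redirect failed with status {}".format(status_code))
        self.status_code = status_code
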
def scrape_unit(self, entry):
    article = Article()
    try:
        section, text = self.get_article_section_text(entry["link"])
    except IndexError:
        return None
    article.set_property("nuid", entry["id"])
    article.set_property("title", entry["title"])
    article.set_property("date", self.parse_date(str(entry["published"])))
    article.set_property("url", entry["link"])
    article.set_property("section", section)
    article.set_property("text", text)
    return article
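
# Example entry as consumed by scrape_unit above. The keys ("id", "title",
# "published", "link") are taken from the code; the feed-style values are made
# up for illustration:
#
#   entry = {
#       "id": "example-feed-item-1",
#       "title": "Voorbeeldartikel",
#       "published": "Fri, 31 Jan 2020 12:00:00 +0100",
#       "link": "https://www.example.nl/artikel/123",
#   }
#
# get_article_section_text(entry["link"]) is expected to return a
# (section, text) pair; an IndexError from it is treated as "no article".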