def test_get_ids(self):
    tree = ArticleTree(Article(id=3), [
        ArticleTree(Article(id=5), []),
        ArticleTree(Article(id=6), [
            ArticleTree(Article(id=7), [])
        ])
    ])
    self.assertEqual({3, 5, 6, 7}, set(tree.get_ids()))
def scrape_unit(self, url):
    reader_url = "about:reader?url={}".format(url)
    doc = self.get_html(reader_url, wait_for="div.content p")

    for tag in REMOVE_TAGS:
        for element in doc.cssselect(tag):
            element.getparent().remove(element)

    article = doc.cssselect("div.content")[0]
    article_html = lxml.html.tostring(article).decode()

    title = doc.cssselect("h1.reader-title")[0].text_content().strip()
    text = html2text(article_html)

    if self.__class__.get_date is not GenericScraper.get_date:
        # Get contents of un-firefox-read-ed article
        self.wait(".reader-toolbar .close-button").click()
        time.sleep(0.3)
        doc_html = self.wait("html").get_attribute("outerHTML")
        doc = lxml.html.fromstring(doc_html, base_url=url)
        try:
            date = self.get_date(doc)
        except NotImplementedError:
            date = self.now
        except Exception as e:
            log.warning("get_date() failed for {} with: {}".format(url, e))
            date = self.now
    else:
        date = self.now

    article = Article(date=date, title=title, text=text, url=url)

    return article
def scrape_unit(self, date_and_article_url):
    date, article_url = date_and_article_url
    log.info("Fetching {}".format(article_url))
    article_doc = self.session.get_html(article_url)

    article_el = article_doc.cssselect("#content > article")
    if not article_el:
        log.error("Could not find article on {article_url}".format(**locals()))
        return None

    title = article_el[0].cssselect("h1")[0].text
    text = html2text(article_el[0].cssselect("p"))
    text = text.strip() or "."

    try:
        footer = article_el[0].cssselect("footer")[0]
    except IndexError as e:
        # Contains <embed> tag which is not closed gracefully :-(
        log.exception(e)
        return None

    author = footer.text.rsplit("|", 1)[0].strip()
    timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))

    if not title:
        return None

    children = self._get_comments(title, article_url, article_doc)
    article = Article(date=timestamp, title=title, text=text)
    article.set_property("author", author)
    article.set_property("url", article_url)
    article.set_property("medium", "GeenStijl")

    return ArticleTree(article, [ArticleTree(c, []) for c in children])
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "section": "\u6f22\u5b57",
        "pagenr": 1928390,
        "headline": "Headline hier.",
        "byline": "byline..",
        "length": 1928,
        "metastring": "Even more strange characters.. \x0C ..",
        "url": "https://example.com",
        "externalid": None,
        "author": None,
        "addressee": "Hmm",
        "text": "Contains invalid char \x08 woo",
        "medium": create_test_medium(name="abc."),
        "project": create_test_project()
    })
    article.save()

    es = ES()
    es.add_articles([article.id])
    hash = get_article_dict(article)["hash"]
    es.flush()

    es_articles = es.query_all(filters={"ids": [article.id]}, fields=HASH_FIELDS + ["hash"])
    es_article = list(es_articles)[0]

    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(_get_hash(es_article.to_dict()), hash)
def create(self, validated_data):
    children = validated_data.pop("children")
    article = Article(**validated_data)

    if article.length is None:
        article.length = word_len(article.text)

    return (article, map(self.create, children))
def _parse_comment(self, comment, base_title, base_url):
    text = html2text(comment.cssselect("p"))
    article_id = comment.get("id")
    title = "{base_title}#{article_id}".format(**locals())
    url = "{base_url}#{article_id}".format(**locals())
    author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())

    article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
    article.set_property("author", author.strip())
    article.set_property("medium", "GeenStijl Comments")

    return article
def parse_file(self, file):
    for doc in split_file(file):
        data = dict(parse_doc(doc))
        art = {}
        for field, setting in self.options['field_map'].items():
            value, typ = setting['value'], setting['type']
            val = data.get(value) if typ == 'field' else value
            if val:
                art[field] = val
        yield Article(**art)
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    # create dummy articles to have something in the db
    [amcattest.create_test_article() for i in range(10)]
    amcates.ES().refresh()

    art = dict(project=amcattest.create_test_project(),
               title="deduptest", text="test", date='2001-01-01')

    a1 = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(_q(title='deduptest'), {a1.id})

    # duplicate articles should not be added
    a2 = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(a2.id, a1.id)
    self.assertTrue(a2._duplicate)
    self.assertEqual(_q(title='deduptest'), {a1.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    a3 = amcattest.create_test_article(articleset=s1, **art)
    amcates.ES().refresh()
    self.assertEqual(a3.id, a1.id)
    self.assertEqual(_q(title='deduptest'), {a1.id})
    self.assertEqual(set(s1.get_article_ids()), {a1.id})
    self.assertEqual(_q(sets=s1.id), {a1.id})

    # if an existing hash is set, it should be correct
    art2 = dict(hash=b'hash', **art)
    self.assertRaises(ValueError, amcattest.create_test_article, **art2)

    # TODO: check duplicates within new articles
    art['title'] = "internaldupe"
    a1, a2 = (Article(**art), Article(**art))
    Article.create_articles([a1, a2], articleset=s1)
    self.assertEqual(a1.id, a2.id)
    self.assertEqual(len(_q(title='internaldupe')), 1)
def create_article(self, art_dict, project):
    art_dict = {k: v for k, v in art_dict.items() if k in COPY_ARTICLE_FIELDS}
    art_dict["project"] = project
    if 'headline' in art_dict and 'title' not in art_dict:
        art_dict['title'] = art_dict.pop('headline')
    art_dict = dict(self._map_es_type(k, v) for k, v in art_dict.items())
    art = Article(**art_dict)
    return art
def scrape_unit(self, unit):
    date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
    hostname = urlparse(unit["url"]).hostname
    publisher = ".".join(hostname.split(".")[-2:])
    title = unit["titel"].strip() or "[No title]"

    article = Article(title=title, text=unit["bericht tekst"], url=unit["url"], date=date)
    article.set_property("author", unit["auteur"])
    article.set_property("publisher", publisher)

    return article
def copy_article(article: Article):
    new = Article(
        project_id=article.project_id,
        date=article.date,
        title=article.title,
        url=article.url,
        # text=article.text <-- purposely omit text!
        # hash=article.hash <-- purposely omit hash!
        parent_hash=article.parent_hash)
    new.properties.update(article.properties)
    return new
def scrape_unit_meta(self, article_element):
    CONTEXT['unit'] = article_element
    article_html = article_element.get_attribute("outerHTML")
    article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)
    CONTEXT['doc'] = article_doc

    def get_byline_prop(prop):
        for meta_element in article_doc.cssselect(f".nd-article__{prop}"):
            prop_value = meta_element.text_content().strip()
            if prop_value:
                return prop_value
        raise ValueError("Article {} has no property '{}'.".format(title, prop))

    text_url = article_doc.cssselect("a.nd-article__headline-text")[0].get("href")
    url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
    title = article_doc.cssselect("a.nd-article__headline-text")[0].text_content().strip()
    publisher = get_byline_prop("source")

    date_text = article_doc.cssselect(".nd-article__date")[0].get("title")
    date_parts = date_text.split("Publicatiedatum:")
    pub_date = dutch_strptime(date_parts[-1].strip()[:-1], "%d %b %Y %H:%M")
    load_date = dutch_strptime(date_parts[1].strip(), "%d %b %Y %H:%M")

    article = Article(url=url, title=title, date=pub_date)
    article.set_property("publisher", publisher)
    article.set_property("text_url", text_url)
    # Crashes AmCAT API:
    # article.set_property("pubdate_date", pub_date)

    try:
        author = get_byline_prop("author")
        article.set_property("author", author)
    except ValueError:
        pass

    try:
        article.set_property("wordcount_int", int(get_byline_prop("word-count").split()[0].replace(",", "")))
    except ValueError:
        logging.warning("could not find word count")

    try:
        article.set_property("country", get_byline_prop("source_country"))
    except ValueError:
        pass

    return NewsdeskUnit(article_element, article)
def set_up(self):
    self.aset = amcattest.create_test_set()
    self.asets = ArticleSet.objects.filter(id__in=[self.aset.id])
    self.project = self.aset.project

    self.a1 = Article(
        title="Man leeft nog steeds in de gloria",
        text="Gezongen vloek op verjaardag maakt leven van man tot een vrolijke hel.",
        date=datetime.datetime(2017, 1, 2, 23, 22, 11),
        author="Rudolf Julius",
        publisher="De Speld",
        project=self.project,
        exists="Once",
        page_int=5,
        section_int=10,
        tags_tag={"gloria", "vloek"},
        html="Man <i>leeft</i> nog steeds in de gloria"
    )

    self.a2 = Article(
        title="VVD trots op opkomende zon",
        text="Kabinetsbeleid om geen parasol over Nederland te zetten betaalt zich uit",
        date=datetime.datetime(2016, 12, 14, 15, 13, 12),
        author="Thomas Hogeling",
        publisher="De Speld",
        project=self.project,
        page_int=5,
        section_int=11,
        tags_tag={"vvd", "nederland", "speld"}
    )

    Article.create_articles([self.a1, self.a2], articleset=self.aset)
    amcates.ES().refresh()
    self.qs = ESQuerySet(self.asets)
def scrape_unit(self, entry):
    article = Article()

    try:
        section, text = self.get_article_section_text(entry["link"])
    except IndexError:
        return None

    article.set_property("nuid", entry["id"])
    article.set_property("title", entry["title"])
    article.set_property("date", self.parse_date(str(entry["published"])))
    article.set_property("url", entry["link"])
    article.set_property("section", section)
    article.set_property("text", text)

    return article
def _scrape_unit(self, row):
    row = {k: v.decode("utf-8") for k, v in row.iteritems()}

    query = row.pop('zoekopdracht')
    self.queries.add(query)

    medium = Medium.get_or_create(row.pop('type bron'))
    date = row.pop('datum')
    date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M")
    headline = row.pop('titel')
    pagenr = row.pop('bereik') or None
    text = row.pop('bericht tekst')
    url = row.pop('url')
    author = row.pop('auteur')
    metastring = json.dumps(row)

    a = Article(headline=headline, pagenr=pagenr, text=text, date=date,
                medium=medium, url=url, author=author, metastring=metastring)
    yield a
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "title": "\u6f22\u5b57",
        "text": "Even more strange characters.. \x0C and \x08 woo?",
        "url": "https://example.com",
        "project": create_test_project()
    })

    hash = get_article_dict(article)['hash']
    Article.create_articles([article], articleset=amcattest.create_test_set())
    ES().refresh()

    es_articles = ES().query_all(filters={"ids": [article.id]}, fields=["hash"])
    es_article = list(es_articles)[0]

    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(hash, article.hash)
def _scrape_unit(self, document):
    article = Article()
    metadata = list(META)

    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")

    # Check for author field. If present: remove from metadata
    # fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        article.author = author_field.text_content().lstrip("Von").strip()
        divs.remove(author_field)

    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]

    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(metadata, divs):
        if field_name is None:
            continue
        processor = PROCESSORS.get(field_name, lambda x: x)
        text_content = element.text_content().strip()
        setattr(article, field_name, processor(text_content))

    # Fetch text, which is the concatenation of all paragraphs
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    article.text = ("\n\n".join(paragraphs)).strip()

    # We must return an iterable, so we return a one-tuple
    return (article,)
def get_articles(self, fn, media):
    csv.field_size_limit(sys.maxsize)

    def _int(x):
        return int(x) if x else None

    def hash2binary(hash):
        if hash:
            if not isinstance(hash, str):
                raise TypeError("Hash should be str, not {}".format(type(hash)))
            return "\\x" + hash

    r = csv.reader(open(fn))
    header = next(r)
    index = {col: i for (i, col) in enumerate(header)}
    AID = index['article_id']

    if self.maxid:
        logging.info("*** max(id) set by user: {self.maxid}".format(**locals()))
        max_id, self.n_rows = self.maxid, self.maxid
    else:
        logging.info("*** Scan input CSV to determine #rows and max(id)")
        max_id = 0
        for row in r:
            max_id = max(max_id, int(row[AID]))
            self.n_rows += 1
            if not self.n_rows % 10000000:
                logging.info(".. scanned {self.n_rows} rows".format(**locals()))

    logging.info("{self.n_rows} rows, max ID {max_id}, allocating memory for hashes".format(**locals()))
    hashes = ctypes.create_string_buffer(max_id * 28)
    NULL_HASH = b'\x00' * 28
    orphans = "N/A"  # sentinel so the first pass always runs
    passno = 1

    if self._continue:
        logging.info("Continuing from previous migration, getting state from DB")
        c = conn().cursor('migration-continue')
        c.itersize = 10000  # how many records to buffer on the client
        c.execute("SELECT article_id, hash FROM articles")
        i = 0
        while True:
            rows = c.fetchmany(10000)
            if not rows:
                break
            i += len(rows)
            if not i % 1000000:
                logging.info("Retrieved {i} rows...".format(**locals()))
            for (aid, hash) in rows:
                offset = (aid - 1) * 28
                hashes[offset:offset + 28] = hash
        self.n_rows -= i
        logging.info("Continuing migration, {i} articles retrieved, {self.n_rows} to go".format(**locals()))

    while orphans:
        logging.info("*** Pass {passno}, #orphans {orphans}".format(**locals()))
        passno += 1
        orphans = 0
        r = csv.reader(open(fn))
        next(r)  # skip header
        for row in r:
            aid = int(row[AID])
            offset = (aid - 1) * 28
            stored_hash = hashes[offset:offset + 28]
            if stored_hash != NULL_HASH:
                continue

            parent_id = _int(row[index['parent_article_id']])
            if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                parent_id = None
            if parent_id:
                poffset = (parent_id - 1) * 28
                parent_hash = hashes[poffset:poffset + 28]
                if parent_hash == NULL_HASH:
                    orphans += 1
                    continue
                parent_hash = binascii.hexlify(parent_hash).decode("ascii")
            else:
                parent_hash = None

            date = row[index['date']]
            date = date.split("+")[0]
            date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

            a = Article(project_id=row[index['project_id']],
                        date=date,
                        title=row[index['headline']],
                        url=row[index['url']] or None,
                        text=row[index['text']],
                        parent_hash=parent_hash)
            a.properties = {v: row[index[v]] for v in PROP_FIELDS if row[index[v]]}
            a.properties['medium'] = media[int(row[index['medium_id']])]
            a.properties['uuid'] = str(a.properties['uuid'])
            props = json.dumps(a.properties)

            hash = amcates.get_article_dict(a)['hash']
            hashes[offset:offset + 28] = binascii.unhexlify(hash)

            yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                   hash2binary(hash), hash2binary(a.parent_hash), props)
def _scrape_unit(self, row):
    self.queries.add(row[self.lang.query])
    art = self.map_article(row)
    a = Article(**art)
    return a
def scrape_unit(self, article_info: ArticleTuple):
    date, page_num, url = article_info

    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    try:
        text_doc = self.session.get_html(text_url)
    except HTTPError as e:
        if e.response.status_code == 404:
            logging.warning(f"{url} returned 404, skipping")
            return None
        else:
            raise

    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):  # no usable headline
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New header style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"
        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")
    if section:
        article.set_property("section", section.strip())

    return article
def _scrape_unit(self, unit):
    tweets = ["list", "from", "api"]
    for tweet in tweets:
        yield Article(text=tweet, headline=tweet, date='2010-01-01')
def scrape_unit(self, unit: TelegraafUnit):
    return Article(title=unit.title, url=unit.url, text=unit.text,
                   date=unit.date, pagerange=unit.page_range)
def scrape_unit(self, unit: NRCUnit):
    m = re.match(r"https://www.nrc.nl/nieuws/(\d{4})/(\d{2})/(\d{2})/", unit.url)
    if not m:
        logging.warning(f"Invalid URL: {unit.url}")
        return None
    year = int(m.group(1))
    month = int(m.group(2))
    day = int(m.group(3))
    online_date = datetime(year, month, day)

    try:
        html = self.session.get_content(unit.url)
    except HTTPError as e:
        if e.response.status_code == 404:
            logging.warning(f"No article found for {unit.url}")
            return  # some articles don't exist, e.g. cartoons without text
        raise

    doc = lxml.html.fromstring(html, base_url=unit.url)

    intro = doc.cssselect("div.intro")
    if not intro:
        logging.debug(f"Invalid intro: {unit.url}")
        intro = ""
    else:
        intro2 = intro[0].text_content()

    headline = doc.cssselect(".article-header-container h1")
    if not headline:
        headline2 = "-"
        logging.warning(f"No headline {unit.url}")
    else:
        headline2 = headline[0].text_content()
        if not headline2:
            headline2 = "-"
            logging.warning(f"Empty headline {unit.url}")

    author = doc.cssselect("ul.article__byline__text.unstyled a")
    if not author:
        logging.debug(f"Invalid author: {unit.url}")
        author2 = ""
    else:
        author2 = author[0].text_content()

    text = doc.cssselect("div.article__content")
    if not text:
        text = doc.cssselect("div.article__header-and-content")
    text2 = text[0].text_content()
    text2 = re.sub(r"\s*\n\s*", "\n\n", text2).strip()
    text2 = re.sub(r"[ \t]+", " ", text2).strip()

    if intro:
        text3 = f"{intro2},{text2}"
    else:
        text3 = f"{text2}"

    article = dict(date=unit.date, online_date=online_date, title=headline2, text=text3,
                   url=unit.url, pdf_url=unit.pdf, page_tag=unit.pages,
                   section_tag=unit.sections, raw_html=html, author=author2)
    if unit.image is not None:
        article["image_url"] = unit.image
    return Article(**article)
def scrape_unit(self, unit: EPagesUnit):
    return Article(title=unit.title, url=unit.url, text=unit.text,
                   pagenr_int=unit.page, date=unit.date)
def json_to_article(article: Dict[str, Any], project: Project) -> Article:
    article = Article(project=project, **article)
    article.compute_hash()
    return article
def test_default(self):
    self.assertEqual(PropertyMapping, type(Article().properties))
def parse_document(self, paragraphs):
    metadata, text = parse_page(paragraphs)
    metadata["medium"] = Medium.get_or_create(metadata["medium"])
    return Article(text=text, **metadata)
def test_highlight_fragments(self):
    self.set_up()

    articleset = amcattest.create_test_set()
    project = articleset.project

    text = """
    The Alderman Proctor's Drinking Fountain (grid reference ST566738) is a historic
    building on Clifton Down, Bristol, England.

    The city of Bristol began supplying municipal drinking water in 1858. To inform the
    public about the new water supply, Robert Lang made a proposal though the Bristol
    Times that public drinking fountains be constructed. Lang began the "Fountain Fund"
    in January 1859 with a donation of one hundred pounds. By 1906, there were more than
    40 public drinking fountains throughout the city. In 1872, Alderman Thomas Proctor
    commissioned the firm of George and Henry Godwin to build the fountain to commemorate
    the 1861 presentation of <i>Clifton Down</i> to the City of Bristol by the Society of
    Merchant Venturers.

    **Commemorative plaque**

    The three-sided fountain is done in Gothic Revival style. The main portion is of
    limestone with pink marble columns and white marble surround. The commemorative
    plaque is of black lettering on white marble; the plaque reads, "Erected by Alderman
    Thomas Proctor, of Bristol to record the liberal gift of certain rights on Clifton
    Down made to the citizens by the Society of Merchant Venturers under the provision of
    the Clifton and Drudham Downs Acts of Parliament, 1861, whereby the enjoyment of
    these Downs is preserved to the citizens of Bristol for ever." The fountain bears the
    coat of arms for the city of Bristol, the Society of Merchant Venturers and that of
    Alderman Thomas Proctor.

    The fountain was originally situated at the head of Bridge Valley Road. It became a
    sight impediment to modern auto traffic in the later 20th century. The fountain was
    moved to the other side of the road, closer to the Mansion House in 1987. After the
    move, it underwent restoration and was re-dedicated on 1 May 1988. It has been
    designated by English Heritage as a grade II listed building since 1977.
    """

    paragraphs = [" ".join(s.strip() for s in p.strip().split("\n")) for p in text.split("\n\n")]

    long_article = Article(
        title="Alderman Proctor's Drinking Fountain",
        text="\n\n".join(paragraphs).strip(),
        date=datetime.datetime(2017, 1, 18, 13, 29, 11),
        url="https://en.wikipedia.org/wiki/Alderman_Proctor%27s_Drinking_Fountain",
        publisher="Wikipedia",
        project=project
    )

    Article.create_articles([long_article], articleset)
    amcates.ES().refresh()

    qs = ESQuerySet(ArticleSet.objects.filter(id=articleset.id))
    fragments = qs.highlight_fragments('"Clifton Down"', ("text", "title"), fragment_size=50)

    self.assertEqual(1, len(qs))
    self.assertEqual(1, len(fragments))

    fragments = next(iter(fragments.values()))
    text_fragments = set(fragments["text"])
    title_fragments = fragments["title"]

    self.assertEqual(1, len(title_fragments))
    self.assertNotIn("<mark>", title_fragments[0])

    self.assertEqual(3, len(text_fragments))
    self.assertEqual(text_fragments, {
        ' presentation of <i><mark>Clifton</mark> <mark>Down</mark></i> to the City of Bristol',
        ' <mark>Clifton</mark> <mark>Down</mark>, Bristol, England.\n\nThe city of Bristol',
        ' the liberal gift of certain rights on <mark>Clifton</mark> <mark>Down</mark> made'
    })