def article_to_json(article: Article):
    static_fields = article.static_fields() - {"id", "project_id", "project", "properties"}
    static_fields = {fn: getattr(article, fn) for fn in static_fields}
    return dict(static_fields, properties=dict(article.get_properties().items()))
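# The dict(static_fields, properties=...) expression above merges the model's
# static fields with its dynamic properties in one call. A self-contained
# illustration of that merge semantics (plain Python, no AmCAT needed; the
# field values here are made up):
static = {"title": "Example", "date": "2010-12-31"}
merged = dict(static, properties={"author": "A. Author"})
assert merged == {"title": "Example", "date": "2010-12-31",
                  "properties": {"author": "A. Author"}}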
def test_list_media(self):
    """Test that list media works for more than 10 media"""
    from amcat.models import Article
    media = [amcattest.create_test_medium() for _ in range(20)]
    arts = [amcattest.create_test_article(medium=m, create=False) for m in media]

    s1 = amcattest.create_test_set()
    Article.create_articles(arts[:5], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s1.get_mediums()), set(media[:5]))

    s2 = amcattest.create_test_set(project=s1.project)
    Article.create_articles(arts[5:], articleset=s2, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s2.get_mediums()), set(media[5:]))
    self.assertEqual(set(s1.project.get_mediums()), set(media))
def setup(self):
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    a = amcattest.create_test_article(text='aap noot mies', title='m1', date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', title='m2', date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', title='m2', date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', title='m2', date='2001-02-03', create=False)
    e = amcattest.create_test_article(text='aap noot mies', title='m3', articleset=s2)
    Article.create_articles([a, b, c, d], articleset=s1)
    ES().refresh()
    return s1, s2, a, b, c, d, e
def test_aggregate(self):
    """Can we make tables per medium/date interval?"""
    from amcat.models import Article
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()

    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="mediumid")),
                     {m1.id: 1, m2.id: 3})
    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="year")),
                     {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1})
    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="month")),
                     {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1, datetime(2001, 2, 1): 2})

    # set statistics
    stats = ES().statistics(filters=dict(sets=s1.id))
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime(2002, 1, 1))

    # media list
    self.assertEqual(set(ES().list_media(filters=dict(sets=s1.id))), {m1.id, m2.id})
def test_get_articles(self):
    from amcat.models import Sentence
    _get_articles = lambda a, s: list(get_articles(a, s))

    # Should raise exception if sentences not in article
    article, sentences = self.create_test_sentences()
    s1 = Sentence.objects.filter(id=amcattest.create_test_sentence().id)
    self.assertRaises(ValueError, _get_articles, article, s1)

    # Should raise an exception if we try to split on headline
    self.assertRaises(ValueError, _get_articles, article, sentences.filter(parnr=1))

    # Should return a "copy", with byline in "text" property
    arts = _get_articles(article, Sentence.objects.none())
    Article.create_articles(arts)
    self.assertEqual(len(arts), 1)

    sbd.create_sentences(arts[0])
    self.assertEqual(
        [s.sentence for s in sentences[1:]],
        [s.sentence for s in arts[0].sentences.all()[1:]]
    )
    self.assertTrue("foo" in arts[0].text)

    # Should be able to split on byline
    self.assertEqual(2, len(_get_articles(article, sentences[1:2])))
    a, b = _get_articles(article, sentences[4:5])

    # Check that the text of the split articles contains what we expect
    self.assertTrue("Einde" not in a.text)
    self.assertTrue("Einde" in b.text)
def _run(self, local_project, remote_host, remote_token, remote_project_id, remote_articleset_id):
    try:
        page_size = 1000
        query = RemoteQuery(remote_host, remote_token, remote_project_id, remote_articleset_id,
                            page_size=page_size)

        # Copy the remote articleset's metadata into a new local set
        set = {k: v for k, v in query.get_articleset().items() if k in COPY_SET_FIELDS}
        set.update(project=local_project)
        set = ArticleSet.objects.create(**set)

        for page in query:
            articles_hashes = [(self.create_article(x, local_project), x["hash"]) for x in page]
            hashmap = {old_hash: article.hash for article, old_hash in articles_hashes}
            articles, _ = zip(*articles_hashes)
            articles = list(articles)
            # Remap parent references from remote hashes to local hashes
            for article in articles:
                if article.parent_hash in hashmap:
                    article.parent_hash = hashmap[article.parent_hash]
            Article.create_articles(articles, articleset=set)
        return set.id
    except APIError as e:
        self.handleError(e)
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "section": "\u6f22\u5b57",
        "pagenr": 1928390,
        "headline": "Headline hier.",
        "byline": "byline..",
        "length": 1928,
        "metastring": "Even more strange characters.. \x0C ..",
        "url": "https://example.com",
        "externalid": None,
        "author": None,
        "addressee": "Hmm",
        "text": "Contains invalid char \x08 woo",
        "medium": create_test_medium(name="abc."),
        "project": create_test_project()
    })
    article.save()

    es = ES()
    es.add_articles([article.id])
    hash = get_article_dict(article)["hash"]
    es.flush()

    es_articles = es.query_all(filters={"ids": [article.id]}, fields=HASH_FIELDS + ["hash"])
    es_article = list(es_articles)[0]
    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(_get_hash(es_article.to_dict()), hash)
def create(self, validated_data):
    children = validated_data.pop("children")
    article = Article(**validated_data)
    if article.length is None:
        article.length = word_len(article.text)
    return (article, map(self.create, children))
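# Note that create() above returns the parent together with a *lazy* map()
# over its children: in Python 3 nothing is instantiated until iterated.
# A minimal, hypothetical helper (not part of the original code) showing how
# a caller might walk such (article, children) pairs into a flat sequence:
def flatten_pairs(pair):
    node, children = pair
    yield node
    for child in children:
        yield from flatten_pairs(child)

pairs = ("root", iter([("child", iter([]))]))
assert list(flatten_pairs(pairs)) == ["root", "child"]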
def scrape_unit(self, unit):
    date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
    hostname = urlparse(unit["url"]).hostname
    publisher = ".".join(hostname.split(".")[-2:])
    title = unit["titel"].strip() or "[No title]"
    article = Article(title=title, text=unit["bericht tekst"], url=unit["url"], date=date)
    article.set_property("author", unit["auteur"])
    article.set_property("publisher", publisher)
    return article
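# The publisher above is derived by keeping the last two labels of the URL's
# hostname. A self-contained illustration of that expression (note the
# heuristic would yield "co.uk"-style results for multi-part public suffixes):
from urllib.parse import urlparse
hostname = urlparse("https://www.example.com/some/article").hostname
assert ".".join(hostname.split(".")[-2:]) == "example.com"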
def test_get_ids(self):
    tree = ArticleTree(
        Article(id=3), [
            ArticleTree(Article(id=5), []),
            ArticleTree(Article(id=6), [
                ArticleTree(Article(id=7), [])
            ])
        ]
    )
    self.assertEqual({3, 5, 6, 7}, set(tree.get_ids()))
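# A minimal sketch of the depth-first traversal this test exercises, assuming
# ArticleTree holds an article plus a list of child trees (stand-in classes
# only; the real ArticleTree lives in amcat and may differ):
from types import SimpleNamespace

class TreeSketch:
    def __init__(self, article, children):
        self.article, self.children = article, children

    def get_ids(self):
        yield self.article.id            # own id first...
        for child in self.children:
            yield from child.get_ids()   # ...then all descendants

tree = TreeSketch(SimpleNamespace(id=3), [TreeSketch(SimpleNamespace(id=5), [])])
assert set(tree.get_ids()) == {3, 5}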
def test_create(self):
    """Can we create/store/index an article object?"""
    a = amcattest.create_test_article(create=False, date='2010-12-31', headline=u'\ua000abcd\u07b4')
    Article.create_articles([a], create_id=True)
    db_a = Article.objects.get(pk=a.id)
    amcates.ES().flush()
    es_a = list(amcates.ES().query(filters={'ids': [a.id]}, fields=["date", "headline"]))[0]

    self.assertEqual(a.headline, db_a.headline)
    self.assertEqual(a.headline, es_a.headline)
    self.assertEqual('2010-12-31T00:00:00', db_a.date.isoformat())
    self.assertEqual('2010-12-31T00:00:00', es_a.date.isoformat())
def _parse_comment(self, comment, base_title, base_url):
    text = html2text(comment.cssselect("p"))
    article_id = comment.get("id")
    title = "{base_title}#{article_id}".format(**locals())
    url = "{base_url}#{article_id}".format(**locals())
    author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())

    article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
    article.set_property("author", author.strip())
    article.set_property("medium", "GeenStijl Comments")
    return article
def test_create_order(self):
    """Is insert order preserved in id order?"""
    articles = [amcattest.create_test_article(create=False) for _i in range(25)]
    random.shuffle(articles)
    Article.create_articles(articles)
    ids = [a.id for a in articles]

    # is order preserved?
    self.assertEqual(ids, sorted(ids))

    # do the right articles have the right title?
    for saved in articles:
        indb = Article.objects.get(pk=saved.id)
        self.assertEqual(indb.title, saved.title)
def create(self, validated_data):
    articleset = self.get_articleset()
    if 'id' in validated_data:
        _check_read_access(self.context['request'].user, [validated_data['id']])
        article = Article.objects.get(pk=validated_data['id'])
        articleset.add_articles([article])
    else:
        article = json_to_article(validated_data, articleset.project)
        Article.create_articles([article], articleset=articleset)
    return article
def test_query_all(self):
    """Test that query_all works"""
    from amcat.models import Article
    arts = [amcattest.create_test_article(create=False) for _ in range(20)]
    s = amcattest.create_test_set()
    Article.create_articles(arts, articleset=s, check_duplicate=False)
    ES().flush()

    r = ES().query(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), 10)

    r = ES().query_all(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), len(arts))
def test_create(self):
    """Can we create/store/index an article object?"""
    a = amcattest.create_test_article(create=False, date='2010-12-31', title=u'\ua000abcd\u07b4')
    Article.create_articles([a])
    db_a = Article.objects.get(pk=a.id)
    amcates.ES().refresh()
    es_a = list(amcates.ES().query(filters={'ids': [a.id]}, _source=["date", "title", "hash"]))[0]

    self.assertEqual(a.hash, db_a.hash)
    self.assertEqual(a.hash, es_a.hash)
    self.assertEqual(a.title, db_a.title)
    self.assertEqual(a.title, es_a.title)
    self.assertEqual('2010-12-31T00:00:00', db_a.date.isoformat())
    self.assertEqual('2010-12-31T00:00:00', es_a.date.isoformat())
def test_query_all(self):
    """Test that query_all works"""
    from amcat.models import Article
    arts = [amcattest.create_test_article(create=False) for _ in range(20)]
    s = amcattest.create_test_set()
    Article.create_articles(arts, articleset=s, check_duplicate=False, create_id=True)
    ES().flush()

    r = ES().query(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), 10)

    r = ES().query_all(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), len(arts))
def setup(self):
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)
    Article.create_articles([a, b, c, d], articleset=s1)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def setup(self):
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)
    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def test_add_many(self):
    """Can we add a large number of articles from one set to another?"""
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    p = amcattest.create_test_project()
    arts = [amcattest.create_test_article(project=p, create=False) for _x in range(1213)]
    Article.create_articles(arts, s)
    ES().refresh()
    self.assertEqual(len(arts), s.get_count())

    s2.add_articles(arts)
    ES().refresh()
    self.assertEqual(len(arts), s2.get_count())
def scrape_unit(self, date_and_article_url):
    date, article_url = date_and_article_url
    log.info("Fetching {}".format(article_url))
    article_doc = self.session.get_html(article_url)

    article_el = article_doc.cssselect("#content > article")
    if not article_el:
        log.error("Could not find article on {article_url}".format(**locals()))
        return None

    title = article_el[0].cssselect("h1")[0].text
    text = html2text(article_el[0].cssselect("p"))
    text = text.strip() or "."

    try:
        footer = article_el[0].cssselect("footer")[0]
    except IndexError as e:
        # Contains <embed> tag which is not closed gracefully :-(
        log.exception(e)
        return None

    author = footer.text.rsplit("|", 1)[0].strip()
    timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))

    if not title:
        return None

    children = self._get_comments(title, article_url, article_doc)
    article = Article(date=timestamp, title=title, text=text)
    article.set_property("author", author)
    article.set_property("url", article_url)
    article.set_property("medium", "GeenStijl")

    return ArticleTree(article, [ArticleTree(c, []) for c in children])
def run(self):
    monitor = self.progress_monitor
    filename = self.options['filename']
    file_shortname = os.path.split(self.options['filename'])[-1]
    monitor.update(10, u"Importing {self.__class__.__name__} from {file_shortname} into {self.project}"
                   .format(**locals()))

    articles = []
    encoding = self.options['encoding']
    files = list(self._get_files(filename, encoding))
    nfiles = len(files)
    for i, (file, encoding, data) in enumerate(files):
        monitor.update(20 / nfiles, "Parsing file {i}/{nfiles}: {file}".format(**locals()))
        articles += list(self.parse_file(file, encoding, data))

    for article in articles:
        _set_project(article, self.project)

    if self.errors:
        raise ParseError(" ".join(map(str, self.errors)))
    monitor.update(10, "All files parsed, saving {n} articles".format(n=len(articles)))

    Article.create_articles(articles, articleset=self.get_or_create_articleset(),
                            monitor=monitor.submonitor(40))

    if not articles:
        raise Exception("No articles were imported")

    monitor.update(10, "Uploaded {n} articles, post-processing".format(n=len(articles)))
    aset = self.options["articleset"]
    new_provenance = self.get_provenance(file, articles)
    aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
    aset.save()

    if getattr(self, 'task', None):
        self.task.log_usage("articles", "upload", n=len(articles))

    monitor.update(10, "Done! Uploaded {n} articles".format(n=len(articles)))
    return self.options["articleset"]
def test_add_many(self):
    """Can we add a large number of articles from one set to another?"""
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    p = amcattest.create_test_project()
    arts = [amcattest.create_test_article(project=p, medium=m, create=False) for _x in range(1213)]
    Article.create_articles(arts, s, create_id=True)
    ES().flush()
    self.assertEqual(len(arts), s.get_count())

    s2.add_articles(arts, monitor=ProgressMonitor())
    ES().flush()
    self.assertEqual(len(arts), s2.get_count())
def handle_split(form, project, article, sentences):
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add split articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add split articles to the sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def run(self, scraper):
    try:
        units = list(scraper._get_units())
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("scraper._get_units failed")
        return self.articles

    for i, unit in enumerate(units):
        try:
            articles = list(scraper._scrape_unit(unit))
        except Exception as e:
            log.exception("scraper._scrape_unit failed")
            self.errors.append(ScrapeError(i, unit, e))
            continue
        self.articles += articles

    for article in self.articles:
        _set_default(article, 'project', scraper.project)

    try:
        articles, errors = Article.create_articles(self.articles, scraper.articleset)
        self.saved_article_ids = {getattr(a, "duplicate_of", a.id) for a in self.articles}
        for e in errors:
            self.errors.append(ScrapeError(None, None, e))
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("Article.create_articles failed")

    return self.saved_article_ids
def scrape_unit(self, url):
    reader_url = "about:reader?url={}".format(url)
    doc = self.get_html(reader_url, wait_for="div.content p")
    for tag in REMOVE_TAGS:
        for element in doc.cssselect(tag):
            element.getparent().remove(element)
    article = doc.cssselect("div.content")[0]
    article_html = lxml.html.tostring(article).decode()

    title = doc.cssselect("h1.reader-title")[0].text_content().strip()
    text = html2text(article_html)

    if self.__class__.get_date is not GenericScraper.get_date:
        # Get contents of un-firefox-read-ed article
        self.wait(".reader-toolbar .close-button").click()
        time.sleep(0.3)
        doc_html = self.wait("html").get_attribute("outerHTML")
        doc = lxml.html.fromstring(doc_html, base_url=url)
        try:
            date = self.get_date(doc)
        except NotImplementedError:
            date = self.now
        except Exception as e:
            log.warning("get_date() failed for {} with: {}".format(url, e))
            date = self.now
    else:
        date = self.now

    article = Article(date=date, title=title, text=text, url=url)
    return article
def create_articles(batch):
    for a in batch:
        a['oldid_int'] = a.pop('old_id')
        if a['text'] == '':
            a['text'] = '-'
        if a['title'] == '':
            a['title'] = '-'
    articles = Article.create_articles([Article(project_id=self.status.project.id, **a) for a in batch])
    self.status.articles.update({a.get_property('oldid_int'): a.id for a in articles})
    return articles
def parse_file(self, file, encoding, _data):
    reader = csv.DictReader(_open(file, encoding))
    for unmapped_dict in reader:
        art_dict = self.map_article(unmapped_dict)
        properties = {}
        for k, v in art_dict.items():
            v = parse_value(k, v)
            properties[k] = v
        yield Article.fromdict(properties)
def parse_file(self, file):
    reader = csv.DictReader(TextIOWrapper(file.file, encoding="utf8"))
    for unmapped_dict in reader:
        art_dict = self.map_article(unmapped_dict)
        properties = {}
        for k, v in art_dict.items():
            v = self.parse_value(k, v)
            properties[k] = v
        yield Article.fromdict(properties)
def parse_file(self, file, _data):
    reader = self.get_reader(file)
    for unmapped_dict in reader:
        art_dict = self.map_article(unmapped_dict, dict(DEFAULTS))
        properties = {}
        for k, v in art_dict.items():
            v = parse_value(k, v)
            properties[k] = v
        yield Article.fromdict(properties)
def article_to_json(article: Article) -> Dict[str, Union[str, int, float, datetime.datetime]]:
    return {
        "title": article.title,
        "text": article.text,
        "hash": article.hash,
        "parent_hash": article.parent_hash,
        "url": article.url,
        "date": article.date,
        "properties": dict(article.get_properties())
    }
def save(self, **kwargs):
    # NB: Python 2 code (basestring, collections.Iterable)
    import collections

    def _flatten(l):
        """Turn either an object or a (recursive/irregular/jagged) list-of-lists into a flat list"""
        # inspired by http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
        if isinstance(l, collections.Iterable) and not isinstance(l, basestring):
            for el in l:
                for sub in _flatten(el):
                    yield sub
        else:
            yield l

    # flatten articles list (children in a many call yields a list of lists)
    self.object = list(_flatten(self.object))
    Article.create_articles(self.object, self.context['view'].articleset)

    # make sure that self.many is True for serializing result
    self.many = True
    return self.object
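# A self-contained Python 3 rendering of the same flattening idea (the code
# above is Python 2: note basestring and the pre-3.10 collections.Iterable):
from collections.abc import Iterable

def flatten(l):
    if isinstance(l, Iterable) and not isinstance(l, str):
        for el in l:
            yield from flatten(el)
    else:
        yield l

assert list(flatten([1, [2, [3, 4]], 5])) == [1, 2, 3, 4, 5]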
def run(self):
    upload = self.options['upload']
    upload.encoding_override(self.options['encoding'])
    monitor = self.progress_monitor
    root_dir = os.path.dirname(upload.filepath)
    monitor.update(10, u"Importing {self.__class__.__name__} from {upload.basename} into {self.project}"
                   .format(**locals()))

    articles = []
    files = self._get_files(upload)
    nfiles = len(upload)
    filemonitor = monitor.submonitor(nfiles, weight=60)
    for i, (file, data) in enumerate(files):
        filemonitor.update(1, "Parsing file {i}/{nfiles}: {file.name}".format(**locals()))
        articles += list(self.parse_file(file, data))

    for article in articles:
        _set_project(article, self.project)

    if self.errors:
        raise ParseError(" ".join(map(str, self.errors)))
    monitor.update(10, "All files parsed, saving {n} articles".format(n=len(articles)))

    Article.create_articles(articles, articleset=self.get_or_create_articleset(),
                            monitor=monitor.submonitor(40))

    if not articles:
        raise Exception("No articles were imported")

    monitor.update(10, "Uploaded {n} articles, post-processing".format(n=len(articles)))
    aset = self.options['articleset']
    new_provenance = self.get_provenance(upload.basename, articles)
    aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
    aset.save()

    if getattr(self, 'task', None):
        self.task.log_usage("articles", "upload", n=len(articles))

    monitor.update(10, "Done! Uploaded {n} articles".format(n=len(articles)))
    return self.options["articleset"]
def parse_file(self, file):
    for doc in split_file(file):
        data = dict(parse_doc(doc))
        art = {}
        for field, setting in self.options['field_map'].items():
            value, typ = setting['value'], setting['type']
            val = data.get(value) if typ == 'field' else value
            if val:
                art[field] = val
        yield Article(**art)
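# The loop above supports two kinds of field_map settings: type 'field' pulls
# the value out of the parsed document by key, while any other type uses the
# setting's value as a literal. A purely illustrative mapping (the key names
# here are made up):
example_field_map = {
    "title": {"type": "field", "value": "TI"},
    "medium": {"type": "literal", "value": "LexisNexis"},
}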
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "title": "\u6f22\u5b57",
        "text": "Even more strange characters.. \x0C and \x08 woo?",
        "url": "https://example.com",
        "project": create_test_project()
    })
    hash = get_article_dict(article)['hash']
    Article.create_articles([article], articleset=amcattest.create_test_set())
    ES().refresh()

    es_articles = ES().query_all(filters={"ids": [article.id]}, _source=["hash"])
    es_article = list(es_articles)[0]
    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(hash, article.hash)
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    # create dummy articles to have something in the db
    [amcattest.create_test_article() for i in range(10)]
    amcates.ES().refresh()

    art = dict(project=amcattest.create_test_project(), title="deduptest", text="test", date='2001-01-01')

    a1 = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(_q(title='deduptest'), {a1.id})

    # duplicate articles should not be added
    a2 = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(a2.id, a1.id)
    self.assertTrue(a2._duplicate)
    self.assertEqual(_q(title='deduptest'), {a1.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    a3 = amcattest.create_test_article(articleset=s1, **art)
    amcates.ES().refresh()
    self.assertEqual(a3.id, a1.id)
    self.assertEqual(_q(title='deduptest'), {a1.id})
    self.assertEqual(set(s1.get_article_ids()), {a1.id})
    self.assertEqual(_q(sets=s1.id), {a1.id})

    # if an existing hash is set, it should be correct
    art2 = dict(hash=b'hash', **art)
    self.assertRaises(ValueError, amcattest.create_test_article, **art2)

    # TODO: check duplicates within new articles
    art['title'] = "internaldupe"
    a1, a2 = (Article(**art), Article(**art))
    Article.create_articles([a1, a2], articleset=s1)
    self.assertEqual(a1.id, a2.id)
    self.assertEqual(len(_q(title='internaldupe')), 1)
def run(self, scraper):
    try:
        units = list(scraper._get_units())
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("scraper._get_units failed")
        return self.articles

    for i, unit in enumerate(units):
        try:
            articles = list(scraper._scrape_unit(unit))
        except Exception as e:
            log.exception("scraper._scrape_unit failed")
            self.errors.append(ScrapeError(i, unit, e))
            continue
        self.articles += articles

    for article in self.articles:
        _set_default(article, 'project', scraper.project)

    try:
        articles, errors = Article.create_articles(self.articles, scraper.articleset)
        self.saved_article_ids = {a.id for a in self.articles}
        for e in errors:
            self.errors.append(ScrapeError(None, None, e))
        stats_log.info(json.dumps({
            "action": "scraped_articles",
            "narticles": len(self.saved_article_ids),
            "scraper": scraper.__class__.__name__
        }))
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("Article.create_articles failed")

    # Do we need to save these ids to more sets?
    if hasattr(scraper, "articlesets") and hasattr(self, "saved_article_ids"):
        for aset in scraper.articlesets:
            stats_log.info(json.dumps({
                "action": "add_scraped_articles",
                "articleset_id": aset.id,
                "articleset__name": aset.name,
                "narticles": len(self.saved_article_ids),
                "project_id": aset.project_id,
                "project__name": aset.project.name
            }))
            aset.add_articles(self.saved_article_ids)

    return getattr(self, "saved_article_ids", ())
def create_article(self, art_dict, project):
    art_dict = {k: v for k, v in art_dict.items() if k in COPY_ARTICLE_FIELDS}
    art_dict["project"] = project
    # Articles from older AmCAT versions use 'headline' for what is now 'title'
    if 'headline' in art_dict and 'title' not in art_dict:
        art_dict['title'] = art_dict.pop('headline')
    art_dict = dict(self._map_es_type(k, v) for k, v in art_dict.items())
    art = Article(**art_dict)
    return art
def copy_article(article: Article):
    new = Article(
        project_id=article.project_id,
        date=article.date,
        title=article.title,
        url=article.url,
        # text=article.text <-- purposely omit text!
        # hash=article.hash <-- purposely omit hash!
        parent_hash=article.parent_hash)
    new.properties.update(article.properties)
    return new
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "title": "\u6f22\u5b57",
        "text": "Even more strange characters.. \x0C and \x08 woo?",
        "url": "https://example.com",
        "project": create_test_project()
    })
    hash = get_article_dict(article)['hash']
    Article.create_articles([article], articleset=amcattest.create_test_set())
    ES().refresh()

    es_articles = ES().query_all(filters={"ids": [article.id]}, fields=["hash"])
    es_article = list(es_articles)[0]
    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(hash, article.hash)
def _scrape_unit(self, document):
    article = Article()
    metadata = list(META)

    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")

    # Check for author field. If present: remove from metadata
    # fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        # NB: lstrip("Von") strips any of the characters V/o/n from the left,
        # not the literal prefix "Von"
        article.author = author_field.text_content().lstrip("Von").strip()
        divs.remove(author_field)

    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]

    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(metadata, divs):
        if field_name is None:
            continue
        processor = PROCESSORS.get(field_name, lambda x: x)
        text_content = element.text_content().strip()
        setattr(article, field_name, processor(text_content))

    # Fetch text, which is contained in the paragraph elements
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    article.text = ("\n\n".join(paragraphs)).strip()

    # We must return an iterable, so we return a one-tuple
    return (article,)
def set_up(self):
    self.aset = amcattest.create_test_set()
    self.asets = ArticleSet.objects.filter(id__in=[self.aset.id])
    self.project = self.aset.project
    self.a1 = Article(
        title="Man leeft nog steeds in de gloria",
        text="Gezongen vloek op verjaardag maakt leven van man tot een vrolijke hel.",
        date=datetime.datetime(2017, 1, 2, 23, 22, 11),
        author="Rudolf Julius",
        publisher="De Speld",
        project=self.project,
        exists="Once",
        page_int=5,
        section_int=10,
        tags_tag={"gloria", "vloek"},
        html="Man <i>leeft</i> nog steeds in de gloria"
    )
    self.a2 = Article(
        title="VVD trots op opkomende zon",
        text="Kabinetsbeleid om geen parasol over Nederland te zetten betaalt zich uit",
        date=datetime.datetime(2016, 12, 14, 15, 13, 12),
        author="Thomas Hogeling",
        publisher="De Speld",
        project=self.project,
        page_int=5,
        section_int=11,
        tags_tag={"vvd", "nederland", "speld"}
    )
    Article.create_articles([self.a1, self.a2], articleset=self.aset)
    amcates.ES().refresh()
    self.qs = ESQuerySet(self.asets)
def from_field_name(cls, field_name: str, **kwargs):
    """Construct a category object corresponding to the field_name's type.

    For example, the field 'date' would map to an IntervalCategory, while a
    string field such as 'author' would map to an ArticleFieldCategory.

    @param kwargs: additional parameters passed to the corresponding Category
    """
    is_json_field = field_name not in Article.static_fields()
    field_type = get_property_primitive_type(field_name)

    if field_type in (int, str, float):
        return ArticleFieldCategory(is_json_field=is_json_field, field_name=field_name, **kwargs)
    elif field_type == datetime.datetime:
        return IntervalCategory(is_json_field=is_json_field, field_name=field_name, **kwargs)
    else:
        raise ValueError("Did not recognize primitive field type: {} (on {})".format(field_type, field_name))
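# A minimal, self-contained sketch of the same type-driven dispatch with
# stand-in return values (ArticleFieldCategory and IntervalCategory are the
# real amcat classes; this only mirrors the branching logic above):
import datetime

def category_for(field_type):
    if field_type in (int, str, float):
        return "ArticleFieldCategory"
    elif field_type == datetime.datetime:
        return "IntervalCategory"
    raise ValueError("Did not recognize primitive field type: {}".format(field_type))

assert category_for(str) == "ArticleFieldCategory"
assert category_for(datetime.datetime) == "IntervalCategory"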