def test_get_articles(self):
    from amcat.models import Sentence

    _get_articles = lambda a, s: list(get_articles(a, s))

    # Should raise exception if sentences not in article
    article, sentences = self.create_test_sentences()
    s1 = Sentence.objects.filter(id=amcattest.create_test_sentence().id)
    self.assertRaises(ValueError, _get_articles, article, s1)

    # Should raise an exception if we try to split on headline
    self.assertRaises(ValueError, _get_articles, article, sentences.filter(parnr=1))

    # Should return a "copy", with byline in "text" property
    arts = _get_articles(article, Sentence.objects.none())
    Article.create_articles(arts)
    self.assertEquals(len(arts), 1)

    sbd.create_sentences(arts[0])
    self.assertEquals(
        [s.sentence for s in sentences[1:]],
        [s.sentence for s in arts[0].sentences.all()[1:]]
    )
    self.assertTrue("foo" in arts[0].text)

    # Should be able to split on byline
    self.assertEquals(2, len(_get_articles(article, sentences[1:2])))
    a, b = _get_articles(article, sentences[4:5])

    # Check that the text of the split articles contains what we expect
    self.assertTrue("Einde" not in a.text)
    self.assertTrue("Einde" in b.text)
def _run(self, local_project, remote_host, remote_token, remote_project_id, remote_articleset_id):
    try:
        page_size = 1000
        query = RemoteQuery(remote_host, remote_token, remote_project_id, remote_articleset_id,
                            page_size=page_size)
        set = {k: v for k, v in query.get_articleset().items() if k in COPY_SET_FIELDS}
        set.update(project=local_project)
        set = ArticleSet.objects.create(**set)
        for page in query:
            articles_hashes = [(self.create_article(x, local_project), x["hash"]) for x in page]
            hashmap = {old_hash: article.hash for article, old_hash in articles_hashes}
            articles, _ = zip(*articles_hashes)
            articles = list(articles)
            for article in articles:
                if article.parent_hash in hashmap:
                    article.parent_hash = hashmap[article.parent_hash]
            Article.create_articles(articles, articleset=set)
        return set.id
    except APIError as e:
        self.handleError(e)
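# The remapping loop in _run() above rewrites each article's parent_hash from
# the remote article's hash to the hash of the locally created copy, so that
# parent/child links survive the copy. A minimal standalone sketch of that
# idea on plain dicts (illustrative only; hash values here are made up):
hashmap = {"remote-hash-a": "local-hash-1"}
articles = [{"parent_hash": "remote-hash-a"}, {"parent_hash": "external"}]
for article in articles:
    if article["parent_hash"] in hashmap:
        article["parent_hash"] = hashmap[article["parent_hash"]]
assert articles[0]["parent_hash"] == "local-hash-1"  # remapped to local parent
assert articles[1]["parent_hash"] == "external"      # parent not in this copy: left as-is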
def test_list_media(self):
    """Test that list media works for more than 10 media"""
    from amcat.models import Article

    media = [amcattest.create_test_medium() for _ in range(20)]
    arts = [amcattest.create_test_article(medium=m, create=False) for m in media]

    s1 = amcattest.create_test_set()
    Article.create_articles(arts[:5], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s1.get_mediums()), set(media[:5]))

    s2 = amcattest.create_test_set(project=s1.project)
    Article.create_articles(arts[5:], articleset=s2, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s2.get_mediums()), set(media[5:]))
    self.assertEqual(set(s1.project.get_mediums()), set(media))
def test_aggregate(self):
    """Can we make tables per medium/date interval?"""
    from amcat.models import Article

    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()

    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="mediumid")),
                     {m1.id: 1, m2.id: 3})

    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="year")),
                     {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1})

    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="month")),
                     {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1, datetime(2001, 2, 1): 2})

    # set statistics
    stats = ES().statistics(filters=dict(sets=s1.id))
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime(2002, 1, 1))

    # media list
    self.assertEqual(set(ES().list_media(filters=dict(sets=s1.id))), {m1.id, m2.id})
def setup(self):
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    a = amcattest.create_test_article(text='aap noot mies', title='m1', date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', title='m2', date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', title='m2', date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', title='m2', date='2001-02-03', create=False)
    e = amcattest.create_test_article(text='aap noot mies', title='m3', articleset=s2)
    Article.create_articles([a, b, c, d], articleset=s1)
    ES().refresh()
    return s1, s2, a, b, c, d, e
def test_create(self):
    """Can we create/store/index an article object?"""
    a = amcattest.create_test_article(create=False, date='2010-12-31', headline=u'\ua000abcd\u07b4')
    Article.create_articles([a], create_id=True)
    db_a = Article.objects.get(pk=a.id)
    amcates.ES().flush()
    es_a = list(amcates.ES().query(filters={'ids': [a.id]}, fields=["date", "headline"]))[0]
    self.assertEqual(a.headline, db_a.headline)
    self.assertEqual(a.headline, es_a.headline)
    self.assertEqual('2010-12-31T00:00:00', db_a.date.isoformat())
    self.assertEqual('2010-12-31T00:00:00', es_a.date.isoformat())
def test_create_order(self):
    """Is insert order preserved in id order?"""
    articles = [amcattest.create_test_article(create=False) for _i in range(25)]
    random.shuffle(articles)
    Article.create_articles(articles)
    ids = [a.id for a in articles]

    # is order preserved?
    self.assertEqual(ids, sorted(ids))

    # do the right articles have the right title?
    for saved in articles:
        indb = Article.objects.get(pk=saved.id)
        self.assertEqual(indb.title, saved.title)
def create(self, validated_data):
    articleset = self.get_articleset()
    if 'id' in validated_data:
        _check_read_access(self.context['request'].user, [validated_data['id']])
        article = Article.objects.get(pk=validated_data['id'])
        articleset.add_articles([article])
    else:
        article = json_to_article(validated_data, articleset.project)
        Article.create_articles([article], articleset=articleset)
    return article
def test_create(self):
    """Can we create/store/index an article object?"""
    a = amcattest.create_test_article(create=False, date='2010-12-31', title=u'\ua000abcd\u07b4')
    Article.create_articles([a])
    db_a = Article.objects.get(pk=a.id)
    amcates.ES().refresh()
    es_a = list(amcates.ES().query(filters={'ids': [a.id]}, _source=["date", "title", "hash"]))[0]
    self.assertEqual(a.hash, db_a.hash)
    self.assertEqual(a.hash, es_a.hash)
    self.assertEqual(a.title, db_a.title)
    self.assertEqual(a.title, es_a.title)
    self.assertEqual('2010-12-31T00:00:00', db_a.date.isoformat())
    self.assertEqual('2010-12-31T00:00:00', es_a.date.isoformat())
def test_query_all(self):
    """Test that query_all works"""
    from amcat.models import Article

    arts = [amcattest.create_test_article(create=False) for _ in range(20)]
    s = amcattest.create_test_set()
    Article.create_articles(arts, articleset=s, check_duplicate=False, create_id=True)
    ES().flush()

    r = ES().query(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), 10)

    r = ES().query_all(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), len(arts))
def test_query_all(self):
    """Test that query_all works"""
    from amcat.models import Article

    arts = [amcattest.create_test_article(create=False) for _ in range(20)]
    s = amcattest.create_test_set()
    Article.create_articles(arts, articleset=s, check_duplicate=False)
    ES().flush()

    r = ES().query(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), 10)

    r = ES().query_all(filters=dict(sets=s.id), size=10)
    self.assertEqual(len(list(r)), len(arts))
def setup(self):
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)
    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def setup(self):
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)
    Article.create_articles([a, b, c, d], articleset=s1)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def test_add_many(self):
    """Can we add a large number of articles from one set to another?"""
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    p = amcattest.create_test_project()

    arts = [amcattest.create_test_article(project=p, create=False) for _x in range(1213)]
    Article.create_articles(arts, s)
    ES().refresh()
    self.assertEqual(len(arts), s.get_count())

    s2.add_articles(arts)
    ES().refresh()
    self.assertEqual(len(arts), s2.get_count())
    print(s2.get_count())
def run(self):
    monitor = self.progress_monitor
    filename = self.options['filename']
    file_shortname = os.path.split(self.options['filename'])[-1]
    monitor.update(10, u"Importing {self.__class__.__name__} from {file_shortname} into {self.project}"
                   .format(**locals()))

    articles = []
    encoding = self.options['encoding']
    files = list(self._get_files(filename, encoding))
    nfiles = len(files)
    for i, (file, encoding, data) in enumerate(files):
        monitor.update(20 / nfiles, "Parsing file {i}/{nfiles}: {file}".format(**locals()))
        articles += list(self.parse_file(file, encoding, data))

    for article in articles:
        _set_project(article, self.project)

    if self.errors:
        raise ParseError(" ".join(map(str, self.errors)))
    monitor.update(10, "All files parsed, saving {n} articles".format(n=len(articles)))
    Article.create_articles(articles, articleset=self.get_or_create_articleset(),
                            monitor=monitor.submonitor(40))

    if not articles:
        raise Exception("No articles were imported")

    monitor.update(10, "Uploaded {n} articles, post-processing".format(n=len(articles)))
    aset = self.options["articleset"]
    new_provenance = self.get_provenance(file, articles)
    aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
    aset.save()

    if getattr(self, 'task', None):
        self.task.log_usage("articles", "upload", n=len(articles))

    monitor.update(10, "Done! Uploaded {n} articles".format(n=len(articles)))
    return self.options["articleset"]
def test_add_many(self):
    """Can we add a large number of articles from one set to another?"""
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    p = amcattest.create_test_project()

    arts = [amcattest.create_test_article(project=p, medium=m, create=False) for _x in range(1213)]
    Article.create_articles(arts, s, create_id=True)
    ES().flush()
    self.assertEqual(len(arts), s.get_count())

    s2.add_articles(arts, monitor=ProgressMonitor())
    ES().flush()
    self.assertEqual(len(arts), s2.get_count())
    print(s2.get_count())
def handle_split(form, project, article, sentences):
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add split articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add split articles to the sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def run(self, scraper):
    try:
        units = list(scraper._get_units())
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("scraper._get_units failed")
        return self.articles

    for i, unit in enumerate(units):
        try:
            articles = list(scraper._scrape_unit(unit))
        except Exception as e:
            log.exception("scraper._scrape_unit failed")
            self.errors.append(ScrapeError(i, unit, e))
            continue
        self.articles += articles

    for article in self.articles:
        _set_default(article, 'project', scraper.project)

    try:
        articles, errors = Article.create_articles(self.articles, scraper.articleset)
        self.saved_article_ids = {getattr(a, "duplicate_of", a.id) for a in self.articles}
        for e in errors:
            self.errors.append(ScrapeError(None, None, e))
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("Article.create_articles failed")

    return self.saved_article_ids
def create_articles(batch):
    for a in batch:
        a['oldid_int'] = a.pop('old_id')
        if a['text'] == '':
            a['text'] = '-'
        if a['title'] == '':
            a['title'] = '-'
    articles = Article.create_articles([Article(project_id=self.status.project.id, **a) for a in batch])
    # remember the mapping from source (old) article ids to the newly assigned ids
    self.status.articles.update({a.get_property('oldid_int'): a.id for a in articles})
    return articles
def run(self):
    upload = self.options['upload']
    upload.encoding_override(self.options['encoding'])
    monitor = self.progress_monitor
    root_dir = os.path.dirname(upload.filepath)
    monitor.update(10, u"Importing {self.__class__.__name__} from {upload.basename} into {self.project}"
                   .format(**locals()))

    articles = []
    files = self._get_files(upload)
    nfiles = len(upload)
    filemonitor = monitor.submonitor(nfiles, weight=60)
    for i, (file, data) in enumerate(files):
        filemonitor.update(1, "Parsing file {i}/{nfiles}: {file.name}".format(**locals()))
        articles += list(self.parse_file(file, data))

    for article in articles:
        _set_project(article, self.project)

    if self.errors:
        raise ParseError(" ".join(map(str, self.errors)))
    monitor.update(10, "All files parsed, saving {n} articles".format(n=len(articles)))
    Article.create_articles(articles, articleset=self.get_or_create_articleset(),
                            monitor=monitor.submonitor(40))

    if not articles:
        raise Exception("No articles were imported")

    monitor.update(10, "Uploaded {n} articles, post-processing".format(n=len(articles)))
    aset = self.options['articleset']
    new_provenance = self.get_provenance(upload.basename, articles)
    aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
    aset.save()

    if getattr(self, 'task', None):
        self.task.log_usage("articles", "upload", n=len(articles))

    monitor.update(10, "Done! Uploaded {n} articles".format(n=len(articles)))
    return self.options["articleset"]
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "title": "\u6f22\u5b57",
        "text": "Even more strange characters.. \x0C and \x08 woo?",
        "url": "https://example.com",
        "project": create_test_project()
    })

    hash = get_article_dict(article)['hash']
    Article.create_articles([article], articleset=amcattest.create_test_set())
    ES().refresh()

    es_articles = list(ES().query_all(filters={"ids": [article.id]}, _source=["hash"]))
    es_article = es_articles[0]
    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(hash, article.hash)
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    # create dummy articles to have something in the db
    [amcattest.create_test_article() for i in range(10)]
    amcates.ES().refresh()

    art = dict(project=amcattest.create_test_project(),
               title="deduptest", text="test", date='2001-01-01')

    a1 = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(_q(title='deduptest'), {a1.id})

    # duplicate articles should not be added
    a2 = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(a2.id, a1.id)
    self.assertTrue(a2._duplicate)
    self.assertEqual(_q(title='deduptest'), {a1.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    a3 = amcattest.create_test_article(articleset=s1, **art)
    amcates.ES().refresh()
    self.assertEqual(a3.id, a1.id)
    self.assertEqual(_q(title='deduptest'), {a1.id})
    self.assertEqual(set(s1.get_article_ids()), {a1.id})
    self.assertEqual(_q(sets=s1.id), {a1.id})

    # if an existing hash is set, it should be correct
    art2 = dict(hash=b'hash', **art)
    self.assertRaises(ValueError, amcattest.create_test_article, **art2)

    # TODO! Check duplicates within new articles
    art['title'] = "internaldupe"
    a1, a2 = (Article(**art), Article(**art))
    Article.create_articles([a1, a2], articleset=s1)
    self.assertEqual(a1.id, a2.id)
    self.assertEqual(len(_q(title='internaldupe')), 1)
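# test_deduplication() above relies on every article getting a content hash
# over its identifying fields, so that identical articles collide and the
# second one is flagged as a duplicate. A minimal sketch of that idea,
# assuming a sha1 over a canonical JSON dump of selected fields; the real
# get_article_dict() may pick different fields and a different encoding:
import hashlib
import json

def _sketch_article_hash(fields):
    """Hypothetical stand-in: stable hash from a dict of identifying fields."""
    canonical = json.dumps(fields, sort_keys=True)
    return hashlib.sha1(canonical.encode("utf-8")).hexdigest()

h1 = _sketch_article_hash({"title": "deduptest", "text": "test", "date": "2001-01-01"})
h2 = _sketch_article_hash({"title": "deduptest", "text": "test", "date": "2001-01-01"})
assert h1 == h2  # identical fields hash identically, so the second article is a duplicate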
def run(self, scraper):
    try:
        units = list(scraper._get_units())
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("scraper._get_units failed")
        return self.articles

    for i, unit in enumerate(units):
        try:
            articles = list(scraper._scrape_unit(unit))
        except Exception as e:
            log.exception("scraper._scrape_unit failed")
            self.errors.append(ScrapeError(i, unit, e))
            continue
        self.articles += articles

    for article in self.articles:
        _set_default(article, 'project', scraper.project)

    try:
        articles, errors = Article.create_articles(self.articles, scraper.articleset)
        self.saved_article_ids = {a.id for a in self.articles}
        for e in errors:
            self.errors.append(ScrapeError(None, None, e))
        stats_log.info(json.dumps({
            "action": "scraped_articles",
            "narticles": len(self.saved_article_ids),
            "scraper": scraper.__class__.__name__
        }))
    except Exception as e:
        self.errors.append(ScrapeError(None, None, e))
        log.exception("Article.create_articles failed")

    # Do we need to save these id's to more sets?
    if hasattr(scraper, "articlesets") and hasattr(self, "saved_article_ids"):
        for aset in scraper.articlesets:
            stats_log.info(json.dumps({
                "action": "add_scraped_articles",
                "articleset_id": aset.id,
                "articleset__name": aset.name,
                "narticles": len(self.saved_article_ids),
                "project_id": aset.project_id,
                "project__name": aset.project.name
            }))
            aset.add_articles(self.saved_article_ids)

    return getattr(self, "saved_article_ids", ())
def save(self, **kwargs):
    import collections

    def _flatten(l):
        """Turn either an object or a (recursive/irregular/jagged)
        list-of-lists into a flat list"""
        # inspired by http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
        if isinstance(l, collections.Iterable) and not isinstance(l, basestring):
            for el in l:
                for sub in _flatten(el):
                    yield sub
        else:
            yield l

    # flatten articles list (children in a many call yields a list of lists)
    self.object = list(_flatten(self.object))
    Article.create_articles(self.object, self.context['view'].articleset)

    # make sure that self.many is True for serializing result
    self.many = True
    return self.object
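# For reference, the nested _flatten() above yields the leaves of an
# arbitrarily nested list-of-lists. A standalone Python 3 rendering with a
# usage check (the original targets Python 2: basestring, collections.Iterable):
from collections import abc

def _flatten_demo(l):
    if isinstance(l, abc.Iterable) and not isinstance(l, str):
        for el in l:
            for sub in _flatten_demo(el):
                yield sub
    else:
        yield l

assert list(_flatten_demo([1, [2, [3, 4]], 5])) == [1, 2, 3, 4, 5]
assert list(_flatten_demo("abc")) == ["abc"]  # strings are treated as leaves, not iterables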
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "title": "\u6f22\u5b57",
        "text": "Even more strange characters.. \x0C and \x08 woo?",
        "url": "https://example.com",
        "project": create_test_project()
    })

    hash = get_article_dict(article)['hash']
    Article.create_articles([article], articleset=amcattest.create_test_set())
    ES().refresh()

    es_articles = list(ES().query_all(filters={"ids": [article.id]}, fields=["hash"]))
    es_article = es_articles[0]
    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(hash, article.hash)
def set_up(self):
    self.aset = amcattest.create_test_set()
    self.asets = ArticleSet.objects.filter(id__in=[self.aset.id])
    self.project = self.aset.project

    self.a1 = Article(
        title="Man leeft nog steeds in de gloria",
        text="Gezongen vloek op verjaardag maakt leven van man tot een vrolijke hel.",
        date=datetime.datetime(2017, 1, 2, 23, 22, 11),
        author="Rudolf Julius",
        publisher="De Speld",
        project=self.project,
        exists="Once",
        page_int=5,
        section_int=10,
        tags_tag={"gloria", "vloek"},
        html="Man <i>leeft</i> nog steeds in de gloria"
    )

    self.a2 = Article(
        title="VVD trots op opkomende zon",
        text="Kabinetsbeleid om geen parasol over Nederland te zetten betaalt zich uit",
        date=datetime.datetime(2016, 12, 14, 15, 13, 12),
        author="Thomas Hogeling",
        publisher="De Speld",
        project=self.project,
        page_int=5,
        section_int=11,
        tags_tag={"vvd", "nederland", "speld"}
    )

    Article.create_articles([self.a1, self.a2], articleset=self.aset)
    amcates.ES().refresh()
    self.qs = ESQuerySet(self.asets)
def create(self, validated_data):
    # Get articleset object given through URL
    articleset_id = self.context["view"].kwargs.get('articleset')
    if articleset_id is not None:
        articleset = ArticleSet.objects.get(pk=articleset_id)
        project = articleset.project
    else:
        raise ValueError("Missing articleset parameter?")

    # Create articles not yet in database
    new_articles = [a for a in validated_data if "id" not in a]
    if new_articles:
        new_articles = [json_to_article(article, project) for article in new_articles]
        yield from Article.create_articles(new_articles, articleset=articleset)

    # Add existing articles to this set
    to_add = [a['id'] for a in validated_data if "id" in a]
    if to_add:
        _check_read_access(self.context['request'].user, to_add)
        articleset.add_articles(to_add)
        yield from Article.objects.filter(pk__in=to_add).only("pk")
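# Note that create() above is a generator: nothing is validated or written
# until the caller iterates it. A hypothetical caller therefore has to
# materialize the result, e.g. (sketch only; the surrounding DRF view
# plumbing is assumed):
#
#     created = list(serializer.create(validated_data))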
def test_highlight_fragments(self):
    self.set_up()

    articleset = amcattest.create_test_set()
    project = articleset.project

    text = """
    The Alderman Proctor's Drinking Fountain (grid reference ST566738) is a
    historic building on Clifton Down, Bristol, England.

    The city of Bristol began supplying municipal drinking water in 1858. To
    inform the public about the new water supply, Robert Lang made a proposal
    though the Bristol Times that public drinking fountains be constructed.
    Lang began the "Fountain Fund" in January 1859 with a donation of one
    hundred pounds. By 1906, there were more than 40 public drinking fountains
    throughout the city. In 1872, Alderman Thomas Proctor commissioned the
    firm of George and Henry Godwin to build the fountain to commemorate the
    1861 presentation of <i>Clifton Down</i> to the City of Bristol by the
    Society of Merchant Venturers.

    **Commemorative plaque**

    The three-sided fountain is done in Gothic Revival style. The main portion
    is of limestone with pink marble columns and white marble surround. The
    commemorative plaque is of black lettering on white marble; the plaque
    reads, "Erected by Alderman Thomas Proctor, of Bristol to record the
    liberal gift of certain rights on Clifton Down made to the citizens by the
    Society of Merchant Venturers under the provision of the Clifton and
    Drudham Downs Acts of Parliament, 1861, whereby the enjoyment of these
    Downs is preserved to the citizens of Bristol for ever." The fountain
    bears the coat of arms for the city of Bristol, the Society of Merchant
    Venturers and that of Alderman Thomas Proctor.

    The fountain was originally situated at the head of Bridge Valley Road.
    It became a sight impediment to modern auto traffic in the later 20th
    century. The fountain was moved to the other side of the road, closer to
    the Mansion House in 1987. After the move, it underwent restoration and
    was re-dedicated on 1 May 1988. It has been designated by English Heritage
    as a grade II listed building since 1977.
    """

    paragraphs = [" ".join(s.strip() for s in p.strip().split("\n")) for p in text.split("\n\n")]

    long_article = Article(
        title="Alderman Proctor's Drinking Fountain",
        text="\n\n".join(paragraphs).strip(),
        date=datetime.datetime(2017, 1, 18, 13, 29, 11),
        url="https://en.wikipedia.org/wiki/Alderman_Proctor%27s_Drinking_Fountain",
        publisher="Wikipedia",
        project=project
    )

    Article.create_articles([long_article], articleset)
    amcates.ES().refresh()

    qs = ESQuerySet(ArticleSet.objects.filter(id=articleset.id))
    fragments = qs.highlight_fragments('"Clifton Down"', ("text", "title"), fragment_size=50)
    self.assertEqual(1, len(qs))
    self.assertEqual(1, len(fragments))

    fragments = next(iter(fragments.values()))
    text_fragments = set(fragments["text"])
    title_fragments = fragments["title"]

    self.assertEqual(1, len(title_fragments))
    self.assertNotIn("<mark>", title_fragments[0])

    self.assertEqual(3, len(text_fragments))
    self.assertEqual(text_fragments, {
        ' presentation of <i><mark>Clifton</mark> <mark>Down</mark></i> to the City of Bristol',
        ' <mark>Clifton</mark> <mark>Down</mark>, Bristol, England.\n\nThe city of Bristol',
        ' the liberal gift of certain rights on <mark>Clifton</mark> <mark>Down</mark> made'
    })
def test_aggregate(self):
    """Can we make tables per medium/date interval?"""
    from amcat.models import Article

    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False)
    ES().flush()

    # counts per mediumid
    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="mediumid")),
                     {m1.id: 1, m2.id: 3})

    # counts per medium (name)
    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="medium")),
                     {m1.name: 1, m2.name: 3})

    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="year")),
                     {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1})

    self.assertEqual(dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="month")),
                     {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1, datetime(2001, 2, 1): 2})

    # set statistics
    stats = ES().statistics(filters=dict(sets=s1.id))
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime(2002, 1, 1))

    # media list
    self.assertEqual(set(ES().list_media(filters=dict(sets=s1.id))), {m1.id, m2.id})