def test_properties(self):
    """Are properties stored as flat fields and with correct mapping?"""
    props = dict(proptest="123 test, and another",
                 proptest2_url="http://example.org",
                 proptest3_date="2001-01-01",
                 proptest4_num=-1,
                 proptest5_tag={"123 test", "and another"},
                 proptest6_id="123 test, and another")
    # none of the test properties may already exist in the index mapping
    self.assertEqual(set(props.keys()) & set(ES().get_mapping().keys()), set())
    a = amcattest.create_test_article(properties=props)
    mapping = ES().get_mapping()
    # each property suffix (_url, _date, _num, _tag) selects the matching ES type
    for field, ftype in dict(proptest="default", proptest2_url="url",
                             proptest3_date="date", proptest4_num="num",
                             proptest5_tag="tag").items():
        self.assertEqual(mapping[field], settings.ES_MAPPING_TYPES[ftype])
    src = ES().get(a.id)
    # the mapping should now contain exactly the standard fields plus our props
    self.assertEqual(set(mapping.keys()), set(props.keys()) | ALL_FIELDS)

    # test if term vectors are correct, i.e. test analysis
    def tokens(field):
        tokens = list(ES().get_tokens(a.id, fields=[field]))
        return [w for (f, p, w) in sorted(tokens)]
    self.assertEqual(tokens("proptest"), ["123", "test", "and", "another"])
    self.assertEqual(set(tokens("proptest5_tag")), {"123 test", "and another"})
    self.assertEqual(tokens("proptest6_id"), ["123 test, and another"])
    self.assertEqual(tokens("proptest2_url"), ["http://example.org"])
def _run(self, job):
    """(Re)index the codings of all coded articles in *job* into elastic.

    Each coded article is serialized to {"job": id, "article_coding": {...},
    "sentence_codings": [...]} and replaces any previous codings for the
    same job on the article's elastic document.
    """
    es = ES()
    for ca in job.coded_articles.all():
        coding_json = {"job": job.id, "sentence_codings": []}
        for coding in ca.codings.all():
            # FIX: removed leftover debug print((coding, coding.sentence))
            values = {}
            for cv in coding.values.all():
                fieldtype = cv.field.fieldtype.name
                if fieldtype == "Codebook":
                    # store both the code id and its human-readable label
                    values[cv.field.label] = cv.intval
                    values[cv.field.label + "_label"] = cv.value.label
                elif fieldtype == "Text":
                    values[cv.field.label + "_label"] = cv.value
                elif fieldtype == "Quality":
                    values[cv.field.label] = cv.value / 10
                elif fieldtype == "Yes/No":
                    values[cv.field.label + "_bool"] = cv.value
                else:
                    values[cv.field.label] = cv.value
            if coding.sentence_id is None:
                coding_json["article_coding"] = values
            else:
                values["sentence_id"] = coding.sentence_id
                # FIX: append the serialized values dict, not the Coding model
                # object, which is not JSON serializable
                coding_json["sentence_codings"].append(values)
        src = es.get(ca.article_id)
        # drop stale codings from a previous run of this job before re-adding
        src["codings"] = [c for c in src.get('codings', []) if not c['job'] == job.id]
        src['codings'].append(coding_json)
        es.es.index(index=es.index, doc_type=es.doc_type, id=ca.article_id, body=src)
def _run(self, job):
    """(Re)index the codings of all coded articles in *job* into elastic.

    Each coded article is serialized to {"job": id, "article_coding": {...},
    "sentence_codings": [...]} and replaces any previous codings for the
    same job on the article's elastic document.
    """
    es = ES()
    for ca in job.coded_articles.all():
        coding_json = {"job": job.id, "sentence_codings": []}
        for coding in ca.codings.all():
            # FIX: removed Python-2-only debug statement `print coding, ...`,
            # which is a syntax error under Python 3
            values = {}
            for cv in coding.values.all():
                fieldtype = cv.field.fieldtype.name
                if fieldtype == "Codebook":
                    # store both the code id and its human-readable label
                    values[cv.field.label] = cv.intval
                    values[cv.field.label + "_label"] = cv.value.label
                elif fieldtype == "Text":
                    values[cv.field.label + "_label"] = cv.value
                elif fieldtype == "Quality":
                    values[cv.field.label] = cv.value / 10
                elif fieldtype == "Yes/No":
                    values[cv.field.label + "_bool"] = cv.value
                else:
                    values[cv.field.label] = cv.value
            if coding.sentence_id is None:
                coding_json["article_coding"] = values
            else:
                values["sentence_id"] = coding.sentence_id
                # FIX: append the serialized values dict, not the Coding model
                # object, which is not JSON serializable
                coding_json["sentence_codings"].append(values)
        src = es.get(ca.article_id)
        # drop stale codings from a previous run of this job before re-adding
        src["codings"] = [c for c in src.get('codings', []) if not c['job'] == job.id]
        src['codings'].append(coding_json)
        es.es.index(index=es.index, doc_type=es.doc_type, id=ca.article_id, body=src)
def getArticles(form, **kargs):
    """Return the list of ES query results for the given (validated) form.

    When 'hits' is among the requested columns, each result gets a `hits`
    attribute mapping query label -> score.
    """
    fields = ['mediumid', 'date', 'headline', 'medium']
    sort = form.get('sortColumn', None)

    if 'keywordInContext' in form['columns']:
        raise NotImplementedError()

    query = query_from_form(form)
    # highlight query terms when there is a query, otherwise show a lead
    kargs["highlight" if query else "lead"] = True

    filters = dict(filters_from_form(form))
    log.info("Query: {query!r}, with filters: {filters}".format(**locals()))
    score = 'hits' in form['columns']
    result = list(ES().query(query, filters=filters, fields=fields, sort=sort, score=score, **kargs))

    if 'hits' in form['columns']:
        # add hits columns
        def add_hits_column(r):
            r.hits = {q.label : 0 for q in form['queries']}
            return r

        result_dict = {r.id : add_hits_column(r) for r in result}
        f = dict(ids=list(result_dict.keys()))
        # one extra query per sub-query to fill in the per-article scores
        for q in queries_from_form(form):
            for hit in ES().query(q.query, filters=f, fields=[]):
                result_dict[hit.id].hits[q.label] = hit.score

    return result
def test_filters(self):
    """
    Do filters work properly?
    """
    m1, m2 = [amcattest.create_test_medium() for _ in range(2)]
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date="2001-01-01")
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date="2002-01-01")
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date="2003-01-01")
    s1 = amcattest.create_test_set(articles=[a, b, c])
    s2 = amcattest.create_test_set(articles=[a, b])
    ES().flush()

    # helper: run query_ids with the given filters and collect the id set
    q = lambda **filters: set(ES().query_ids(filters=filters))

    # MEDIUM FILTER
    self.assertEqual(q(mediumid=m2.id), {b.id, c.id})

    #### DATE FILTERS
    self.assertEqual(q(sets=s1.id, start_date='2001-06-01'), {b.id, c.id})
    # start is inclusive
    self.assertEqual(q(sets=s1.id, start_date='2002-01-01', end_date="2002-06-01"), {b.id})
    # end is exclusive
    self.assertEqual(q(sets=s1.id, start_date='2001-01-01', end_date="2003-01-01"), {a.id, b.id})

    # COMBINATION
    self.assertEqual(q(sets=s2.id, start_date='2001-06-01'), {b.id})
    self.assertEqual(q(end_date='2002-06-01', mediumid=m2.id), {b.id})
def test_aggregate(self):
    """Can we make tables per date interval?"""
    s1, s2, a, b, c, d, e = self.setup()

    # yearly buckets: three articles dated 2001, one dated 2002
    self.assertEqual(
        dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="year")),
        {datetime.datetime(2001, 1, 1): 3, datetime.datetime(2002, 1, 1): 1})

    # monthly buckets: two of the 2001 articles fall in February
    self.assertEqual(
        dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="month")),
        {datetime.datetime(2001, 1, 1): 1, datetime.datetime(2002, 1, 1): 1,
         datetime.datetime(2001, 2, 1): 2})

    # set statistics
    stats = ES().statistics(filters=dict(sets=s1.id))
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime.datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime.datetime(2002, 1, 1))
def test_properties(self):
    """Are properties stored as flat fields and with correct mapping?"""
    props = dict(
        proptest="123 test, and another",
        proptest2_url="http://example.org",
        proptest3_date="2001-01-01",
        proptest4_num=-1,
        proptest5_tag={"123 test", "and another"},
        proptest6_id="123 test, and another")
    # none of the test properties may already exist in the index mapping
    self.assertEqual(set(props.keys()) & set(ES().get_mapping().keys()), set())
    a = amcattest.create_test_article(properties=props)
    mapping = ES().get_mapping()
    # each property suffix (_url, _date, _num, _tag) selects the matching ES type
    for field, ftype in dict(proptest="default", proptest2_url="url",
                             proptest3_date="date", proptest4_num="num",
                             proptest5_tag="tag").items():
        self.assertEqual(mapping[field], settings.ES_MAPPING_TYPES[ftype])
    src = ES().get(a.id)
    # the mapping should now contain exactly the standard fields plus our props
    self.assertEqual(set(mapping.keys()), set(props.keys()) | ALL_FIELDS)

    # test if term vectors are correct, i.e. test analysis
    def tokens(field):
        tokens = list(ES().get_tokens(a.id, fields=[field]))
        return [w for (f, p, w) in sorted(tokens)]
    self.assertEqual(tokens("proptest"), ["123", "test", "and", "another"])
    self.assertEqual(set(tokens("proptest5_tag")), {"123 test", "and another"})
    self.assertEqual(tokens("proptest6_id"), ["123 test, and another"])
    self.assertEqual(tokens("proptest2_url"), ["http://example.org"])
def test_complex_phrase_query(self):
    """Test complex phrase queries. DOES NOT WORK YET"""
    texts = ('aap noot mies', 'noot mies wim zus', 'mies bla bla bla wim zus jet')
    a, b, c = [amcattest.create_test_article(text=t) for t in texts]
    s1 = amcattest.create_test_set(articles=[a, b, c])
    ES().add_articles([art.id for art in (a, b, c)])

    # wildcard-in-phrase with slop should match the two articles
    # containing both a mi* and a wi* term within distance 5
    hits = ES().query_ids('"mi* wi*"~5', filters=dict(sets=s1.id))
    self.assertEqual(set(hits), {b.id, c.id})
def _refresh_property_cache(self) -> Set[str]:
    """Discard property cache and recalculate properties"""
    from amcat.tools.amcates import ES
    elastic = ES()
    # refresh the index so newly-added articles are visible
    elastic.refresh()
    used = elastic.get_used_properties([self.id])
    self._reset_property_cache()
    return self._add_to_property_cache(used)
def refresh_index(self, full_refresh=False):
    """
    Make sure that the index for this set is up to date

    @param full_refresh: re-index every article instead of only changes
    """
    from amcat.tools.amcates import ES
    elastic = ES()
    elastic.check_index()
    elastic.synchronize_articleset(self, full_refresh=full_refresh)
    self.save()
def __init__(self, form):
    """
    Form *must* be valid before passing.

    @type form: SelectionForm
    """
    self.form = form
    self.data = SelectionData(form.cleaned_data)
    self.es = ES()
def test_query(self):
    """Do query and query_ids work properly?"""
    article = amcattest.create_test_article(headline="bla", text="artikel artikel een",
                                            date="2001-01-01")
    ES().flush()

    # "een" occurs only in the test article; unpack asserts exactly one hit
    hit, = ES().query("een", fields=["date", "headline"])
    self.assertEqual(hit.headline, "bla")
    self.assertEqual(hit.id, article.id)

    found = set(ES().query_ids(filters=dict(mediumid=article.medium_id)))
    self.assertEqual(found, {article.id})
def inner(*args, **kargs):
    """Run the wrapped test against a freshly-reset elastic index."""
    from amcat.tools.amcates import ES
    elastic = ES()
    if not elastic.es.ping():
        # no live elasticsearch available: skip instead of fail
        raise unittest.SkipTest("ES not enabled")
    elastic.delete_index()
    ES().check_index()
    return func(*args, **kargs)
def inner(*args, **kargs):
    """Run the wrapped test against a freshly-reset unittest elastic index."""
    from amcat.tools.amcates import ES
    # never touch the production index: force a __unittest suffix
    if not settings.ES_INDEX.endswith("__unittest"):
        settings.ES_INDEX += "__unittest"
    elastic = ES()
    if not elastic.es.ping():
        # no live elasticsearch available: skip instead of fail
        raise unittest.SkipTest("ES not enabled")
    elastic.delete_index()
    ES().check_index()
    return func(*args, **kargs)
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    s = amcattest.create_test_set(articles=[
        amcattest.create_test_article(headline="a", text='dit is een test'),
    ])
    s.refresh_index()

    def q(query):
        result = ES().query(query, filters={'sets': s.id}, fields=["headline"])
        return {a.headline: a.score for a in result}

    # a single term match scores 1
    self.assertEqual(q("test"), {"a": 1})

    m1, m2 = [amcattest.create_test_medium() for _ in range(2)]
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2)
    d = amcattest.create_test_article(text='ik woon in een sociale huurwoning, net als anderen',
                                      medium=m2)
    ES().flush()

    # wildcard, filter, boolean, phrase and slop queries
    self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})
    self.assertEqual(set(ES().query_ids("no*", filters=dict(mediumid=m2.id))), {b.id})
    self.assertEqual(set(ES().query_ids("zus AND jet", filters=dict(mediumid=m2.id))), {c.id})
    self.assertEqual(set(ES().query_ids("zus OR jet", filters=dict(mediumid=m2.id))), {b.id, c.id})
    self.assertEqual(set(ES().query_ids('"mies wim"', filters=dict(mediumid=m2.id))), {b.id})
    self.assertEqual(set(ES().query_ids('"mies wim"~5', filters=dict(mediumid=m2.id))), {b.id, c.id})
    # FIX: this assertion was duplicated verbatim; one copy removed
    self.assertEqual(set(ES().query_ids('"sociale huur*"', filters=dict(mediumid=m2.id))), {d.id})
def test_byline(self):
    """Is the byline field indexed separately from the text?"""
    aset = amcattest.create_test_set()
    amcattest.create_test_article(byline="bob", text="eve", articleset=aset)
    ES().flush()

    def q(query):
        return set(ES().query_ids(query, filters={"sets": aset.id}))

    # byline-qualified search matches only the byline content
    self.assertEqual(1, len(q("byline:bob")))
    self.assertEqual(0, len(q("byline:eve")))
    # unqualified search still finds the byline term
    self.assertEqual(1, len(q("bob")))
def test_query(self):
    """Do query and query_ids work properly?"""
    article = amcattest.create_test_article(title="bla", text="artikel artikel een",
                                            date="2001-01-01")
    ES().refresh()

    # "een" occurs only in the test article; unpack asserts exactly one hit
    hit, = ES().query("een", fields=["date", "title"])
    self.assertEqual(hit.title, "bla")
    self.assertEqual(hit.id, article.id)

    found = set(ES().query_ids(filters=dict(title='bla')))
    self.assertEqual(found, {article.id})
def set_default_similarity(*args, **kwargs):
    """Push the configured ES settings (similarity) to the index."""
    elastic = ES()
    # make sure the index exists and is refreshed before changing settings
    elastic.check_index()
    elastic.refresh()

    # index settings can only be changed while the index is closed
    index_api = elastic.es.indices
    index_api.close(elastic.index)
    index_api.put_settings(settings.ES_SETTINGS, elastic.index)
    index_api.open(elastic.index)
def test_articlesets(self):
    """Is set membership stored per article and filterable?"""
    a, b, c = [amcattest.create_test_article() for _x in range(3)]
    s1 = amcattest.create_test_set(articles=[a, b, c])
    s2 = amcattest.create_test_set(articles=[b, c])
    s3 = amcattest.create_test_set(articles=[b])
    ES().refresh()

    # article c belongs to s1 and s2, but not to s3
    self.assertEqual(set(ES().get(c.id)['sets']), {s1.id, s2.id})

    # filtering on s1 returns all three articles
    self.assertEqual(set(ES().query_ids(filters=dict(sets=s1.id))), {a.id, b.id, c.id})
def create(self, validated_data):
    """Get-or-create an article by uuid and add it to the view's articleset.

    If an article with the given uuid exists it is reused; otherwise a new
    article is created and indexed into elastic.
    """
    try:
        article = Article.objects.get(uuid=validated_data["uuid"])
    except (Article.DoesNotExist, KeyError) as e:
        # no (known) uuid: create a fresh article and index it immediately
        article = super(ArticleSerializer, self).create(validated_data)
        elastic = ES()
        elastic.add_articles([article.id])
        elastic.flush()  # make the new article visible to queries right away
    self.context["view"].articleset.add_articles([article])
    return article
def _add_column(table, column_name, query, filters, group_by, dateInterval):
    """Add a column of aggregate counts for *query* to *table*.

    @param group_by: field to group on, or "total" for a single overall count
    @param dateInterval: interval used when grouping on date
    """
    if group_by == "total":
        n = ES().count(query, filters)
        table.addValue("Total", column_name, n)
    else:
        results = ES().aggregate_query(query, filters, group_by, dateInterval)

        if group_by == "mediumid":
            # replace medium ids by "id - name" labels
            results = add_medium_names(results)

        for group, n in results:
            # FIX: unicode() is a Python-2-only builtin (NameError on
            # Python 3); str() is the equivalent here
            table.addValue(str(group), column_name, n)
    table.columnTypes[column_name] = int
def test_query_all(self):
    """Test that query_all works"""
    from amcat.models import Article
    arts = [amcattest.create_test_article(create=False) for _ in range(20)]
    aset = amcattest.create_test_set()
    Article.create_articles(arts, articleset=aset, check_duplicate=False, create_id=True)
    ES().flush()

    # a plain query honours the size limit...
    limited = list(ES().query(filters=dict(sets=aset.id), size=10))
    self.assertEqual(len(limited), 10)

    # ...while query_all pages through every result regardless of size
    everything = list(ES().query_all(filters=dict(sets=aset.id), size=10))
    self.assertEqual(len(everything), len(arts))
def test_not(self):
    """Do NOT queries return the complement of the matched set?"""
    aset = amcattest.create_test_set()
    eve, paul, adam = (amcattest.create_test_article(text=t, articleset=aset)
                       for t in ("eve", "paul", "adam"))
    ES().flush()

    def q(query):
        return set(ES().query_ids(query, filters={"sets": aset.id}))

    self.assertEqual({eve.id}, q("eve"))
    self.assertEqual({paul.id, adam.id}, q("NOT eve"))
    self.assertEqual({paul.id, adam.id}, q("* NOT eve"))
    # double negation matches the original article again
    self.assertEqual({eve.id}, q("NOT (NOT eve)"))
def test_estoken(self):
    """Does the tokens API endpoint return the analyzed tokens per article?"""
    aset = amcattest.create_test_set()
    a1 = amcattest.create_test_article(title="dit is de titel", text="En dit, dit is de tekst",
                                       articleset=aset, project=aset.project)
    a2 = amcattest.create_test_article(
        title="dit is nog een kop", text="Van je een, van je twee, van je drie!",
        articleset=aset, project=aset.project)
    ES().refresh()
    # url(r'^projects/(?P<project_id>[0-9]+)/articlesets/(?P<articleset_id>[0-9]+)/tokens/?$', TokensView.as_view(), name="tokens"),
    url = reverse("api:tokens", kwargs=dict(project_id=aset.project.id,
                                            articleset_id=aset.id)) + "?format=json"
    r = self.client.get(url)
    self.assertEqual(r.status_code, 200)
    tokens = json.loads(r.content.decode(r.charset))['results']
    # reassemble the token stream per article; the expected values show the
    # analyzer lowercases and strips punctuation
    words1 = " ".join(t["word"] for t in tokens if t['id'] == a1.id)
    words2 = " ".join(t["word"] for t in tokens if t['id'] == a2.id)
    self.assertEqual(words1, "dit is de titel en dit dit is de tekst")
    self.assertEqual(
        words2, "dit is nog een kop van je een van je twee van je drie")
def set_up(self):
    """Create three articles with author/length properties in two sets."""
    def make_article(text, day, author, length):
        art = amcattest.create_test_article()
        art.text = text
        art.date = datetime.datetime(2010, 1, day)
        art.properties = {"author": author, "length_int": length}
        art.save()
        return art

    self.a1 = make_article("aap noot mies", 1, "De Bas", 5)
    self.a2 = make_article("aap noot geit", 1, "Het Martijn", 5)
    self.a3 = make_article("lamp", 2, "Het Martijn", 15)

    # first set holds the two day-1 articles, second set the day-2 article
    self.aset1 = amcattest.create_test_set()
    self.aset1.add_articles([self.a1, self.a2])
    self.aset1.refresh_index(True)

    self.aset2 = amcattest.create_test_set()
    self.aset2.add_articles([self.a3])
    self.aset2.refresh_index(True)

    ES().refresh()
def get_mediums(self):
    """
    Return a sequence of Medium object used in this set
    """
    from amcat.tools.amcates import ES
    # ask elastic which medium ids occur in this set, then resolve to models
    ids = ES().list_media(filters=dict(sets=self.id))
    return Medium.objects.filter(id__in=ids)
def _do_query(self, query):
    """Execute *query* against elastic and return the raw response.

    @raise NotImplementedError: when the response is truncated at exactly
        self.size hits, since results beyond the window would silently be
        dropped until scrolling is implemented.
    """
    result = ES().search(query)
    if len(result["hits"]["hits"]) == self.size:
        # FIX: the message hard-coded 10000 while the check uses self.size;
        # report the actual window size
        raise NotImplementedError(
            "Returned {} articles exactly. Time to implement scroll :)".format(self.size))
    return result
def test_date(self):
    """Is a date round-tripped unchanged through iso8601, the db and elastic?"""
    # Test iso8601 parsing, database parsing, etc.
    iso8601_date_string = '1992-12-31T23:59:00'
    date = datetime.datetime(1992, 12, 31, 23, 59, 0)
    date_parsed = iso8601.parse_date(iso8601_date_string, default_timezone=None)
    a = amcattest.create_test_article(date=iso8601_date_string)
    self.assertEqual(date_parsed, date)
    self.assertEqual(a.date, date)
    ES().refresh()

    # Test Elastic date parsing: elastic stores the same iso8601 string
    es_date = ES().get(a.id)["date"]
    self.assertEqual(es_date, '1992-12-31T23:59:00')
    self.assertEqual(iso8601.parse_date(es_date, None), date)
def __iter__(self) -> Iterable[ESArticle]:
    """Yield ESArticle results; when highlighters are set, yield
    HighlightedESArticle objects with the highlights merged into the text."""
    if not self.highlights:
        # Case 1: no highlighters
        hits = ES().search(self.get_query())["hits"]["hits"]
        for hit in hits:
            _to_flat_dict(hit["fields"])
            yield ESArticle(self.fields, hit["fields"])
    else:
        # Case 2: at least one highlighter present. We need to execute a query for every
        # highlighter plus one for the original text.
        original_texts = self._do_query(self.get_query())["hits"]["hits"]
        for hit in original_texts:
            _to_flat_dict(hit["fields"])

        # Order might be unreliable, so we make mappings
        unordered = self.order_by(None)
        highlighted_texts = []
        for highlight in self.highlights:
            # NOTE(review): the result of unordered.get_query(highlight) is
            # discarded; the executed query below uses self.get_query(highlight)
            # instead — confirm whether the unordered query was intended here.
            unordered.get_query(highlight)
            result = self._do_query(self.get_query(highlight))
            for hit in result["hits"]["hits"]:
                _to_flat_dict(hit["highlight"])
            highlighted_texts.append({d["_id"]: d["highlight"] for d in result["hits"]["hits"]})

        markers = [h.mark for h in self.highlights]
        for text in original_texts:
            # fall back to the plain fields when a highlighter had no match
            highlighted = [h.get(text["_id"], text["fields"]) for h in highlighted_texts]
            merged = dict(merge_highlighted_document(text["fields"], highlighted, markers))
            yield HighlightedESArticle(self.fields, ChainMap(merged, text["fields"]))
def getTable(form, progress_monitor=NullMonitor):
    """Build an aggregation table for the given (validated) selection form.

    Rows are the xAxis groups, columns depend on yAxis: a single 'total'
    column, one column per medium, or one column per search term.
    """
    table = table3.DictTable(default=0)
    table.rowNamesRequired = True
    dateInterval = form['dateInterval']
    group_by = form['xAxis']
    if group_by == "medium":
        # elastic groups on the medium id field, not the display name
        group_by = "mediumid"

    filters = dict(filters_from_form(form))
    queries = list(queries_from_form(form))
    query = query_from_form(form)

    yAxis = form['yAxis']
    if yAxis == 'total':
        _add_column(table, 'total', query, filters, group_by, dateInterval)
        progress_monitor.update(90, "Got results")
    elif yAxis == 'medium':
        # one column per medium that actually occurs in the result set
        media = Medium.objects.filter(pk__in=ES().list_media(query, filters)).only("name")
        for medium in sorted(media):
            filters['mediumid'] = medium.id
            # commas/periods in the name would break downstream column parsing
            name = u"{medium.id} - {}".format(medium.name.replace(",", " ").replace(".", " "), **locals())
            _add_column(table, name, query, filters, group_by, dateInterval)
            progress_monitor.update(90 / len(media), "Got results for medium {medium.id}".format(**locals()))
    elif yAxis == 'searchTerm':
        # one column per sub-query
        for q in queries:
            _add_column(table, q.label, q.query, filters, group_by, dateInterval)
            progress_monitor.update(90 / len(queries), "Got results for {q.label!r}".format(**locals()))
    else:
        raise Exception('yAxis {yAxis} not recognized'.format(**locals()))

    table.queries = queries
    return table
def setup(self):
    """Create two sets: s1 with four uncreated-then-bulk-created articles,
    s2 with one directly-created article; return sets and articles."""
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    specs = [
        ('aap noot mies', 'm1', '2001-01-01'),
        ('noot mies wim zus', 'm2', '2001-02-01'),
        ('mies bla bla bla wim zus jet', 'm2', '2002-01-01'),
        ('noot mies wim zus', 'm2', '2001-02-03'),
    ]
    a, b, c, d = [amcattest.create_test_article(text=text, title=title, date=date, create=False)
                  for (text, title, date) in specs]
    e = amcattest.create_test_article(text='aap noot mies', title='m3', articleset=s2)

    Article.create_articles([a, b, c, d], articleset=s1)
    ES().refresh()
    return s1, s2, a, b, c, d, e
def test_highlight_article(self):
    """Are query terms wrapped in <em> tags when highlighting an article?"""
    s1, s2, a, b, c, d, e = self.setup()

    result = ES().highlight_article(a.id, "aap")
    self.assertEqual(result["text"], "<em>aap</em> noot mies")

    # FIX: this query/assertion pair appeared twice verbatim; one copy removed
    result = ES().highlight_article(a.id, "aap OR mies")
    self.assertEqual(result["text"], "<em>aap</em> noot <em>mies</em>")

    # phrase with slop 0 does not match, so nothing is highlighted
    result = ES().highlight_article(a.id, '"aap mies"~0')
    self.assertEqual(result["text"], "aap noot mies")

    result = ES().highlight_article(a.id, '"aap mies"~1')
    self.assertEqual(result["text"], "<em>aap</em> noot <em>mies</em>")
def get_article_ids_from_elastic(self):
    """
    Return the sequence of ids of articles in this set.
    As opposed to get_article_ids, this method uses elastic to fetch its data.

    @rtype: set
    """
    ids = ES().query_ids(filters={"sets": [self.id]})
    return set(ids)
def test_add_many(self):
    """Can we add a large number of articles from one set to another?"""
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    p = amcattest.create_test_project()

    # 1213 articles: deliberately not a round multiple of typical batch sizes
    arts = [amcattest.create_test_article(project=p, create=False) for _x in range(1213)]
    Article.create_articles(arts, s)
    ES().refresh()
    self.assertEqual(len(arts), s.get_count())

    s2.add_articles(arts)
    ES().refresh()
    # FIX: removed leftover debug print(s2.get_count())
    self.assertEqual(len(arts), s2.get_count())
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    # deliberately includes awkward content (CJK characters, control chars,
    # None values) so hashing must be stable across db and elastic forms
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "section": "\u6f22\u5b57",
        "pagenr": 1928390,
        "headline": "Headline hier.",
        "byline": "byline..",
        "length": 1928,
        "metastring": "Even more strange characters.. \x0C ..",
        "url": "https://example.com",
        "externalid": None,
        "author": None,
        "addressee": "Hmm",
        "text": "Contains invalid char \x08 woo",
        "medium": create_test_medium(name="abc."),
        "project": create_test_project()
    })
    article.save()

    es = ES()
    es.add_articles([article.id])
    # reference hash computed from the database representation
    hash = get_article_dict(article)["hash"]
    es.flush()

    # fetch only the hash-relevant fields back from elastic and recompute
    es_articles = es.query_all(filters={"ids": [article.id]}, fields=HASH_FIELDS + ["hash"])
    es_article = list(es_articles)[0]
    self.assertEqual(article.id, es_article.id)
    self.assertEqual(hash, es_article.hash)
    self.assertEqual(_get_hash(es_article.to_dict()), hash)
def add_articles(self, article_ids, add_to_index=True, monitor=NullMonitor()):
    """
    Add the given articles to this articleset. Implementation consists of three parts:

    1. Adding ArticleSetArticle objects
    2. Adding CodedArticle objects
    3. Updating index

    @param article_ids: articles to be added
    @type article_ids: iterable with indexing of integers or Article objects
    @param add_to_index: notify elasticsearch of changes
    @type add_to_index: bool
    """
    monitor = monitor.submonitor(total=4)
    # normalize to a set of ids (accepts both ints and Article objects)
    article_ids = {(art if type(art) is int else art.id) for art in article_ids}

    # Only use articles that exist
    to_add = article_ids - self.get_article_ids()
    to_add = list(Article.exists(to_add))

    monitor.update(message="Adding {n} articles to {aset}..".format(n=len(to_add), aset=self))
    ArticleSetArticle.objects.bulk_create(
        [ArticleSetArticle(articleset=self, article_id=artid) for artid in to_add],
        batch_size=100,
    )

    monitor.update(message="{n} articleset articles added to database, adding to codingjobs..".format(n=len(to_add)))
    # every (codingjob, article) combination gets a CodedArticle
    cjarts = [CodedArticle(codingjob=c, article_id=a)
              for c, a in itertools.product(self.codingjob_set.all(), to_add)]
    CodedArticle.objects.bulk_create(cjarts)

    if add_to_index:
        monitor.update(message="{n} articles added to codingjobs, adding to index".format(n=len(cjarts)))
        es = ES()
        es.add_to_set(self.id, to_add, monitor=monitor)
        es.refresh()  # We need to flush, or setting cache will fail

        # Add to property cache
        properties = ES().get_used_properties(article_ids=to_add)
        self._add_to_property_cache(properties)
    else:
        # skip the two index-related steps in the progress monitor
        monitor.update(2)
def determine_metadata(self, request, view):
    """Build metadata for the search view: per-field help texts, widgets and
    labels, plus the distinct values of each used article property."""
    form = view.get_form()
    field_names = list(form.fields.keys())
    fields = list(map(partial(getitem, form), field_names))
    articlesets = view.get_articlesets()
    # all property names used by any of the selected sets
    props = {prop for aset in articlesets for prop in aset.get_used_properties()}
    articleset_ids = list(articlesets.values_list('id', flat=True))
    # lucene limitation: split the set ids into chunks of <= 1000 per
    # terms clause, OR'ed together in a bool/should query
    setsquery = {
        'bool': {
            'should': [
                {'terms': {'sets': articleset_ids[i:i+1000]} }
                for i in range(0, len(articleset_ids), 1000)
            ]
        }
    }
    if props:
        aggs = ES().search({
            'aggs': {
                k: {
                    'terms': {
                        # "default"-typed properties are analyzed, so
                        # aggregate on the not-analyzed .raw subfield
                        'field': '{}.raw'.format(k) if get_property_mapping_type(k) == "default" else k,
                        'size': self.bucket_count_limit
                    }
                } for k in props
            },
            'query': setsquery
        })['aggregations']
        # reduce each aggregation to the list of distinct bucket keys
        filter_props = {k: [v['key'] for v in vs['buckets']] for k, vs in aggs.items()}
    else:
        filter_props = {}

    return {
        "help_texts": OrderedDict(zip(field_names, [f.help_text.strip() or None for f in fields])),
        "form": OrderedDict(zip(field_names, [f.as_widget() for f in fields])),
        "labels": OrderedDict(zip(field_names, [f.label for f in fields])),
        "help_text": view.get_view_description(),
        "filter_properties": filter_props  # TODO: filter_properties should be moved to a different view.
    }
class SelectionSearch:
    """Execute an (already validated) SelectionForm against elastic.

    Builds filters and queries from the form's cleaned data and exposes
    (mostly cached) accessors for counts, statistics, aggregates and articles.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles for given form.
        Yields iterables of tuples containing (filter_name, filter_value).

        @type form: SelectionForm
        """
        yield get_date_filters(
            self.data.start_date,
            self.data.end_date,
            self.data.on_date,
            self.data.datetype
        )
        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        @rtype: str
        """
        return ' OR '.join('(%s)' % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects
        @rtype: iterable of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            # fetch all labels in one go instead of one query per label
            codebook.cache_labels()

        queries = map(str.strip, self.data.query.split("\n"))
        # filter empty lines
        queries = filter(lambda x: x, queries)
        queries = map(SearchQuery.from_string, queries)

        resolved = resolve_queries(
            list(queries),
            codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        # labels starting with "_" are helper definitions, not output queries
        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # try queries one by one, to point the user at the failing query
            for i, q in enumerate(self.get_queries()):
                queryparser.parse_to_terms(q.query, context=(q.declared_label or i+1))
            # if error wasn't raised yet, re-raise original
            raise

    @cached
    def get_statistics(self):
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, categories, flat=True):
        # If we're aggregating on terms, we don't want a global filter
        query = None
        if not any(isinstance(c, TermCategory) for c in categories):
            query = self.get_query()

        aggr = aggregate(query, self.get_filters(), categories, flat=flat)
        return sorted(aggr, key=to_sortable_tuple)

    def get_nested_aggregate(self, categories):
        return to_nested(self.get_aggregate(categories))

    def get_medium_ids(self):
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """Return ES query results for the selection, paged by offset/size."""
        query = self.get_query()
        # without a query there is nothing to highlight; show a lead instead
        lead = not query
        fields = ['headline','text','date', 'length','medium','author','section']
        return ES().query(query, self.get_filters(), True, size=size, from_=offset,
                          fields=fields, lead=lead)
class SelectionSearch:
    """Execute an (already validated) SelectionForm against elastic.

    Builds filters and queries from the form's cleaned data and exposes
    (mostly cached) accessors for counts, statistics, aggregates and articles.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles for given form.
        Yields iterables of tuples containing (filter_name, filter_value).

        @type form: SelectionForm
        """
        yield get_date_filters(self.data.start_date, self.data.end_date,
                               self.data.on_date, self.data.datetype)
        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        @rtype: str
        """
        return " OR ".join("(%s)" % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects
        @rtype: iterable of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            # fetch all labels in one go instead of one query per label
            codebook.cache_labels()

        # FIX: unicode is a Python-2-only builtin (NameError on Python 3)
        queries = map(str.strip, self.data.query.split("\n"))
        # skip empty lines, which would otherwise yield invalid queries
        queries = filter(None, queries)
        queries = map(SearchQuery.from_string, queries)

        resolved = resolve_queries(
            # materialize: resolve_queries may iterate more than once
            list(queries),
            codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        # labels starting with "_" are helper definitions, not output queries
        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        return self.es.count(self.get_query(), self.get_filters())

    @cached
    def get_statistics(self):
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, x_axis, y_axis, interval="month"):
        """Aggregate the selection over the given axes.

        @param interval: date interval used when an axis groups on date
        """
        x_axis = FIELD_MAP.get(x_axis, x_axis)
        y_axis = FIELD_MAP.get(y_axis, y_axis)

        if y_axis == "total":
            group_by = [x_axis]
        else:
            group_by = [x_axis, y_axis]

        # when aggregating on terms, a global query would bias the counts
        query = None if "term" in (x_axis, y_axis) else self.get_query()

        aggr = ES().aggregate_query(
            query=query,
            terms=self.get_queries(),
            filters=self.get_filters(),
            group_by=group_by,
            date_interval=interval,
            sets=map(attrgetter("id"), self.data.articlesets),
        )

        # replace medium / articleset ids by their model objects
        aggr = get_mediums(aggr, list(group_by))
        aggr = get_articlesets(aggr, list(group_by))
        return aggr

    def get_medium_ids(self):
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """Return Article model objects for the selection, in elastic order,
        optionally sliced by offset/size."""
        article_ids = self.get_article_ids()

        if size is not None:
            article_ids = islice(article_ids, offset, size + offset)

        # Return in order
        article_ids = tuple(article_ids)
        article_dict = Article.objects.in_bulk(article_ids)
        return (article_dict[pk] for pk in article_ids)
class SelectionSearch:
    """Execute an (already validated) SelectionForm against elastic.

    Builds filters and queries from the form data and exposes (mostly
    cached) accessors for counts, statistics, aggregates and articles.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form)

    def _get_set_filters(self):
        # separate method so subclasses can filter on something other than sets
        yield "sets", [a.id for a in self.data.articlesets]

    def _get_filters(self) -> Iterable[Tuple[str, Any]]:
        """
        Get filters for dates, articlesets and articles for given form.
        Yields iterables of tuples containing (filter_name, filter_value).

        @type form: SelectionForm
        """
        if self.data.start_date is not None:
            yield "start_date", self.data.start_date
        if self.data.end_date is not None:
            yield "end_date", self.data.end_date
        yield "ids", self.data.article_ids or None
        yield from self._get_set_filters()
        if self.data.filters:
            # additional per-property filters declared on the form
            for filter in self.data.filters:
                yield from filter.get_filter_kwargs()

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in self._get_filters() if v is not None}

    @cached
    def get_query(self):
        """
        @rtype: str
        """
        return ' OR '.join('(%s)' % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects
        @rtype: iterable of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            # fetch all labels in one go instead of one query per label
            codebook.cache_labels()

        queries = map(str.strip, self.data.query.split("\n"))
        # filter empty lines
        queries = filter(lambda x: x, queries)
        queries = map(SearchQuery.from_string, queries)

        resolved = resolve_queries(
            list(queries),
            codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        # labels starting with "_" are helper definitions, not output queries
        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # try queries one by one, to point the user at the failing query
            for i, q in enumerate(self.get_queries()):
                queryparser.parse_to_terms(q.query, context=(q.declared_label or i + 1))
            # if error wasn't raised yet, re-raise original
            raise

    @cached
    def get_statistics(self):
        return self.es.statistics(self.get_query(), self.get_filters())

    def get_aggregate(self, categories, flat=True, objects=True):
        # If we're aggregating on terms, we don't want a global filter
        query = None
        if not any(isinstance(c, TermCategory) for c in categories):
            query = self.get_query()

        return aggregate(query, self.get_filters(), categories, flat=flat, objects=objects)

    def get_nested_aggregate(self, categories):
        return to_nested(self.get_aggregate(categories))

    def get_article_ids(self):
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0, fields=(), **kwargs):
        # highlight=True; _source limits which fields elastic returns
        return ES().query(self.get_query(), self.get_filters(), True, size=size,
                          from_=offset, _source=fields, **kwargs)

    @staticmethod
    def get_instance(form):
        """
        Gets a SelectionSearch instance depending on the selection data. If codingjobs are given,
        a CodingJobSelectionSearch is returned.

        :param form: A SelectionForm
        :return: An instance of SelectionSearch that is appropriate for the given SelectionForm.
        """
        data = SelectionData(form)
        if data.codingjobs:
            return CodingJobSelectionSearch(form)
        if data.articlesets:
            return SelectionSearch(form)
        raise Exception("Invalid selection: no articlesets or codingjobs given.")