Пример #1
0
    def test_properties(self):
        """Check that article properties end up as flat ES fields with the expected mapping and tokens."""
        props = {
            "proptest": "123 test, and another",
            "proptest2_url": "http://example.org",
            "proptest3_date": "2001-01-01",
            "proptest4_num": -1,
            "proptest5_tag": {"123 test", "and another"},
            "proptest6_id": "123 test, and another",
        }
        prop_names = set(props)

        # Before the article exists, none of the property names may be mapped.
        known_before = set(ES().get_mapping().keys())
        self.assertEqual(prop_names & known_before, set())

        a = amcattest.create_test_article(properties=props)

        mapping = ES().get_mapping()
        expected_types = {
            "proptest": "default",
            "proptest2_url": "url",
            "proptest3_date": "date",
            "proptest4_num": "num",
            "proptest5_tag": "tag",
        }
        for field, ftype in expected_types.items():
            self.assertEqual(mapping[field], settings.ES_MAPPING_TYPES[ftype])

        # The returned document itself is not inspected below.
        ES().get(a.id)
        self.assertEqual(set(mapping.keys()), prop_names | ALL_FIELDS)

        # test if term vectors are correct, i.e. test analysis
        def tokens(field):
            found = sorted(ES().get_tokens(a.id, fields=[field]))
            return [token for (_, _, token) in found]

        self.assertEqual(tokens("proptest"), ["123", "test", "and", "another"])
        self.assertEqual(set(tokens("proptest5_tag")),
                         {"123 test", "and another"})
        self.assertEqual(tokens("proptest6_id"), ["123 test, and another"])
        self.assertEqual(tokens("proptest2_url"), ["http://example.org"])
Пример #2
0
    def _run(self, job):
        """
        Store the codings of every coded article of `job` on the article's ES document.

        For each coded article a dict ``{"job": job.id, "sentence_codings": [...]}`` is
        built. A coding without a sentence is stored under ``"article_coding"``; every
        other coding is appended to ``"sentence_codings"`` together with its sentence id.
        Any existing entry for this job is removed from the document's ``codings`` list
        before the new one is appended, and the document is then re-indexed.

        @param job: coding job; must offer ``id`` and ``coded_articles.all()``
        """
        es = ES()
        for ca in job.coded_articles.all():
            coding_json = {"job": job.id, "sentence_codings": []}

            for coding in ca.codings.all():
                values = {}
                print((coding, coding.sentence))
                for cv in coding.values.all():
                    label = cv.field.label
                    fieldtype = cv.field.fieldtype.name
                    if fieldtype == "Codebook":
                        values[label] = cv.intval
                        values[label + "_label"] = cv.value.label
                    elif fieldtype == "Text":
                        values[label + "_label"] = cv.value
                    elif fieldtype == "Quality":
                        # NOTE(review): `/` truncates on Python 2 ints but not on
                        # Python 3 -- confirm which result is wanted here.
                        values[label] = cv.value / 10
                    elif fieldtype == "Yes/No":
                        values[label + "_bool"] = cv.value
                    else:
                        values[label] = cv.value

                if coding.sentence_id is None:
                    coding_json["article_coding"] = values
                else:
                    values["sentence_id"] = coding.sentence_id
                    # Store the collected values (not the coding object itself), so
                    # that the sentence id and field values end up in the document.
                    coding_json["sentence_codings"].append(values)

            src = es.get(ca.article_id)
            # Drop an earlier entry for this job so that it is replaced, not duplicated.
            src["codings"] = [c for c in src.get('codings', [])
                              if c['job'] != job.id]
            src['codings'].append(coding_json)
            es.es.index(index=es.index, doc_type=es.doc_type, id=ca.article_id, body=src)
Пример #3
0
    def _run(self, job):
        """
        Store the codings of every coded article of `job` on the article's ES document.

        For each coded article a dict ``{"job": job.id, "sentence_codings": [...]}`` is
        built. A coding without a sentence is stored under ``"article_coding"``; every
        other coding is appended to ``"sentence_codings"`` together with its sentence id.
        Any existing entry for this job is removed from the document's ``codings`` list
        before the new one is appended, and the document is then re-indexed.

        @param job: coding job; must offer ``id`` and ``coded_articles.all()``
        """
        es = ES()
        for ca in job.coded_articles.all():
            coding_json = {"job": job.id, "sentence_codings": []}

            for coding in ca.codings.all():
                values = {}
                # Same text as the former `print coding, coding.sentence` statement,
                # written so that it also parses on Python 3.
                print("%s %s" % (coding, coding.sentence))
                for cv in coding.values.all():
                    label = cv.field.label
                    fieldtype = cv.field.fieldtype.name
                    if fieldtype == "Codebook":
                        values[label] = cv.intval
                        values[label + "_label"] = cv.value.label
                    elif fieldtype == "Text":
                        values[label + "_label"] = cv.value
                    elif fieldtype == "Quality":
                        # NOTE(review): `/` truncates on Python 2 ints but not on
                        # Python 3 -- confirm which result is wanted here.
                        values[label] = cv.value / 10
                    elif fieldtype == "Yes/No":
                        values[label + "_bool"] = cv.value
                    else:
                        values[label] = cv.value

                if coding.sentence_id is None:
                    coding_json["article_coding"] = values
                else:
                    values["sentence_id"] = coding.sentence_id
                    # Store the collected values (not the coding object itself), so
                    # that the sentence id and field values end up in the document.
                    coding_json["sentence_codings"].append(values)

            src = es.get(ca.article_id)
            # Drop an earlier entry for this job so that it is replaced, not duplicated.
            src["codings"] = [c for c in src.get('codings', [])
                              if c['job'] != job.id]
            src['codings'].append(coding_json)
            es.es.index(index=es.index, doc_type=es.doc_type, id=ca.article_id, body=src)
Пример #4
0
def getArticles(form, **kargs):
    """
    Run the search described by `form` and return the matching rows as a list.

    The query and filters are derived from the form. When there is a query,
    ``highlight=True`` is passed on to ES().query, otherwise ``lead=True``. If the
    'hits' column is requested, every row gets a ``hits`` dict (query label -> score)
    that is filled by running each individual query restricted to the found ids.

    Raises NotImplementedError when the 'keywordInContext' column is requested.
    """
    fields = ['mediumid', 'date', 'headline', 'medium']

    sort = form.get('sortColumn', None)
    columns = form['columns']

    if 'keywordInContext' in columns:
        raise NotImplementedError()

    query = query_from_form(form)
    option = "highlight" if query else "lead"
    kargs[option] = True

    filters = dict(filters_from_form(form))

    log.info("Query: {query!r}, with filters: {filters}".format(query=query, filters=filters))

    score = 'hits' in columns
    rows = ES().query(query, filters=filters, fields=fields, sort=sort, score=score, **kargs)
    result = list(rows)

    if score:
        # add hits columns, initialised to zero for every query label
        result_dict = {}
        for row in result:
            row.hits = {q.label: 0 for q in form['queries']}
            result_dict[row.id] = row

        id_filter = {"ids": list(result_dict)}

        for q in queries_from_form(form):
            for hit in ES().query(q.query, filters=id_filter, fields=[]):
                result_dict[hit.id].hits[q.label] = hit.score

    return result
Пример #5
0
    def test_filters(self):
        """Check the medium, set and date filters of query_ids, alone and combined."""
        m1, m2 = (amcattest.create_test_medium() for _ in range(2))
        a = amcattest.create_test_article(
            text='aap noot mies', medium=m1, date="2001-01-01")
        b = amcattest.create_test_article(
            text='noot mies wim zus', medium=m2, date="2002-01-01")
        c = amcattest.create_test_article(
            text='mies bla bla bla wim zus jet', medium=m2, date="2003-01-01")

        s1 = amcattest.create_test_set(articles=[a, b, c])
        s2 = amcattest.create_test_set(articles=[a, b])
        ES().flush()

        def q(**filters):
            """Return the set of ids matching the given filters."""
            return set(ES().query_ids(filters=filters))

        # MEDIUM FILTER
        self.assertEqual(q(mediumid=m2.id), {b.id, c.id})

        #### DATE FILTERS
        self.assertEqual(q(sets=s1.id, start_date='2001-06-01'), {b.id, c.id})
        # start is inclusive
        self.assertEqual(
            q(sets=s1.id, start_date='2002-01-01', end_date="2002-06-01"), {b.id})
        # end is exclusive
        self.assertEqual(
            q(sets=s1.id, start_date='2001-01-01', end_date="2003-01-01"), {a.id, b.id})

        # COMBINATION
        self.assertEqual(q(sets=s2.id, start_date='2001-06-01'), {b.id})
        self.assertEqual(q(end_date='2002-06-01', mediumid=m2.id), {b.id})
Пример #6
0
    def test_aggregate(self):
        """Check per-year and per-month aggregation on date, plus the set statistics."""
        s1, s2, a, b, c, d, e = self.setup()

        def counts_per(interval):
            """Aggregate set s1 on date with the given interval, as a dict."""
            pairs = ES().aggregate_query(filters=dict(sets=s1.id),
                                         group_by="date",
                                         date_interval=interval)
            return dict(pairs)

        by_year = {
            datetime.datetime(2001, 1, 1): 3,
            datetime.datetime(2002, 1, 1): 1,
        }
        self.assertEqual(counts_per("year"), by_year)

        by_month = {
            datetime.datetime(2001, 1, 1): 1,
            datetime.datetime(2002, 1, 1): 1,
            datetime.datetime(2001, 2, 1): 2,
        }
        self.assertEqual(counts_per("month"), by_month)

        # set statistics
        stats = ES().statistics(filters=dict(sets=s1.id))
        self.assertEqual(stats.n, 4)
        self.assertEqual(stats.start_date, datetime.datetime(2001, 1, 1))
        self.assertEqual(stats.end_date, datetime.datetime(2002, 1, 1))
Пример #7
0
    def test_properties(self):
        """Check that article properties end up as flat ES fields with the expected mapping and tokens."""
        props = {
            "proptest": "123 test, and another",
            "proptest2_url": "http://example.org",
            "proptest3_date": "2001-01-01",
            "proptest4_num": -1,
            "proptest5_tag": {"123 test", "and another"},
            "proptest6_id": "123 test, and another",
        }
        prop_names = set(props)

        # Before the article exists, none of the property names may be mapped.
        mapped_before = set(ES().get_mapping().keys())
        self.assertEqual(prop_names & mapped_before, set())

        a = amcattest.create_test_article(properties=props)

        mapping = ES().get_mapping()
        type_per_field = {
            "proptest": "default",
            "proptest2_url": "url",
            "proptest3_date": "date",
            "proptest4_num": "num",
            "proptest5_tag": "tag",
        }
        for field, ftype in type_per_field.items():
            self.assertEqual(mapping[field], settings.ES_MAPPING_TYPES[ftype])

        # The returned document itself is not inspected below.
        ES().get(a.id)
        self.assertEqual(set(mapping.keys()), prop_names | ALL_FIELDS)

        # test if term vectors are correct, i.e. test analysis
        def tokens(field):
            found = sorted(ES().get_tokens(a.id, fields=[field]))
            return [token for (_, _, token) in found]

        self.assertEqual(tokens("proptest"), ["123", "test", "and", "another"])
        self.assertEqual(set(tokens("proptest5_tag")), {"123 test", "and another"})
        self.assertEqual(tokens("proptest6_id"), ["123 test, and another"])
        self.assertEqual(tokens("proptest2_url"), ["http://example.org"])
Пример #8
0
 def test_complex_phrase_query(self):
     """Proximity phrase query with wildcards inside the phrase. DOES NOT WORK YET"""
     texts = ('aap noot mies', 'noot mies wim zus', 'mies bla bla bla wim zus jet')
     a, b, c = (amcattest.create_test_article(text=text) for text in texts)
     s1 = amcattest.create_test_set(articles=[a, b, c])
     ES().add_articles([a.id, b.id, c.id])

     found = set(ES().query_ids('"mi* wi*"~5', filters=dict(sets=s1.id)))
     self.assertEqual(found, {b.id, c.id})
Пример #9
0
 def _refresh_property_cache(self) -> Set[str]:
     """Rebuild the property cache from the properties ES reports for this object's id."""
     from amcat.tools.amcates import ES
     index = ES()
     index.refresh()
     # Look the properties up first; the cache is only reset once that succeeded.
     used = index.get_used_properties([self.id])
     self._reset_property_cache()
     return self._add_to_property_cache(used)
Пример #10
0
 def refresh_index(self, full_refresh=False):
     """
     Make sure that the index for this set is up to date.

     Runs ES().check_index(), synchronizes this articleset with the index and
     finally saves this object.

     @param full_refresh: passed on unchanged to ES().synchronize_articleset
     """
     from amcat.tools.amcates import ES
     ES().check_index()
     ES().synchronize_articleset(self, full_refresh=full_refresh)
     self.save()
Пример #11
0
 def __init__(self, form):
     """
     Set up the search for the given form.

     Form *must* be valid before passing: its cleaned_data is read here.

     @type form: SelectionForm
     """
     self.es = ES()
     self.form = form
     # Wrap the cleaned form values in a SelectionData object.
     self.data = SelectionData(form.cleaned_data)
Пример #12
0
 def _refresh_property_cache(self) -> Set[str]:
     """Rebuild the property cache from the properties ES reports for this object's id."""
     from amcat.tools.amcates import ES
     index = ES()
     index.refresh()
     # Look the properties up first; the cache is only reset once that succeeded.
     used = index.get_used_properties([self.id])
     self._reset_property_cache()
     return self._add_to_property_cache(used)
Пример #13
0
 def test_query(self):
     """Check that query() returns the requested fields and query_ids() honours a medium filter."""
     article = amcattest.create_test_article(
         headline="bla", text="artikel artikel een", date="2001-01-01")
     ES().flush()

     hits = list(ES().query("een", fields=["date", "headline"]))
     # Unpacking fails unless exactly one article matched.
     (hit,) = hits
     self.assertEqual(hit.headline, "bla")
     self.assertEqual(hit.id, article.id)

     medium_filter = {"mediumid": article.medium_id}
     self.assertEqual(set(ES().query_ids(filters=medium_filter)), {article.id})
Пример #14
0
    def inner(*args, **kargs):
        """Call `func` after the ES index was deleted and checked again; skip if ES does not answer a ping."""
        from amcat.tools.amcates import ES

        client = ES()
        reachable = client.es.ping()
        if not reachable:
            raise unittest.SkipTest("ES not enabled")

        # Start from a clean slate: drop the index, then run check_index() on a fresh ES.
        client.delete_index()
        ES().check_index()
        return func(*args, **kargs)
Пример #15
0
 def inner(*args, **kargs):
     """Call `func` against the '__unittest' index after deleting and re-checking it; skip if ES does not answer a ping."""
     from amcat.tools.amcates import ES

     # Append the suffix only once, however often the wrapper runs.
     suffix = "__unittest"
     if not settings.ES_INDEX.endswith(suffix):
         settings.ES_INDEX += suffix

     client = ES()
     reachable = client.es.ping()
     if not reachable:
         raise unittest.SkipTest("ES not enabled")

     client.delete_index()
     ES().check_index()
     return func(*args, **kargs)
Пример #16
0
 def inner(*args, **kargs):
     """Call `func` against the '__unittest' index after deleting and re-checking it; skip if ES does not answer a ping."""
     from amcat.tools.amcates import ES

     # Append the suffix only once, however often the wrapper runs.
     suffix = "__unittest"
     if not settings.ES_INDEX.endswith(suffix):
         settings.ES_INDEX += suffix

     client = ES()
     reachable = client.es.ping()
     if not reachable:
         raise unittest.SkipTest("ES not enabled")

     client.delete_index()
     ES().check_index()
     return func(*args, **kargs)
Пример #17
0
    def test_scores(self):
        """test if scores (and matches) are as expected for various queries"""
        scored = amcattest.create_test_article(headline="a", text='dit is een test')
        s = amcattest.create_test_set(articles=[scored])

        s.refresh_index()

        def q(query):
            """Map headline -> score for the hits of `query` within set s."""
            hits = ES().query(query, filters={'sets': s.id}, fields=["headline"])
            return {hit.headline: hit.score for hit in hits}

        self.assertEqual(q("test"), {"a": 1})

        m1, m2 = (amcattest.create_test_medium() for _ in range(2))
        a = amcattest.create_test_article(text='aap noot mies', medium=m1)
        b = amcattest.create_test_article(text='noot mies wim zus', medium=m2)
        c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2)
        d = amcattest.create_test_article(
            text='ik woon in een sociale huurwoning, net als anderen', medium=m2)
        ES().flush()

        def in_m2(query):
            """Ids matching `query` among the articles of medium m2."""
            return set(ES().query_ids(query, filters=dict(mediumid=m2.id)))

        self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})
        self.assertEqual(in_m2("no*"), {b.id})
        self.assertEqual(in_m2("zus AND jet"), {c.id})
        self.assertEqual(in_m2("zus OR jet"), {b.id, c.id})
        self.assertEqual(in_m2('"mies wim"'), {b.id})
        self.assertEqual(in_m2('"mies wim"~5'), {b.id, c.id})

        # The same phrase-with-wildcard query is checked twice.
        self.assertEqual(in_m2('"sociale huur*"'), {d.id})
        self.assertEqual(in_m2('"sociale huur*"'), {d.id})
Пример #18
0
    def test_byline(self):
        """Check that a byline is found by 'byline:' queries and by plain queries."""
        aset = amcattest.create_test_set()
        amcattest.create_test_article(byline="bob", text="eve", articleset=aset)

        ES().flush()

        def q(query):
            """Ids within aset that match `query`."""
            return set(ES().query_ids(query, filters={"sets": aset.id}))

        expected_counts = (("byline:bob", 1), ("byline:eve", 0), ("bob", 1))
        for query, expected in expected_counts:
            self.assertEqual(expected, len(q(query)))
Пример #19
0
 def test_query(self):
     """Check that query() returns the requested fields and query_ids() honours a title filter."""
     article = amcattest.create_test_article(
         title="bla", text="artikel artikel een", date="2001-01-01")
     ES().refresh()

     hits = list(ES().query("een", fields=["date", "title"]))
     # Unpacking fails unless exactly one article matched.
     (hit,) = hits
     self.assertEqual(hit.title, "bla")
     self.assertEqual(hit.id, article.id)

     title_filter = {"title": 'bla'}
     self.assertEqual(set(ES().query_ids(filters=title_filter)), {article.id})
def set_default_similarity(*args, **kwargs):
    """
    Push settings.ES_SETTINGS to the ES index.

    The index is checked and refreshed first; it is then closed while the
    settings are written and opened again afterwards. All arguments are ignored.
    """
    # Make sure index exists
    index = ES()
    index.check_index()
    index.refresh()

    # Push new settings to indices
    indices_api = index.es.indices
    indices_api.close(index.index)
    indices_api.put_settings(settings.ES_SETTINGS, index.index)
    indices_api.open(index.index)
Пример #21
0
    def test_articlesets(self):
        """Check the 'sets' field of an indexed article and filtering on set id."""
        a, b, c = (amcattest.create_test_article() for _ in range(3))
        s1 = amcattest.create_test_set(articles=[a, b, c])
        s2 = amcattest.create_test_set(articles=[b, c])
        s3 = amcattest.create_test_set(articles=[b])
        ES().refresh()

        # c is in s1 and s2, but not in s3.
        indexed_c = ES().get(c.id)
        self.assertEqual(set(indexed_c['sets']), {s1.id, s2.id})

        in_s1 = set(ES().query_ids(filters={"sets": s1.id}))
        self.assertEqual(in_s1, {a.id, b.id, c.id})
Пример #22
0
    def create(self, validated_data):
        """
        Return the article for `validated_data`, indexed and added to the view's articleset.

        An existing article with the given uuid is reused; when there is none, or when
        no uuid was given, a new one is created through the parent serializer.
        """
        try:
            article = Article.objects.get(uuid=validated_data["uuid"])
        except (Article.DoesNotExist, KeyError):
            article = super(ArticleSerializer, self).create(validated_data)

        index = ES()
        index.add_articles([article.id])
        index.flush()

        articleset = self.context["view"].articleset
        articleset.add_articles([article])
        return article
Пример #23
0
def _add_column(table, column_name, query, filters, group_by, dateInterval):
    """
    Fill column `column_name` of `table` with hit counts for `query` and `filters`.

    With group_by "total" a single "Total" row holding the overall count is added;
    otherwise one row per group of the aggregation is added. For "mediumid" the
    aggregation result is first passed through add_medium_names. The column type
    is set to int in both cases.
    """
    if group_by != "total":
        rows = ES().aggregate_query(query, filters, group_by, dateInterval)
        if group_by == "mediumid":
            rows = add_medium_names(rows)

        for group, count in rows:
            table.addValue(unicode(group), column_name, count)
    else:
        total = ES().count(query, filters)
        table.addValue("Total", column_name, total)

    table.columnTypes[column_name] = int
Пример #24
0
    def test_query_all(self):
        """Check that query() stops at `size` hits while query_all() returns every article."""
        from amcat.models import Article
        arts = [amcattest.create_test_article(create=False) for _ in range(20)]
        s = amcattest.create_test_set()
        Article.create_articles(arts, articleset=s, check_duplicate=False, create_id=True)
        ES().flush()

        set_filter = dict(sets=s.id)

        limited = list(ES().query(filters=set_filter, size=10))
        self.assertEqual(len(limited), 10)

        everything = list(ES().query_all(filters=set_filter, size=10))
        self.assertEqual(len(everything), len(arts))
Пример #25
0
    def test_not(self):
        """Check NOT queries: bare, combined with '*', and nested."""
        aset = amcattest.create_test_set()
        eve = amcattest.create_test_article(text="eve", articleset=aset)
        paul = amcattest.create_test_article(text="paul", articleset=aset)
        adam = amcattest.create_test_article(text="adam", articleset=aset)

        ES().flush()

        def q(query):
            """Ids within aset that match `query`."""
            return set(ES().query_ids(query, filters={"sets": aset.id}))

        only_eve = {eve.id}
        all_but_eve = {paul.id, adam.id}

        self.assertEqual(only_eve, q("eve"))
        self.assertEqual(all_but_eve, q("NOT eve"))
        self.assertEqual(all_but_eve, q("* NOT eve"))
        self.assertEqual(only_eve, q("NOT (NOT eve)"))
Пример #26
0
    def test_estoken(self):
        """Check that the tokens API returns the words of title and text, per article, in order."""
        aset = amcattest.create_test_set()
        shared = dict(articleset=aset, project=aset.project)
        a1 = amcattest.create_test_article(
            title="dit is de titel", text="En dit, dit is de tekst", **shared)
        a2 = amcattest.create_test_article(
            title="dit is nog een kop",
            text="Van je een, van je twee, van je drie!",
            **shared)

        ES().refresh()

        # Route for reference:
        #     url(r'^projects/(?P<project_id>[0-9]+)/articlesets/(?P<articleset_id>[0-9]+)/tokens/?$', TokensView.as_view(), name="tokens"),
        route_args = {"project_id": aset.project.id, "articleset_id": aset.id}
        url = reverse("api:tokens", kwargs=route_args) + "?format=json"
        r = self.client.get(url)
        self.assertEqual(r.status_code, 200)

        payload = json.loads(r.content.decode(r.charset))
        tokens = payload['results']

        def words_of(article):
            """Join the words of all tokens that belong to `article`."""
            return " ".join(t["word"] for t in tokens if t['id'] == article.id)

        self.assertEqual(words_of(a1), "dit is de titel en dit dit is de tekst")
        self.assertEqual(
            words_of(a2), "dit is nog een kop van je een van je twee van je drie")
Пример #27
0
    def set_up(self):
        """Create three articles in two refreshed articlesets: a1 and a2 in aset1, a3 in aset2."""

        def make_article(text, date, properties):
            """Create a test article, overwrite the given attributes and save it."""
            article = amcattest.create_test_article()
            article.text = text
            article.date = date
            article.properties = properties
            article.save()
            return article

        def make_set(articles):
            """Create a test set holding `articles` and fully refresh its index."""
            aset = amcattest.create_test_set()
            aset.add_articles(articles)
            aset.refresh_index(True)
            return aset

        self.a1 = make_article("aap noot mies", datetime.datetime(2010, 1, 1),
                               {"author": "De Bas", "length_int": 5})
        self.a2 = make_article("aap noot geit", datetime.datetime(2010, 1, 1),
                               {"author": "Het Martijn", "length_int": 5})
        self.a3 = make_article("lamp", datetime.datetime(2010, 1, 2),
                               {"author": "Het Martijn", "length_int": 15})

        self.aset1 = make_set([self.a1, self.a2])
        self.aset2 = make_set([self.a3])

        ES().refresh()
Пример #28
0
 def get_mediums(self):
     """
     Return the Medium objects whose ids ES lists for the articles of this set.
     """
     from amcat.tools.amcates import ES
     set_filter = {"sets": self.id}
     used_ids = ES().list_media(filters=set_filter)
     return Medium.objects.filter(id__in=used_ids)
Пример #29
0
    def _do_query(self, query):
        """
        Execute `query` through ES().search and return the raw result dict.

        Raises NotImplementedError when the number of returned hits equals
        self.size, as no scrolling is implemented to fetch further hits.
        """
        result = ES().search(query)

        # Exactly self.size hits presumably means the result was cut off at the limit.
        if len(result["hits"]["hits"]) == self.size:
            # Report the actual limit rather than a hard-coded number.
            raise NotImplementedError(
                "Returned {} articles exactly. Time to implement scroll :)".format(self.size))

        return result
Пример #30
0
    def test_date(self):
        """Check that an ISO 8601 date string survives parsing, the database and Elastic unchanged."""
        # Test iso8601 parsing, database parsing, etc.
        iso8601_date_string = '1992-12-31T23:59:00'
        expected = datetime.datetime(1992, 12, 31, 23, 59, 0)
        parsed = iso8601.parse_date(iso8601_date_string, default_timezone=None)
        article = amcattest.create_test_article(date=iso8601_date_string)
        self.assertEqual(parsed, expected)
        self.assertEqual(article.date, expected)

        ES().refresh()

        # Test Elastic date parsing
        indexed = ES().get(article.id)
        es_date = indexed["date"]
        self.assertEqual(es_date, '1992-12-31T23:59:00')
        self.assertEqual(iso8601.parse_date(es_date, None), expected)
Пример #31
0
    def __iter__(self) -> Iterable[ESArticle]:
        """
        Execute the query and yield one article object per hit.

        Without highlighters an ESArticle is yielded per hit. With highlighters, one
        extra query is run per highlighter and a HighlightedESArticle is yielded per
        hit of the plain query, built from the merged highlights with the hit's own
        fields as fallback.
        """
        if not self.highlights:
            # Case 1: no highlighters
            hits = ES().search(self.get_query())["hits"]["hits"]
            for hit in hits:
                # Return value unused: presumably flattens the dict in place -- TODO confirm
                _to_flat_dict(hit["fields"])
                yield ESArticle(self.fields, hit["fields"])
        else:
            # Case 2: at least one highlighter present. We need to execute a query for every
            # highlighter plus one for the original text.
            original_texts = self._do_query(self.get_query())["hits"]["hits"]
            for hit in original_texts:
                _to_flat_dict(hit["fields"])

            # Order might be unreliable, so we make mappings
            unordered = self.order_by(None)
            highlighted_texts = []
            for highlight in self.highlights:
                # NOTE(review): the result of this call is discarded and the query that is
                # executed on the next line is built from self, not from `unordered` --
                # presumably the unordered query was meant to be used; confirm.
                unordered.get_query(highlight)
                result = self._do_query(self.get_query(highlight))
                for hit in result["hits"]["hits"]:
                    _to_flat_dict(hit["highlight"])
                # One mapping per highlighter: document _id -> its highlight dict
                highlighted_texts.append({d["_id"]: d["highlight"] for d in result["hits"]["hits"]})

            markers = [h.mark for h in self.highlights]
            for text in original_texts:
                # A document missing from a highlighter's result falls back to its plain fields
                highlighted = [h.get(text["_id"], text["fields"]) for h in highlighted_texts]
                merged = dict(merge_highlighted_document(text["fields"], highlighted, markers))
                # Merged values take precedence; anything not merged is read from the plain fields
                yield HighlightedESArticle(self.fields, ChainMap(merged, text["fields"]))
Пример #32
0
def getTable(form, progress_monitor=NullMonitor):
    """
    Build an aggregation table (table3.DictTable) for the search described by `form`.

    Rows are grouped by form['xAxis'] ("medium" is translated to "mediumid"). The
    columns depend on form['yAxis']: 'total' gives a single column for the combined
    query, 'medium' one column per medium found, 'searchTerm' one column per query.
    Any other value raises an Exception. The parsed queries are stored on the
    returned table as `table.queries`.
    """
    table = table3.DictTable(default=0)
    table.rowNamesRequired = True
    dateInterval = form['dateInterval']
    x_axis = form['xAxis']
    group_by = "mediumid" if x_axis == "medium" else x_axis
    filters = dict(filters_from_form(form))

    queries = list(queries_from_form(form))
    query = query_from_form(form)

    yAxis = form['yAxis']
    if yAxis == 'total':
        _add_column(table, 'total', query, filters, group_by, dateInterval)
        progress_monitor.update(90, "Got results")
    elif yAxis == 'medium':
        medium_ids = ES().list_media(query, filters)
        media = Medium.objects.filter(pk__in=medium_ids).only("name")

        for medium in sorted(media):
            # The same filters dict is reused; only the medium id changes per column.
            filters['mediumid'] = medium.id
            clean_name = medium.name.replace(",", " ").replace(".", " ")
            name = u"{medium.id} - {}".format(clean_name, medium=medium)
            _add_column(table, name, query, filters, group_by, dateInterval)
            message = "Got results for medium {medium.id}".format(medium=medium)
            progress_monitor.update(90 / len(media), message)
    elif yAxis == 'searchTerm':
        for q in queries:
            _add_column(table, q.label, q.query, filters, group_by, dateInterval)
            message = "Got results for {q.label!r}".format(q=q)
            progress_monitor.update(90 / len(queries), message)
    else:
        raise Exception('yAxis {yAxis} not recognized'.format(yAxis=yAxis))

    table.queries = queries
    return table
Пример #33
0
    def setup(self):
        """
        Create two sets and five articles: a-d (dated) are created into s1, e is created in s2.

        Returns the tuple (s1, s2, a, b, c, d, e).
        """
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        # (text, title, date) of the articles that go into s1
        specs = (
            ('aap noot mies', 'm1', '2001-01-01'),
            ('noot mies wim zus', 'm2', '2001-02-01'),
            ('mies bla bla bla wim zus jet', 'm2', '2002-01-01'),
            ('noot mies wim zus', 'm2', '2001-02-03'),
        )
        unsaved = [
            amcattest.create_test_article(text=text, title=title, date=date, create=False)
            for text, title, date in specs
        ]
        a, b, c, d = unsaved
        e = amcattest.create_test_article(text='aap noot mies',
                                          title='m3',
                                          articleset=s2)

        Article.create_articles(unsaved, articleset=s1)
        ES().refresh()
        return s1, s2, a, b, c, d, e
Пример #34
0
    def test_highlight_article(self):
        """Check the <em> markup that highlight_article puts around matches in article a's text."""
        _, _, a, _, _, _, _ = self.setup()

        # (query, expected highlighted text), checked in this order
        cases = (
            ("aap", "<em>aap</em> noot mies"),
            ("aap OR mies", "<em>aap</em> noot <em>mies</em>"),
            ("aap OR mies", "<em>aap</em> noot <em>mies</em>"),
            ('"aap mies"~0', "aap noot mies"),
            ('"aap mies"~1', "<em>aap</em> noot <em>mies</em>"),
        )
        for query, expected in cases:
            result = ES().highlight_article(a.id, query)
            self.assertEqual(result["text"], expected)
Пример #35
0
    def get_article_ids_from_elastic(self):
        """
        Return the ids of the articles in this set, as found by elastic.

        Unlike get_article_ids, the ids are fetched with an elastic query that
        filters on this set's id.

        @rtype: set
        """
        set_filter = {"sets": [self.id]}
        found = ES().query_ids(filters=set_filter)
        return set(found)
Пример #36
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        source_set = amcattest.create_test_set()
        target_set = amcattest.create_test_set()
        project = amcattest.create_test_project()

        arts = []
        for _ in range(1213):
            arts.append(amcattest.create_test_article(project=project, create=False))
        expected = len(arts)

        Article.create_articles(arts, source_set)
        ES().refresh()
        self.assertEqual(expected, source_set.get_count())

        target_set.add_articles(arts)
        ES().refresh()
        self.assertEqual(expected, target_set.get_count())
        print(target_set.get_count())
Пример #37
0
 def __init__(self, form):
     """
     Set up the search for the given form.

     Form *must* be valid before passing: its cleaned_data is read here.

     @type form: SelectionForm
     """
     self.es = ES()
     self.form = form
     # Wrap the cleaned form values in a SelectionData object.
     self.data = SelectionData(form.cleaned_data)
Пример #38
0
    def test_elastic_hash(self):
        """Check that an article's hash can be recomputed from the fields stored in elastic."""
        article = Article(
            date=datetime.date(2015, 1, 1),
            section="\u6f22\u5b57",
            pagenr=1928390,
            headline="Headline hier.",
            byline="byline..",
            length=1928,
            metastring="Even more strange characters.. \x0C ..",
            url="https://example.com",
            externalid=None,
            author=None,
            addressee="Hmm",
            text="Contains invalid char \x08 woo",
            medium=create_test_medium(name="abc."),
            project=create_test_project(),
        )
        article.save()

        es = ES()
        es.add_articles([article.id])
        expected_hash = get_article_dict(article)["hash"]
        es.flush()

        wanted_fields = HASH_FIELDS + ["hash"]
        hits = list(es.query_all(filters={"ids": [article.id]}, fields=wanted_fields))
        es_article = hits[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(expected_hash, es_article.hash)
        self.assertEqual(_get_hash(es_article.to_dict()), expected_hash)
Пример #39
0
    def add_articles(self, article_ids, add_to_index=True, monitor=NullMonitor()):
        """
        Add the given articles to this articleset. The implementation consists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        Articles that are already in this set, or that Article.exists does not
        return, are left out.

        @param article_ids: articles to be added
        @type article_ids: iterable of integers or Article objects

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool

        @param monitor: progress monitor; a submonitor of it receives the updates
        """
        monitor = monitor.submonitor(total=4)

        # Accept ids as well as objects with an `id` attribute.
        article_ids = {(art if isinstance(art, int) else art.id) for art in article_ids}

        # Only use articles that exist
        to_add = article_ids - self.get_article_ids()
        to_add = list(Article.exists(to_add))

        monitor.update(message="Adding {n} articles to {aset}..".format(n=len(to_add), aset=self))
        ArticleSetArticle.objects.bulk_create(
            [ArticleSetArticle(articleset=self, article_id=artid) for artid in to_add],
            batch_size=100,
        )

        monitor.update(message="{n} articleset articles added to database, adding to codingjobs..".format(n=len(to_add)))
        # Every coding job on this set gets a CodedArticle for every new article.
        cjarts = [CodedArticle(codingjob=c, article_id=a) for c, a in itertools.product(self.codingjob_set.all(), to_add)]
        CodedArticle.objects.bulk_create(cjarts)

        if add_to_index:
            monitor.update(message="{n} articles added to codingjobs, adding to index".format(n=len(cjarts)))
            es = ES()
            es.add_to_set(self.id, to_add, monitor=monitor)
            es.refresh()  # We need to refresh, or setting the cache will fail
            # Add to property cache
            properties = es.get_used_properties(article_ids=to_add)
            self._add_to_property_cache(properties)
        else:
            monitor.update(2)
Пример #40
0
    def determine_metadata(self, request, view):
        """
        Describe the view's form and list the property values found in its articlesets.

        Returns a dict with, per form field, its help text, rendered widget and label,
        the view description, and under "filter_properties" the bucket keys of a terms
        aggregation per used property (an empty dict when no properties are in use).
        """
        form = view.get_form()
        field_names = list(form.fields.keys())
        fields = [form[name] for name in field_names]

        articlesets = view.get_articlesets()

        props = set()
        for aset in articlesets:
            props.update(aset.get_used_properties())
        articleset_ids = list(articlesets.values_list('id', flat=True))

        # lucene limitation: at most 1000 set ids per terms clause
        chunks = [articleset_ids[i:i + 1000] for i in range(0, len(articleset_ids), 1000)]
        setsquery = {
            'bool': {
                'should': [{'terms': {'sets': chunk}} for chunk in chunks]
            }
        }

        filter_props = {}
        if props:
            aggregations = {}
            for k in props:
                # Properties with the "default" mapping type are aggregated on '<name>.raw'.
                field = '{}.raw'.format(k) if get_property_mapping_type(k) == "default" else k
                aggregations[k] = {
                    'terms': {'field': field, 'size': self.bucket_count_limit}
                }

            aggs = ES().search({'aggs': aggregations, 'query': setsquery})['aggregations']
            for k, vs in aggs.items():
                filter_props[k] = [v['key'] for v in vs['buckets']]

        help_texts = [f.help_text.strip() or None for f in fields]
        widgets = [f.as_widget() for f in fields]
        labels = [f.label for f in fields]

        return {
            "help_texts": OrderedDict(zip(field_names, help_texts)),
            "form": OrderedDict(zip(field_names, widgets)),
            "labels": OrderedDict(zip(field_names, labels)),
            "help_text": view.get_view_description(),
            "filter_properties": filter_props  # TODO: filter_properties should be moved to a different view.
        }
Пример #41
0
    def test_elastic_hash(self):
        """Check that an article's hash can be recomputed from the fields stored in elastic."""
        article = Article(
            date=datetime.date(2015, 1, 1),
            section="\u6f22\u5b57",
            pagenr=1928390,
            headline="Headline hier.",
            byline="byline..",
            length=1928,
            metastring="Even more strange characters.. \x0C ..",
            url="https://example.com",
            externalid=None,
            author=None,
            addressee="Hmm",
            text="Contains invalid char \x08 woo",
            medium=create_test_medium(name="abc."),
            project=create_test_project(),
        )
        article.save()

        es = ES()
        es.add_articles([article.id])
        expected_hash = get_article_dict(article)["hash"]
        es.flush()

        wanted_fields = HASH_FIELDS + ["hash"]
        hits = list(es.query_all(filters={"ids": [article.id]}, fields=wanted_fields))
        es_article = hits[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(expected_hash, es_article.hash)
        self.assertEqual(_get_hash(es_article.to_dict()), expected_hash)
Пример #42
0
class SelectionSearch:
    """Runs elastic queries for the selection described by a validated form."""

    def __init__(self, form):
        """
        Keep the form and wrap its cleaned data.

        The form *must* be valid before it is passed in.
        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Yield groups of (filter_name, filter_value) tuples: first the date
        filters, then one single-element group each for mediums, articlesets
        and article ids.

        @type form: SelectionForm
        """
        data = self.data
        date_filters = get_date_filters(
            data.start_date, data.end_date,
            data.on_date, data.datetype
        )
        yield date_filters

        medium_ids = [medium.id for medium in self.data.mediums]
        yield (("mediumid", medium_ids),)

        set_ids = [articleset.id for articleset in self.data.articlesets]
        yield (("sets", set_ids),)

        # An empty or missing id selection becomes None so get_filters drops it.
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        filters = {}
        for name, value in chain(*self._get_filters()):
            # Filters without a value are left out entirely.
            if value is not None:
                filters[name] = value
        return filters

    @cached
    def get_query(self):
        """
        Join all search queries into one parenthesised OR expression.

        Returns None when there are no queries.
        @rtype: str
        """
        parts = ['(%s)' % search_query.query for search_query in self.get_queries()]
        combined = ' OR '.join(parts)
        return combined or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects, one per non-empty line of the query text.

        Queries whose label starts with an underscore are left out.

        @rtype: iterable of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        lines = (line.strip() for line in self.data.query.split("\n"))
        # Empty lines are skipped before parsing.
        parsed = [SearchQuery.from_string(line) for line in lines if line]

        resolved = resolve_queries(
            parsed, codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        visible = []
        for search_query in resolved:
            if not search_query.label.startswith("_"):
                visible.append(search_query)
        return visible

    @cached
    def get_count(self):
        """
        Return the elastic count for the combined query and filters.

        On a QueryParseError each query is parsed on its own with its label
        (or 1-based position) as context, so that parse can raise a more
        specific error; if none does, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            for position, search_query in enumerate(self.get_queries(), start=1):
                queryparser.parse_to_terms(
                    search_query.query,
                    context=(search_query.declared_label or position)
                )
            # Nothing more specific was raised: propagate the original error.
            raise

    @cached
    def get_statistics(self):
        """Return elastic statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium queryset for the medium ids found by get_medium_ids."""
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, categories, flat=True):
        """Aggregate over the given categories and return the sorted result."""
        # If we're aggregating on terms, we don't want a global filter
        on_terms = any(isinstance(category, TermCategory) for category in categories)
        query = None if on_terms else self.get_query()

        rows = aggregate(query, self.get_filters(), categories, flat=flat)
        return sorted(rows, key=to_sortable_tuple)

    def get_nested_aggregate(self, categories):
        """Return the aggregate for the categories, converted with to_nested."""
        flat_aggregate = self.get_aggregate(categories)
        return to_nested(flat_aggregate)

    def get_medium_ids(self):
        """Return the result of elastic's list_media for the query and filters."""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (search_query, list of matching article ids) for each query."""
        for search_query in self.get_queries():
            matching = ES().query_ids(search_query.query, self.get_filters())
            yield search_query, list(matching)

    def get_article_ids_per_query(self):
        """Return a dict mapping each search query to its list of article ids."""
        return {
            search_query: ids
            for search_query, ids in self._get_article_ids_per_query()
        }

    def get_articles(self, size=None, offset=0):
        """
        Query elastic for the selected articles with a fixed set of fields.

        `size` and `offset` are forwarded as `size` and `from_`; `lead` is
        switched on when there is no query.
        """
        query = self.get_query()
        lead = not query
        fields = [
            'headline', 'text', 'date', 'length',
            'medium', 'author', 'section',
        ]
        return ES().query(
            query, self.get_filters(), True,
            size=size, from_=offset, fields=fields, lead=lead
        )
Пример #43
0
class SelectionSearch:
    """
    Runs elastic queries and aggregations for the selection described by a
    validated SelectionForm.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.
        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles for given form. Yields
        iterables of tuples containing (filter_name, filter_value).

        @type form: SelectionForm
        """
        yield get_date_filters(self.data.start_date, self.data.end_date, self.data.on_date, self.data.datetype)

        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        # An empty or missing id selection becomes None so get_filters drops it.
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        Join all search queries into one parenthesised OR expression, or
        return None when there are no queries.

        @rtype: unicode
        """
        return " OR ".join("(%s)" % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects, one per line of the query text.

        Queries whose label starts with an underscore are left out.

        @rtype: iterable of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        queries = map(unicode.strip, self.data.query.split("\n"))
        queries = map(SearchQuery.from_string, queries)

        resolved = resolve_queries(
            queries, codebook=codebook, label_language=label_lan, replacement_language=replacement_lan
        )

        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        """Return the elastic count for the combined query and filters."""
        return self.es.count(self.get_query(), self.get_filters())

    @cached
    def get_statistics(self):
        """Return elastic statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium queryset for the medium ids found by get_medium_ids."""
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, x_axis, y_axis, interval="month"):
        """
        Aggregate the selection over one or two axes.

        @param x_axis: first grouping axis; translated through FIELD_MAP when present there
        @param y_axis: second grouping axis, or "total" to group on x_axis only
        @param interval: passed to elastic as date_interval
        @return: the aggregation after get_mediums and get_articlesets post-processing
        """
        x_axis = FIELD_MAP.get(x_axis, x_axis)
        y_axis = FIELD_MAP.get(y_axis, y_axis)

        if y_axis == "total":
            group_by = [x_axis]
        else:
            group_by = [x_axis, y_axis]

        # When grouping on terms the combined query is not applied globally;
        # the individual queries are passed as `terms` instead.
        query = None if "term" in (x_axis, y_axis) else self.get_query()

        aggr = ES().aggregate_query(
            query=query,
            terms=self.get_queries(),
            filters=self.get_filters(),
            group_by=group_by,
            date_interval=interval,
            sets=map(attrgetter("id"), self.data.articlesets),
        )

        # Each helper receives its own copy of group_by.
        aggr = get_mediums(aggr, list(group_by))
        aggr = get_articlesets(aggr, list(group_by))

        return aggr

    def get_medium_ids(self):
        """Return the result of elastic's list_media for the query and filters."""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (search_query, list of matching article ids) for each query."""
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        """Return a dict mapping each search query to its list of article ids."""
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """
        Return the matching Article objects, in the order get_article_ids
        yields their ids.

        @param size: maximum number of articles to return; None means no limit
        @param offset: number of leading ids to skip; applied whether or not
                       size is given (it used to be ignored when size was None)
        @return: generator of Article objects
        """
        # islice accepts None as stop, meaning "until exhausted".
        stop = None if size is None else offset + size

        # Materialise the ids once: they are needed both for the bulk fetch
        # and for returning the articles in order.
        article_ids = tuple(islice(self.get_article_ids(), offset, stop))
        article_dict = Article.objects.in_bulk(article_ids)
        return (article_dict[pk] for pk in article_ids)
Пример #44
0
class SelectionSearch:
    """Runs elastic queries for the selection described by a validated form."""

    def __init__(self, form):
        """
        Keep the form and wrap it in a SelectionData.

        The form *must* be valid before it is passed in.
        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form)

    def _get_set_filters(self):
        """Yield the "sets" filter holding the ids of the selected articlesets."""
        set_ids = [articleset.id for articleset in self.data.articlesets]
        yield "sets", set_ids

    def _get_filters(self) -> Iterable[Tuple[str, Any]]:
        """
        Yield (filter_name, filter_value) tuples for the date range, article
        ids, articlesets and any extra filters of the selection.

        @type form: SelectionForm
        """
        start_date = self.data.start_date
        if start_date is not None:
            yield "start_date", start_date

        end_date = self.data.end_date
        if end_date is not None:
            yield "end_date", end_date

        # An empty or missing id selection becomes None so get_filters drops it.
        yield "ids", self.data.article_ids or None

        yield from self._get_set_filters()

        if not self.data.filters:
            return
        for extra_filter in self.data.filters:
            yield from extra_filter.get_filter_kwargs()

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        filters = {}
        for name, value in self._get_filters():
            # Filters without a value are left out entirely.
            if value is not None:
                filters[name] = value
        return filters

    @cached
    def get_query(self):
        """
        Join all search queries into one parenthesised OR expression.

        Returns None when there are no queries.
        @rtype: str
        """
        parts = ['(%s)' % search_query.query for search_query in self.get_queries()]
        combined = ' OR '.join(parts)
        return combined or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects, one per non-empty line of the query text.

        Queries whose label starts with an underscore are left out.

        @rtype: iterable of SearchQuery"""
        if not self.data.query:
            return []
        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        lines = (line.strip() for line in self.data.query.split("\n"))
        # Empty lines are skipped before parsing.
        parsed = [SearchQuery.from_string(line) for line in lines if line]

        resolved = resolve_queries(
            parsed, codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        visible = []
        for search_query in resolved:
            if not search_query.label.startswith("_"):
                visible.append(search_query)
        return visible

    @cached
    def get_count(self):
        """
        Return the elastic count for the combined query and filters.

        On a QueryParseError each query is parsed on its own with its label
        (or 1-based position) as context, so that parse can raise a more
        specific error; if none does, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            for position, search_query in enumerate(self.get_queries(), start=1):
                queryparser.parse_to_terms(
                    search_query.query,
                    context=(search_query.declared_label or position)
                )
            # Nothing more specific was raised: propagate the original error.
            raise

    @cached
    def get_statistics(self):
        """Return elastic statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    def get_aggregate(self, categories, flat=True, objects=True):
        """Aggregate over the given categories, forwarding `flat` and `objects`."""
        # If we're aggregating on terms, we don't want a global filter
        on_terms = any(isinstance(category, TermCategory) for category in categories)
        query = None if on_terms else self.get_query()

        return aggregate(query, self.get_filters(), categories, flat=flat, objects=objects)

    def get_nested_aggregate(self, categories):
        """Return the aggregate for the categories, converted with to_nested."""
        flat_aggregate = self.get_aggregate(categories)
        return to_nested(flat_aggregate)

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (search_query, list of matching article ids) for each query."""
        for search_query in self.get_queries():
            matching = ES().query_ids(search_query.query, self.get_filters())
            yield search_query, list(matching)

    def get_article_ids_per_query(self):
        """Return a dict mapping each search query to its list of article ids."""
        return {
            search_query: ids
            for search_query, ids in self._get_article_ids_per_query()
        }

    def get_articles(self, size=None, offset=0, fields=(), **kwargs):
        """
        Query elastic for the selected articles.

        `size`, `offset` and `fields` are forwarded as `size`, `from_` and
        `_source`; any extra keyword arguments are passed through unchanged.
        """
        es = ES()
        query = self.get_query()
        filters = self.get_filters()
        return es.query(query, filters, True, size=size, from_=offset, _source=fields, **kwargs)

    @staticmethod
    def get_instance(form):
        """
        Gets a SelectionSearch instance depending on the selection data.
        Codingjobs take precedence: if any are given, a
        CodingJobSelectionSearch is returned; otherwise articlesets are
        required.

        :param form: A SelectionForm
        :return: An instance of SelectionSearch that is appropriate for the given SelectionForm.
        :raises Exception: when the selection has neither codingjobs nor articlesets.
        """
        data = SelectionData(form)
        if data.codingjobs:
            search_class = CodingJobSelectionSearch
        elif data.articlesets:
            search_class = SelectionSearch
        else:
            raise Exception("Invalid selection: no articlesets or codingjobs given.")
        return search_class(form)