Пример #1
0
    def test_lowest_dimension_found(self):
        """Content containing "key11" must end up with exactly the KWM
        keywords "key1" and "key11" after the keyword model is applied."""
        self.doc.content = "key11 and some other stuff"
        self.solr_docs.update(self.doc)

        model_keywords = self.hierarchy.get_keywords()
        self.solr_docs.apply_kwm(model_keywords)

        wanted = [
            SolrDocKeyword(value, SolrDocKeywordTypes.KWM)
            for value in ("key1", "key11")
        ]

        # re-read the document so the assertion sees the stored state
        self.doc = self.solr_docs.get(self.doc.id)
        self.assertEqual(sorted(self.doc.keywords), sorted(wanted))
Пример #2
0
    def test_keyword_added_and_deleted(self, client, doc_with_2_keywords):
        """Patch a document with one keyword added and one removed and check
        that the keyword statistics reflect both changes."""
        added = SolrDocKeyword("keyword3", SolrDocKeywordTypes.MANUAL)
        removed = SolrDocKeyword("keyword2", SolrDocKeywordTypes.META)

        doc = self.solr_docs.get(doc_with_2_keywords)
        doc.keywords.add(added)
        doc.keywords.remove(removed)

        payload = json.dumps(doc.as_dict())
        client.patch("/changekeywords", data=payload, **POST_JSON)

        # keyword -> whether it must be present in the statistics afterwards
        presence = (
            ("keyword1", True),
            ("keyword2", False),
            ("keyword3", True),
        )
        for keyword, should_exist in presence:
            assert (keyword in self.solr_keyword_statistics) is should_exist
Пример #3
0
def change_keywords():
    """
    Handles the updating of keywords for a document.

    The JSON request body must provide the document "id" and a "keywords"
    list whose entries each carry a "value" and a "type". The document's
    keywords are replaced by that list, and the keyword statistics are given
    the keyword sets from before and after the change.

    :return: json object containing a success/error message; status 400 when
             the request body cannot be interpreted, 502 when reading or
             updating Solr fails, 200 otherwise
    """
    # Interpret everything that comes from the client first, so that a
    # malformed payload (no JSON body, missing "keywords", entries without
    # "value"/"type", or whatever SolrDocKeywordTypes.from_str rejects) is
    # reported as a bad request and not as a Solr failure.
    try:
        payload = request.json
        doc_id = payload.get("id")
        keywords_after = {
            SolrDocKeyword(kw["value"],
                           SolrDocKeywordTypes.from_str(kw["type"]))
            for kw in payload.get("keywords")
        }
    except Exception as e:
        return jsonify(f"Bad Request: {e}"), 400

    try:
        sol_doc = solr.docs.get(doc_id)
        # copy before the assignment below so the statistics update can be
        # given the previous keyword set
        keywords_before = copy.deepcopy(sol_doc.keywords)

        sol_doc.keywords = keywords_after
        solr.docs.update(sol_doc)

        solr.keyword_statistics.update(keywords_before, keywords_after)
    except Exception as e:
        log.error(f"/changekeywords {e}")
        return jsonify(f"Bad Gateway to solr: {e}"), 502

    # f-string instead of "+" so a non-string id cannot raise after the
    # update has already been written
    print(f"changed keywords on file {doc_id} to " +
          ",".join([kw.value for kw in sol_doc.keywords]),
          file=sys.stdout)
    return jsonify("success"), 200
Пример #4
0
def doc2_has_kw1(client, doc_with_0_keywords_1, solr_docs):
    """Add the manual keyword "keyword1" to the given document through
    ``PATCH /changekeywords`` and return that document's id."""
    document = solr_docs.get(doc_with_0_keywords_1)
    manual_keyword = SolrDocKeyword("keyword1", SolrDocKeywordTypes.MANUAL)
    document.keywords.add(manual_keyword)

    body = json.dumps(document.as_dict())
    client.patch("/changekeywords", data=body, **POST_JSON)
    return document.id
Пример #5
0
    def test_apply_tagging_method_kwm(self):
        """Apply a keyword model to "test.txt" and "test.pdf" through
        ``POST /apply`` and check the KWM keywords of every indexed document.

        The documents that were not part of the request ("test.docx",
        "test.pptx") are expected to stay without keywords.
        """
        hierarchy = [
            {'item': 'test', 'nodeType': 'KEYWORD'},
            {'item': 'text', 'nodeType': 'KEYWORD'},
            {'item': 'faufm', 'nodeType': 'KEYWORD'},
        ]
        data = json.dumps(dict(
            taggingMethod={'name': 'Keyword Model', 'type': 'KWM'},
            keywordModel={
                'id': 'test',
                # the hierarchy is sent as a JSON string nested in the body
                'hierarchy': json.dumps(hierarchy),
                'keywords': ['test', 'text', 'faufm'],
            },
            documents=[{'id': "test.txt"}, {'id': "test.pdf"}],
            options={"applyToAllDocuments": False},
            jobId='JOB-ID',
        ))

        self.application.solr.docs.add(*self.docs)

        tester = self.app.test_client(self)
        response = tester.post("/apply", content_type="application/json", data=data)
        self.assertEqual(response.status_code, 200)

        # Wait for thread to finish.
        sleep(10)

        # document id -> keyword values expected on it after the job ran
        expected_by_doc = (
            ("test.txt", ["text", "test"]),
            ("test.pdf", ["faufm"]),
            ("test.docx", []),
            ("test.pptx", []),
        )
        for doc_id, values in expected_by_doc:
            keywords = self.application.solr.docs.get(doc_id).keywords
            expected = [
                SolrDocKeyword(value, SolrDocKeywordTypes.KWM)
                for value in values
            ]
            # msg names the document so a failure shows which one is wrong
            self.assertEqual(sorted(keywords), sorted(expected), msg=doc_id)
Пример #6
0
    def test_no_duplicate_keywords(self):
        """Applying the keyword model to a document that already carries
        "key1" must leave it with a single keyword."""
        already_present = SolrDocKeyword("key1", SolrDocKeywordTypes.KWM)
        self.doc.keywords = [already_present]
        self.doc.content = "key1 and some other stuff"
        self.solr_docs.update(self.doc)

        model_keywords = self.hierarchy.get_keywords()
        self.solr_docs.apply_kwm(model_keywords)

        # re-read the document so the assertion sees the stored state
        self.doc = self.solr_docs.get(self.doc.id)
        self.assertEqual(len(self.doc.keywords), 1)
Пример #7
0
def doc_with_keyword_in_keywords_field(solr_docs):
    """Store a document carrying the manual keyword "keyword" and return
    its id."""
    doc_id = "doc_with_kw_in_kws"
    manual_keyword = SolrDocKeyword("keyword", SolrDocKeywordTypes.MANUAL)
    attributes = {
        "content": "content",
        "title": "title",
        "file_type": "file_type",
        "lang": "lang",
        "size": 1,
    }
    solr_docs.update(SolrDoc(doc_id, manual_keyword, **attributes))
    return doc_id
Пример #8
0
    def run(self):
        """
        Creates automated keywords for the job's documents and applies them.

        ``create_automated_keywords`` yields a mapping of document id to the
        keywords found for it. The matching documents are fetched from Solr,
        extended with those keywords (type ML), written back in one bulk
        update, and the keyword statistics are updated afterwards.

        While running, ``status``, ``progress`` and ``time_remaining`` are
        updated on the job; the loop stops early once ``cancelled`` is set.
        """
        self.status = 'TAGGING_JOB.CREATE_KW'
        auto_keywords = create_automated_keywords(self.docs, self.num_clusters, self.num_keywords, self.default, self)
        self.status = 'TAGGING_JOB.KW_FOUND'

        doc_ids = auto_keywords.keys()
        docs = self.solr_service.docs.get(*doc_ids)
        if len(doc_ids) == 1:
            # presumably a single id yields a bare document rather than a
            # list - verify against docs.get
            docs = [docs]
        print("ids", doc_ids)
        self.status = 'TAGGING_JOB.APPLYING'

        total = len(docs)
        # NOTE(review): the step is derived from the progress reached so far,
        # not from the progress still missing up to 100 - confirm intended.
        progress_step = self.progress / total if total else 0
        start_time = time.time()
        iteration_time = None

        for idx, doc in enumerate(docs):
            if self.cancelled:
                break

            doc.keywords.update(
                SolrDocKeyword(kw, SolrDocKeywordTypes.ML)
                for kw in auto_keywords[doc.id]
            )

            if iteration_time is None:
                # the first document serves as the per-document time sample
                iteration_time = time.time() - start_time

            # idx is zero-based, so idx + 1 documents are done at this point;
            # the estimate therefore reaches 0 after the last document
            self.time_remaining = iteration_time * (total - (idx + 1))
            self.progress += progress_step

        # NOTE(review): unlike the keyword-model job, a cancelled run still
        # writes the documents processed so far - confirm this is intended.
        self.solr_service.docs.update(*docs)
        keywords_added = {kw for doc in docs for kw in doc.keywords}
        self.solr_service.keyword_statistics.update({}, keywords_added)
        self.status = 'FINISHED'
Пример #9
0
    def test_change_keywords(self):
        """``PATCH /changekeywords`` must give a document without keywords
        exactly the manual keywords sent in the request."""
        first_doc = self.docs[0]
        self.application.solr.docs.add(first_doc)
        doc_id = first_doc.id

        # precondition: the freshly added document has no keywords
        stored = self.application.solr.docs.get(doc_id)
        self.assertEqual(stored.keywords, set())

        tester = self.app.test_client(self)
        values = ["a", "b", "c"]
        payload = json.dumps({
            "id": doc_id,
            "keywords": [{"value": value, "type": "MANUAL"} for value in values],
        })
        response = tester.patch('/changekeywords', data=payload, content_type='application/json')
        self.assertEqual(response.status_code, 200)

        stored = self.application.solr.docs.get(doc_id)
        wanted = {SolrDocKeyword(value, SolrDocKeywordTypes.MANUAL) for value in values}
        self.assertEqual(stored.keywords, wanted)
Пример #10
0
    def run(self):
        """
        Applies the job's keyword model to documents in Solr.

        The idea is to search the content in Solr for each lemmatized keyword;
        if it is found, the (normal) keyword and its parents are applied to
        the matching document.

        Reads ``self.keywords`` (keyword -> parents) and ``self.doc_ids``
        (used to build the id part of every query). While running,
        ``status``, ``progress`` and ``time_remaining`` are updated on the
        job; both loops stop early once ``cancelled`` is set, and the bulk
        write to Solr is skipped in that case.

        :return: None
        """
        self.status = 'TAGGING_JOB.LEMMA_START'
        lemmatized_keywords = lemmatize_keywords(self.keywords)
        lemmatize_progress = 20
        self.status = 'TAGGING_JOB.LEMMA_END'
        self.progress = lemmatize_progress

        self.status = 'TAGGING_JOB.DOC_FIND'
        id_query = self.solr_service.docs.build_id_query(self.doc_ids)
        self.status = 'TAGGING_JOB.DOC_FOUND'
        changed_docs = {}
        self.status = 'TAGGING_JOB.APPLY_KWM'

        total = len(lemmatized_keywords)
        # the progress left after lemmatizing is spread evenly over keywords
        progress_step = (100 - lemmatize_progress) / total if total else 0
        start_time = time.time()
        iteration_time = None

        for idx, (lemmatized_keyword, (keyword, parents)) in enumerate(zip(
            lemmatized_keywords, self.keywords.items()
        )):
            if self.cancelled:
                break
            query = self.solr_service.docs.build_kwm_query(id_query, lemmatized_keyword)

            res = self.solr_service.docs.search(query)
            res = [SolrDoc.from_hit(hit) for hit in res]

            for doc in res:
                if self.cancelled:
                    break

                # check whether the doc was already updated; if so, continue
                # with that instance so earlier keywords are not lost
                if doc.id in changed_docs:
                    doc = changed_docs[doc.id]

                # update keywords
                doc.keywords.add(
                    SolrDocKeyword(keyword, SolrDocKeywordTypes.KWM))
                doc.keywords.update(
                    SolrDocKeyword(parent, SolrDocKeywordTypes.KWM)
                    for parent in parents
                )

                # store for bulk update
                changed_docs[doc.id] = doc

            if iteration_time is None:
                # the first keyword serves as the per-keyword time sample
                iteration_time = time.time() - start_time

            # idx is zero-based, so idx + 1 keywords are done at this point;
            # the estimate therefore reaches 0 after the last keyword
            self.time_remaining = iteration_time * (total - (idx + 1))
            self.progress += progress_step

        changed_docs = changed_docs.values()
        self.status = 'TAGGING_JOB.DOC_UPDATE'
        if not self.cancelled:
            self.solr_service.docs.update(*changed_docs)

        # NOTE(review): the statistics are updated even when the job was
        # cancelled and the documents above were not written - confirm
        # whether this should be skipped together with the write.
        keywords_added = {kw for doc in changed_docs for kw in doc.keywords}
        self.solr_service.keyword_statistics.update({}, keywords_added)

        self.status = 'FINISHED'
Пример #11
0
def kw3():
    """Return the keyword "key3" of type KWM."""
    keyword = SolrDocKeyword("key3", SolrDocKeywordTypes.KWM)
    return keyword
Пример #12
0
def kw2():
    """Return the keyword "key2" of type KWM."""
    keyword = SolrDocKeyword("key2", SolrDocKeywordTypes.KWM)
    return keyword
Пример #13
0
def kw1():
    """Return the keyword "key1" of type KWM."""
    keyword = SolrDocKeyword("key1", SolrDocKeywordTypes.KWM)
    return keyword