Example #1
    def build(self, callback_get_doc_content, bulk_size=1000):
        """Builds the DBpedia index from the mongo collection.

        To speed up indexing, we index documents in bulk.
        There is an optimum value for the bulk size; try to figure it out.

        :param callback_get_doc_content: a function that gets a document from Mongo and returns its content for indexing
        :param bulk_size: number of documents to be added to the index in one bulk request
        """
        PLOGGER.info("Building " + self.__index_name + " ...")
        elastic = Elastic(self.__index_name)
        elastic.create_index(self.__mappings, model=self.__model, force=True)

        i = 0
        docs = dict()
        for mdoc in self.__mongo.find_all(no_timeout=True):
            docid = Mongo.unescape(mdoc[Mongo.ID_FIELD])

            # get back document from mongo with keys and _id field unescaped
            doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
            if doc is None:
                continue
            docs[docid] = doc

            i += 1
            if i % bulk_size == 0:
                elastic.add_docs_bulk(docs)
                docs = dict()
                PLOGGER.info(str(i // 1000) + "K documents indexed")
        # indexing the last bulk of documents
        elastic.add_docs_bulk(docs)
        PLOGGER.info("Finished indexing (" + str(i) + " documents in total)")
Example #2
    def get_doc_content(self, doc):
        """create the index content for a given mongo document
        Here we keep both FSDM fields and individual fields for each document.

        :param doc: a Mongo document
        :return: a document ready for indexing
        """
        # Ignores the document if its ID does not start with "<dbpedia:" (just to speed things up)
        doc_id = Mongo.unescape(doc[Mongo.ID_FIELD])
        if not doc_id.startswith("<dbpedia:"):
            return None

        # Ignores the document if it lacks any of the must-have fields
        for f in self._config["must_have"]:
            if f not in doc:
                return None

        self._doc_content = defaultdict(list)

        for f in doc:
            # Adds content for FSDM fields
            if f.lower() in self._config["names"]:
                self._doc_content["names"] += self.__get_field_value(doc[f])

            elif f in self._config["categories"]:
                self._doc_content["categories"] += self.__get_field_value(
                    doc[f])

            elif f in self._config["similar_entity_names"]:
                self._doc_content[
                    "similar_entity_names"] += self.__get_field_value(doc[f])

            elif f not in self._config["blacklist"]:
                if doc[f][0].startswith("<dbpedia:"):
                    self._doc_content[
                        "related_entity_names"] += self.__get_field_value(
                            doc[f], f)
                else:
                    self._doc_content["attributes"] += self.__get_field_value(
                        doc[f], f)

            # Adds content for each individual field
            if f in self.__top_fields:
                self._doc_content[f] += self.__get_field_value(doc[f])

        # Keeps only unique phrases for each field and
        # adds everything to the catchall field
        for field in self._fsdm_fields:
            self._doc_content[field] = list(set(self._doc_content[field]))
            self._doc_content[Elastic.FIELD_CATCHALL] += self._doc_content[field]

        return self._doc_content
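To see how the field routing above plays out, here is a small self-contained sketch using a hypothetical, heavily simplified configuration; the predicate names, config keys, field list, and the "catchall" key are illustrative stand-ins for the real FSDM setup, not the actual self._config.

from collections import defaultdict

# Hypothetical, simplified stand-in for self._config
CONFIG = {
    "names": {"<rdfs:label>", "<foaf:name>"},
    "categories": {"<dcterms:subject>"},
    "blacklist": {"<dbo:wikiPageID>"},
}
FSDM_FIELDS = ["names", "categories", "related_entity_names", "attributes"]

def route_fields(doc):
    """Routes each predicate's values to an FSDM-style field, then builds a catchall."""
    content = defaultdict(list)
    for pred, values in doc.items():
        if pred.lower() in CONFIG["names"]:
            content["names"] += values
        elif pred in CONFIG["categories"]:
            content["categories"] += values
        elif pred not in CONFIG["blacklist"]:
            # Values pointing at other DBpedia entities become related entity names
            if values[0].startswith("<dbpedia:"):
                content["related_entity_names"] += values
            else:
                content["attributes"] += values
    for field in FSDM_FIELDS:
        content[field] = list(set(content[field]))  # keep unique phrases only
        content["catchall"] += content[field]       # everything also goes to the catchall
    return content

doc = {
    "<rdfs:label>": ["Audi A4"],
    "<dcterms:subject>": ["<dbpedia:Category:Compact_executive_cars>"],
    "<dbo:manufacturer>": ["<dbpedia:Audi>"],
    "<dbo:wikiPageID>": ["12345"],
}
print(dict(route_fields(doc)))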