def build(self, callback_get_doc_content, bulk_size=1000):
    """Builds the DBpedia index from the mongo collection.

    To speed up indexing, documents are sent to the index in bulks.
    There is an optimum value for the bulk size; try to figure it out.

    :param callback_get_doc_content: a function that gets a document from
        mongo (with keys and _id field unescaped) and returns the content
        for indexing; documents for which it returns None are skipped
    :param bulk_size: number of documents to be added to the index as a bulk
    """
    PLOGGER.info("Building " + self.__index_name + " ...")
    elastic = Elastic(self.__index_name)
    # NOTE(review): force=True presumably recreates an already existing
    # index -- confirm against Elastic.create_index.
    elastic.create_index(self.__mappings, model=self.__model, force=True)

    i = 0  # number of documents accepted for indexing so far
    docs = dict()  # pending bulk: document id -> content
    for mdoc in self.__mongo.find_all(no_timeout=True):
        docid = Mongo.unescape(mdoc[Mongo.ID_FIELD])

        # get back document from mongo with keys and _id field unescaped
        doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
        if doc is None:
            continue
        docs[docid] = doc

        i += 1
        if i % bulk_size == 0:
            elastic.add_docs_bulk(docs)
            docs = dict()
            PLOGGER.info(str(i / 1000) + "K documents indexed")

    # Index the last, partially filled bulk. When the number of documents is
    # an exact multiple of bulk_size (or nothing was indexed at all) there is
    # nothing left over, so no empty bulk is submitted.
    if docs:
        elastic.add_docs_bulk(docs)
    PLOGGER.info("Finished indexing (" + str(i) + " documents in total)")
def build_collection(self):
    """Adds all name variants from DBpedia.

    Opens the target collection, calls drop() on it, and then walks over
    every document of the DBpedia collection. Entities without a name are
    skipped (and not counted). For the remaining ones a surface form is
    registered for:

    - the redirect target, if the entity is a redirect page;
    - every disambiguation target, if the entity has the disambiguation
      predicate;
    - the entity itself, under its name and each of its <foaf:name>
      values, if it is a proper entity.

    Progress is printed after every 1000 processed entities.
    """
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()

    foaf_name = "<foaf:name>"
    processed = 0

    # iterate through all DBpedia entities
    for raw_doc in self.__mongo_dbpedia.find_all():
        entity = EntityUtils(Mongo.unescape_doc(raw_doc))

        # nameless entities contribute nothing
        if not entity.has_name():
            continue
        name = entity.get_name()

        # redirect page: the name points to the first redirect target
        if entity.is_redirect():
            redirect_targets = entity.get_predicate(
                EntityUtils.PREDICATE_REDIRECT)
            self.__add_surface_form(name, EntityUtils.PREDICATE_REDIRECT,
                                    redirect_targets[0])

        # disambiguation page: the name points to every listed entity
        if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
            candidates = entity.get_predicate(
                EntityUtils.PREDICATE_DISAMBIGUATE)
            for candidate_id in candidates:
                self.__add_surface_form(
                    name, EntityUtils.PREDICATE_DISAMBIGUATE, candidate_id)

        # proper entity (per entity.is_entity()): register its own names
        if entity.is_entity():
            own_id = entity.get_id()
            # the main entity name
            self.__add_surface_form(name, EntityUtils.PREDICATE_NAME, own_id)
            # any additional foaf names
            if entity.has_predicate(foaf_name):
                for alias in entity.get_predicate(foaf_name):
                    self.__add_surface_form(alias, foaf_name, own_id)

        processed += 1
        thousands, leftover = divmod(processed, 1000)
        if leftover == 0:
            print(str(thousands) + "K entities processed")