def build_collection(self, mappings):
    """Rebuilds the Mongo collection from scratch out of ``mappings``.

    The target collection is dropped first; afterwards every
    (key, value) pair found in ``mappings`` is stored under the
    ``!<owl:sameAs>`` predicate of the document identified by the key.
    A progress line is printed after every 1000 stored pairs.

    :param mappings: dict mapping an id to an iterable of ids (the code
        calls them ``fb_id`` / ``dbp_id`` -- presumably Freebase and
        DBpedia identifiers; verify against the caller).
    """
    store = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    store.drop()
    same_as = "!<owl:sameAs>"

    # Lazily flatten {fb_id: [dbp_id, ...]} into (fb_id, dbp_id) pairs so a
    # single running counter can be attached with enumerate().
    pairs = (
        (fb_id, dbp_id)
        for fb_id, dbp_ids in mappings.items()
        for dbp_id in dbp_ids
    )
    for count, (fb_id, dbp_id) in enumerate(pairs, start=1):
        store.append_set(fb_id, same_as, [dbp_id])
        if count % 1000 == 0:
            print(str(count // 1000) + "K entities are added!")
class NTriplesToMongoDB(object):
    """Loads the contents of NTriples files into a MongoDB collection.

    Triples are grouped by subject: consecutive triples that share a subject
    are buffered in memory as one document (predicate -> list of objects) and
    written out when the subject changes or when the file ends.
    """

    def __init__(self, host, db, collection):
        """Creates the Mongo wrapper, the URI prefixer and an empty buffer.

        :param host: forwarded to ``Mongo``.
        :param db: forwarded to ``Mongo``.
        :param collection: forwarded to ``Mongo``.
        """
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        self.__m_id = None  # id of the buffered document -- subj
        self.__m_contents = None  # buffered document contents -- pred -> [obj]
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Processes a triple.

        - Appends to the buffered document if it has the same subject.
        - Otherwise writes the buffered document and starts a new one.

        :param subj: document id.
        :param pred: field name inside the document.
        :param obj: value appended to the field's list.
        """
        if self.__m_id is not None and self.__m_id == subj:
            self.__m_contents.setdefault(pred, []).append(obj)
        else:
            self._write_to_mongo()
            self.__m_id = subj
            self.__m_contents = {pred: [obj]}

    def _write_to_mongo(self):
        """Writes the buffered document to MongoDB and clears the buffer.

        Each buffered field is sent through ``append_set`` on the Mongo
        wrapper. Does nothing when the buffer is empty.
        """
        if self.__m_id is not None:
            for field, value in self.__m_contents.items():
                self.__mongo.append_set(self.__m_id, field, value)
            self.__m_id = None
            self.__m_contents = None

    def drop(self):
        """Deletes the collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        Lines that raise ``ParseError`` are skipped, as are triples whose
        object has length zero.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values
            are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        processed = 0

        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    # lines are decoded here, i.e. they are expected as bytes
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                # NOTE(review): if the parser accepts a line without emitting
                # a triple, `t` may still hold the previous triple's values --
                # confirm that Triple is reset between lines.
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())

                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # object prefixing: only URIs are prefixed, any other object
                # is stored as the parser returned it
                obj = t.object()
                if isinstance(obj, URIRef):
                    obj = self.__prefix.get_prefixed(obj)

                # Explicit length check kept on purpose: truthiness of the
                # parser's object types is not guaranteed to mean "non-empty".
                if len(obj) == 0:
                    continue  # skip empty objects

                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)

                processed += 1
                if processed % 100000 == 0:
                    print(
                        str(processed // 1000)
                        + "K lines processed from "
                        + filename
                    )

        # process last triple
        self._write_to_mongo()