Exemplo n.º 1
0
    def build_collection(self, mappings):
        """Rebuilds the Mongo collection from the given mappings.

        The collection is dropped first. Then, for every key in ``mappings``,
        each of its associated ids is stored on its own under the
        ``!<owl:sameAs>`` field through ``append_set``. A progress line is
        printed after every 1000 keys.

        :param mappings: dict from an entity id to an iterable of ids
            (presumably Freebase id -> DBpedia ids; confirm with the caller).
        """
        same_as = "!<owl:sameAs>"
        collection = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        collection.drop()

        for done, (source_id, target_ids) in enumerate(mappings.items(), 1):
            for target_id in target_ids:
                collection.append_set(source_id, same_as, [target_id])
            # report progress once per full thousand of processed keys
            thousands, remainder = divmod(done, 1000)
            if remainder == 0:
                print(str(thousands) + "K entities are added!")
Exemplo n.º 2
0
class NTriplesToMongoDB(object):
    """Loads NTriples files into MongoDB through the ``Mongo`` wrapper.

    Consecutive triples that share the same subject are buffered in memory
    as ``{predicate: [objects]}`` and handed to ``Mongo.append_set`` once a
    different subject shows up or the file ends.
    """

    def __init__(self, host, db, collection):
        """Creates the Mongo wrapper and the URI prefixer.

        :param host: forwarded to ``Mongo``.
        :param db: forwarded to ``Mongo``.
        :param collection: forwarded to ``Mongo``.
        """
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        self.__m_id = None  # id of the buffered document -- subj
        self.__m_contents = None  # buffered contents -- {pred: [obj, ...]}
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Processes a triple.

          - Appends to the buffered triple(s) if it's the same subject.
          - Otherwise writes out the buffer and starts a new one.

        :param subj: document id the triple belongs to.
        :param pred: field name under which the object is collected.
        :param obj: value appended to the field's list.
        """
        if self.__m_id is not None and self.__m_id == subj:
            self.__m_contents.setdefault(pred, []).append(obj)
            return

        # subject changed (or nothing buffered yet): flush, then start over
        self._write_to_mongo()
        self.__m_id = subj
        self.__m_contents = {pred: [obj]}

    def _write_to_mongo(self):
        """Writes the buffered triples to MongoDB and clears the buffer.

        Does nothing when the buffer is empty, so it is safe to call twice.
        """
        if self.__m_id is None:
            return
        for field, value in self.__m_contents.items():
            self.__mongo.append_set(self.__m_id, field, value)
        self.__m_id = None
        self.__m_contents = None

    def drop(self):
        """Deletes the collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        Lines that raise ``ParseError`` are skipped, as are triples whose
        non-URI object is empty. A progress line is printed after every
        100000 processed triples.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values
            are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        i = 0

        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                # NOTE(review): `t` is reused for every line; presumably it
                # keeps the previous triple when a line parses without
                # yielding one -- confirm against Triple/NTriplesParser.
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())

                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # object prefixing (URIs only; other values are kept as-is)
                obj = t.object()
                if isinstance(obj, URIRef):
                    obj = self.__prefix.get_prefixed(obj)
                elif len(obj) == 0:
                    # explicit length test: literal types may define their
                    # own truthiness, so `not obj` is not equivalent here
                    continue  # skip empty objects

                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)

                i += 1
                if i % 100000 == 0:
                    print(
                        str(i // 1000) + "K lines processed from " + filename)

        # process last triple
        self._write_to_mongo()