Exemplo n.º 1
0
class Entity(object):
    """Read access to the entity-related Mongo collections.

    Nothing is connected at construction time: each collection handle is
    created the first time a lookup needs it and is reused afterwards.
    """

    def __init__(self):
        # Collection handles, filled in on demand by the __init_coll_* helpers.
        self.__coll_dbpedia = None
        self.__coll_sf_facc = None
        self.__coll_sf_dbpedia = None
        self.__coll_fb2dbp = None

    def __init_coll_dbpedia(self):
        """Connects to the entity (DBpedia) collection on first use."""
        if self.__coll_dbpedia is not None:
            return
        self.__coll_dbpedia = Mongo(
            MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)

    def __init_coll_sf_facc(self):
        """Connects to the FACC surface form collection on first use."""
        if self.__coll_sf_facc is not None:
            return
        self.__coll_sf_facc = Mongo(
            MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_FACC)

    def __init_coll_sf_dbpedia(self):
        """Connects to the DBpedia surface form collection on first use."""
        if self.__coll_sf_dbpedia is not None:
            return
        self.__coll_sf_dbpedia = Mongo(
            MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_DBPEDIA)

    def __init_coll_fb2dbp(self):
        """Connects to the Freebase2DBpedia collection on first use."""
        if self.__coll_fb2dbp is not None:
            return
        self.__coll_fb2dbp = Mongo(
            MONGO_HOST, MONGO_DB, MONGO_COLLECTION_FREEBASE2DBPEDIA)

    def lookup_en(self, entity_id):
        """Fetches the document stored for an entity identifier.

        :param entity_id: entity identifier ("<dbpedia:Audi_A4>")
        :return: whatever the collection's ``find_by_id`` yields for the ID
            (the entity document, or None when there is none)
        """
        self.__init_coll_dbpedia()
        collection = self.__coll_dbpedia
        return collection.find_by_id(entity_id)

    def lookup_name_facc(self, name):
        """Fetches the candidate entities of a name from the FACC surface forms.

        :param name: surface form used as the document ID
        :return: the matching document, or an empty dict if the lookup result
            is empty or missing
        """
        self.__init_coll_sf_facc()
        return self.__coll_sf_facc.find_by_id(name) or {}

    def lookup_name_dbpedia(self, name):
        """Fetches the candidate entities of a name from the DBpedia surface forms.

        :param name: surface form used as the document ID
        :return: the matching document, or an empty dict if the lookup result
            is empty or missing
        """
        self.__init_coll_sf_dbpedia()
        return self.__coll_sf_dbpedia.find_by_id(name) or {}

    def fb_to_dbp(self, fb_id):
        """Maps a Freebase ID to its DBpedia IDs.

        :param fb_id: Freebase identifier used as the document ID
        :return: the value stored under "!<owl:sameAs>" in the matching
            document, or None when no document is found
        """
        self.__init_coll_fb2dbp()
        mapping = self.__coll_fb2dbp.find_by_id(fb_id)
        if not mapping:
            return None
        return mapping["!<owl:sameAs>"]
Exemplo n.º 2
0
    def create_sample_file(self):
        """Writes the contents of a few example entities to a JSON sample file.

        Each example document is fetched by ID from the DBpedia Mongo
        collection, passed through ``self.get_doc_content`` and stored under
        its entity ID. The resulting dictionary is dumped to
        ``output/example_docs.json``.
        """
        mongo = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)

        example_docs = [
            "<dbpedia:Texhoma,_Oklahoma>", "<dbpedia:Karen_Spärck_Jones>",
            "<dbpedia:Audi_A4>", "<dbpedia:Barack_Obama>"
        ]
        doc_contents = {
            docid: self.get_doc_content(mongo.find_by_id(docid))
            for docid in example_docs
        }

        # ensure_ascii=False writes non-ASCII characters (e.g. the "ä" in one
        # of the IDs) verbatim, so the encoding is pinned to UTF-8 instead of
        # relying on the platform default. The context manager makes sure the
        # file is flushed and closed even if serialization fails.
        with open("output/example_docs.json", "w", encoding="utf-8") as out_file:
            json.dump(doc_contents,
                      out_file,
                      indent=4,
                      sort_keys=True,
                      ensure_ascii=False)
Exemplo n.º 3
0
class Freebase2DBpedia2Mongo(object):
    """Loads Freebase-to-DBpedia ID mappings into a Mongo collection.

    The mapping is read from a 2015-10 mapping file; a second (3.9) mapping
    file is used to choose a single DBpedia ID when a Freebase ID maps to
    more than one.
    """

    def __init__(self, config):
        """Validates ``config`` and stores the settings taken from it.

        :param config: dictionary with the KEY_COLLECTION, KEY_MAPPING_FILE
            and KEY_MAPPING_FILE_39 entries
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__fb2dbp_file = config[KEY_MAPPING_FILE]
        self.__fb2dbp_file_39 = config[
            KEY_MAPPING_FILE_39]  # used for removing duplicates
        self.__prefix = URIPrefix()
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)

    @staticmethod
    def __check_config(config):
        """Checks that the required settings are present and the files exist.

        If a key is missing, a mapping file path does not exist, or the check
        itself raises, an error message is printed and the process exits with
        status 1.

        :param config: configuration dictionary
        :return: the same ``config`` object, unmodified
        """
        try:
            # Presence is verified for all keys before any path is inspected.
            for key in (KEY_COLLECTION, KEY_MAPPING_FILE, KEY_MAPPING_FILE_39):
                if key not in config:
                    raise Exception(key + " is missing")
            for key in (KEY_MAPPING_FILE, KEY_MAPPING_FILE_39):
                if not os.path.exists(config[key]):
                    raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)
        return config

    def read_fb2dbp_file(self, is_39=False):
        """Reads a mapping file and generates an initial Freebase to DBpedia mapping.

        For the 3.9 file every DBpedia ID is converted to its 2015-10 form and
        kept. For the 2015-10 file an ID is kept only if the corresponding
        document in the DBpedia collection passes ``EntityUtils.is_entity()``
        (i.e. only proper DBpedia entities; redirect and disambiguation pages
        are ignored).

        :param is_39: if True reads the 3.9 mapping file, otherwise the
            2015-10 one
        :return: defaultdict mapping each Freebase ID to a set of DBpedia IDs
        """
        fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
        print("Processing " + fb2dbp_file + "...")

        t = Triple()
        p = NTriplesParser(t)
        i = 0
        fb2dbp_mapping = defaultdict(set)
        with FileUtils.open_file_by_type(fb2dbp_file) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # prefixing
                dbp_id = self.__prefix.get_prefixed(t.subject())
                fb_id = self.__prefix.get_prefixed(t.object())

                if is_39:
                    # if reading 3.9 file, converts ID to 2015-10 version
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    fb2dbp_mapping[fb_id].add(dbp_id)
                else:
                    # if reading 2015-10 file, keeps only the proper DBpedia
                    # entities
                    entity_utils = EntityUtils(
                        self.__mongo_dbpedia.find_by_id(dbp_id))
                    if entity_utils.is_entity():
                        fb2dbp_mapping[fb_id].add(dbp_id)

                # Only successfully parsed triples are counted.
                i += 1
                if i % 1000 == 0:
                    print(str(i // 1000) + "K lines are processed!")

        return fb2dbp_mapping

    def load_fb2dbp_mapping(self):
        """Resolves Freebase IDs that are mapped to more than one DBpedia ID.

        A Freebase ID with a single DBpedia ID is kept as is. When there are
        several, the IDs that also appear in the 3.9 mapping for the same
        Freebase ID are looked up and exactly one of them is kept. If the two
        mappings share no ID, all 2015-10 IDs are kept and the case is printed
        for inspection.

        :return: defaultdict mapping each Freebase ID to a list of DBpedia IDs
        """
        mappings = defaultdict(list)
        fb2dbp_39 = self.read_fb2dbp_file(is_39=True)
        fb2dbp = self.read_fb2dbp_file()

        for fb_id, dbp_ids in fb2dbp.items():
            if len(dbp_ids) <= 1:
                mappings[fb_id] = list(dbp_ids)
                continue

            dbp_ids_39 = fb2dbp_39.get(fb_id)
            # Every 3.9 ID is compared instead of one arbitrary set element,
            # so a shared ID is never missed and the 3.9 data is not mutated.
            shared = set(dbp_ids_39 or ()).intersection(dbp_ids)
            if shared:
                # min() makes the choice reproducible across runs.
                mappings[fb_id] = [min(shared)]
            else:
                mappings[fb_id] = list(dbp_ids)
                print(fb_id, "3.9", dbp_ids_39, "2015", dbp_ids)

        print(len(mappings))
        return mappings

    def build_collection(self, mappings):
        """Builds the Mongo collection from the given mappings.

        The target collection is dropped first. Every DBpedia ID is then
        added, one call per ID, to the "!<owl:sameAs>" field of the document
        identified by its Freebase ID.

        :param mappings: dictionary from Freebase ID to an iterable of
            DBpedia IDs
        """
        mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        mongo.drop()

        predicate = "!<owl:sameAs>"
        for i, (fb_id, dbp_ids) in enumerate(mappings.items(), start=1):
            for dbp_id in dbp_ids:
                mongo.append_set(fb_id, predicate, [dbp_id])
            if i % 1000 == 0:
                print(str(i // 1000) + "K entities are added!")