示例#1
0
    def __load_entity_abstracts(self, filename):
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            obj = ""
            if type(t.object()) is URIRef:
                # PLOGGER.error("Error: it is URIRef the parsed obj")
                pass
            else:
                obj = t.object().encode("utf-8")
                if len(obj) == 0:
                    continue  # skip empty objects
            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))
                pass

        PLOGGER.info("\n### Loading entity abstracts... Done.")
示例#2
0
 def __init__(self, config):
     self.__check_config(config)
     self.__collection = config[KEY_COLLECTION]
     self.__fb2dbp_file = config[KEY_MAPPING_FILE]
     self.__fb2dbp_file_39 = config[
         KEY_MAPPING_FILE_39]  # used for removing duplicates
     self.__prefix = URIPrefix()
     self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                  MONGO_COLLECTION_DBPEDIA)
示例#3
0
 def __init__(self, config):
     self.__elastic = None
     self.__config = config
     self.__index_name = config["index_name"]
     self.__dbpedia_path = config["dbpedia_files_path"]
     # For triple parsing
     self.__prefix = URIPrefix()
     self.__triple = Triple()
     self.__ntparser = NTriplesParser(self.__triple)
     # Entity abstract and type assignments kept in memory
     self.__entity_abstracts = {}
     self.__load_entity_abstracts()
     self.__types_entities = defaultdict(list)
     self.__load_entity_types()
示例#4
0
    def load_file(self, file_name, remap_by_exp=False):
        """Loads a TREC runfile.

        :param file_name: name of the run file
        :param remap_by_exp: whether scores are to be converted from the log-domain by taking their exp (default: False)
        """
        # load the file such that self.results[query_id] = res holds the results for a given query,
        # where res is a RetrievalResults object
        pre = URIPrefix()
        with open(file_name, "r") as f_baseline:
            for line in f_baseline:
                # Parse data
                fields = line.rstrip().split()
                if len(fields) != 6:
                    continue
                query_id, doc_id, score = fields[0], fields[2], float(
                    fields[4])
                if self.run_id is None:
                    self.run_id = fields[5]

                # Add parsed data
                if query_id not in self.__results:
                    self.__results[query_id] = RetrievalResults()  # initialize
                # remap exponentially the scores in log-domain to (0, 1)
                if remap_by_exp:
                    score = exp(score)
                self.__results[query_id].append(doc_id, score)
                # an additional data structure to make the normalization easier
                self.__sum_scores[query_id] = self.__sum_scores.get(
                    query_id, 0) + score
示例#5
0
    def parse_file(self, filename, triplehandler):
        """Parses file and calls callback function with the parsed triple"""
        PLOGGER.info("Processing " + filename + "...")

        prefix = URIPrefix()
        t = Triple(prefix)
        p = NTriplesParser(t)
        i = 0

        with open(filename) as f:
            for line in f:
                p.parsestring(line)
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # call the handler object with the parsed triple
                triplehandler.triple_parsed(t)

                i += 1
                if i % 10000 == 0:
                    PLOGGER.info(str(i / 1000) + "K lines processed")
示例#6
0
 def __init__(self, host, db, collection):
     self.__mongo = Mongo(host, db, collection)
     self.__prefix = URIPrefix()
     self.__m_id = None
     self.__m_contents = None
     logging.basicConfig(level="ERROR")  # no warnings from the rdf parser
示例#7
0
class NTriplesToMongoDB(object):
    def __init__(self, host, db, collection):
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        self.__m_id = None
        self.__m_contents = None
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Processes a triple.

          - Appends to previous triple if it's the same subject.
          - Otherwise inserts last triple and creates a new one.
        """
        if (self.__m_id is not None) and (self.__m_id == subj):
            if pred not in self.__m_contents:
                self.__m_contents[pred] = []
            self.__m_contents[pred].append(obj)
        else:
            self._write_to_mongo()
            self.__m_id = subj
            self.__m_contents = {pred: [obj]}

    def _write_to_mongo(self):
        """Writes triple (inserts or appends existing) to MongoDB collection."""
        if self.__m_id is not None:
            for field, value in self.__m_contents.items():
                self.__mongo.append_set(self.__m_id, field, value)
            # self.mongo.add(self.m_id, self.m_contents)
            self.__m_id = None
            self.__m_contents = None

    def drop(self):
        """Deletes the collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        :param subjects_redirecter: redirects dict.
        """
        print("Processing " + filename + "...")

        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        i = 0

        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())

                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # Object prefixing
                if type(t.object()) is URIRef:
                    obj = self.__prefix.get_prefixed(t.object())
                else:
                    obj = t.object()
                    if len(obj) == 0:
                        continue  # skip empty objects

                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)

                i += 1
                if i % 100000 == 0:
                    print(
                        str(i // 1000) + "K lines processed from " + filename)

        # process last triple
        self._write_to_mongo()
示例#8
0
class IndexerDBpediaTypes(object):
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]
        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)
        # Entity abstract and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a ttl file and returns subject and object pair.

        It is used for parsing DBpedia abstracts and entity types.
        The subject is always prefixed.
        For object URIs, it is returned prefixed if from DBpedia otherwise
        None (i.e., types); literal objects are always returned (i.e.,
        abstracts).
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj = None
        if type(self.__triple.object()) is URIRef:
            if self.__triple.object().startswith(
                    "http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(self.__triple.object())
        else:
            obj = self.__triple.object().encode("utf-8")

        return subj, obj

    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract and len(abstract) > 0:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")

    def __load_entity_types(self):
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if type(entity_type) != str:  # Likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info("  Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info("  Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info("  {}K lines processed".format(num_lines //
                                                                1000))
            PLOGGER.info("  Done.")

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts."""
        content = "\n".join([
            self.__entity_abstracts.get(e, b"").decode("utf-8")
            for e in self.__types_entities[type_name]
        ])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [
                self.__types_entities[type_name][i] for i in sample(
                    range(num_entities), amount_abstracts_to_sample)
            ]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = "\n".join([
                    content,
                    self.__entity_abstracts.get(entity, b"").decode("utf-8")
                ])
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                    break
                content = new_content_candidate

        return {"content": content}

    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: True iff it is required to overwrite the index (i.e. by
        creating it by force); False by default.
        :type force: bool
        :return:
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for type_name in self.__types_entities:
            PLOGGER.info("  Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)

        PLOGGER.info("  Done.")
示例#9
0
class Freebase2DBpedia2Mongo(object):
    def __init__(self, config):
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__fb2dbp_file = config[KEY_MAPPING_FILE]
        self.__fb2dbp_file_39 = config[
            KEY_MAPPING_FILE_39]  # used for removing duplicates
        self.__prefix = URIPrefix()
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)

    @staticmethod
    def __check_config(config):
        """Checks params and set default values."""
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if KEY_MAPPING_FILE_39 not in config:
                raise Exception(KEY_MAPPING_FILE_39 + " is missing")
            if not (os.path.exists(config[KEY_MAPPING_FILE])) or not (
                    os.path.exists(config[KEY_MAPPING_FILE_39])):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)
        return config

    def read_fb2dbp_file(self, is_39=False):
        """Reads the file and generates an initial mapping of Freebase to DBpedia IDs.
        Only proper DBpedia entities are considered; i.e. redirect and disambiguation pages are ignored.
        """
        fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
        print("Processing " + fb2dbp_file + "...")

        t = Triple()
        p = NTriplesParser(t)
        i = 0
        fb2dbp_mapping = defaultdict(set)
        with FileUtils.open_file_by_type(fb2dbp_file) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # prefixing
                dbp_id = self.__prefix.get_prefixed(t.subject())
                fb_id = self.__prefix.get_prefixed(t.object())

                # if reading 3.9 file, converts ID to 2015-10 version
                if is_39:
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    fb2dbp_mapping[fb_id].add(dbp_id)

                # if reading 2015-10 file, keeps only the proper DBpedia entities
                else:
                    entity_utils = EntityUtils(
                        self.__mongo_dbpedia.find_by_id(dbp_id))
                    if entity_utils.is_entity():
                        fb2dbp_mapping[fb_id].add(dbp_id)
                i += 1
                if i % 1000 == 0:
                    print(str(i // 1000) + "K lines are processed!")

        return fb2dbp_mapping

    def load_fb2dbp_mapping(self):
        """Checks Freebase IDs that are mapped to more than one entity and keeps only one of them."""
        mappings = defaultdict(list)
        fb2dbp_39 = self.read_fb2dbp_file(is_39=True)
        fb2dbp = self.read_fb2dbp_file()

        for fb_id, dbp_ids in fb2dbp.items():
            if len(dbp_ids) > 1:
                dbp_ids_39 = fb2dbp_39.get(fb_id, None)
                dbp_id_39 = dbp_ids_39.pop() if dbp_ids_39 else None
                if dbp_id_39 in dbp_ids:
                    mappings[fb_id].append(dbp_id_39)
                else:
                    mappings[fb_id] = list(dbp_ids)
                    print(fb_id, "3.9", dbp_id_39, "2015", dbp_ids)
            else:
                mappings[fb_id] = list(dbp_ids)

        print(len(mappings))
        return mappings

    def build_collection(self, mappings):
        """Builds Mongo collection"""
        mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        mongo.drop()

        predicate = "!<owl:sameAs>"
        i = 0
        for fb_id, dbp_ids in mappings.items():
            for dbp_id in dbp_ids:
                mongo.append_set(fb_id, predicate, [dbp_id])
            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K entities are added!")
示例#10
0
 def __init__(self, path_to_dbpedia, entities_file, output_dir):
     self.path_to_dbpedia = path_to_dbpedia
     self.entities_file = entities_file
     self.output_dir = output_dir
     self.sample_entities = []
     self.prefix = URIPrefix()
示例#11
0
class CreateDBpediaSample(object):
    def __init__(self, path_to_dbpedia, entities_file, output_dir):
        self.path_to_dbpedia = path_to_dbpedia
        self.entities_file = entities_file
        self.output_dir = output_dir
        self.sample_entities = []
        self.prefix = URIPrefix()

    def __load_sample_entities(self):
        """Loads the set of entities to be sampled from file."""
        self.sample_entities = FileUtils.read_file_as_list(self.entities_file)

    def __sample_file(self, dir, file):
        """Creates a local from a specific file in a given directory.

        :param dir: directory (relative to path_to_dbpedia)
        :param file:
        """
        t = Triple()
        p = NTriplesParser(t)
        infile = os.path.join(self.path_to_dbpedia, dir, file)
        outfile = os.path.join(self.output_dir, dir, file)
        print("Processing file " + file + " ...")
        i = 0
        with FileUtils.open_file_by_type(infile) as fin:
            fout = FileUtils.open_file_by_type(
                outfile,
                mode="w")  # output file will be of the same type as the input
            for line in fin:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue
                subj = self.prefix.get_prefixed(
                    t.subject())  # prefixing subject
                if subj in self.sample_entities:
                    fout.write(line)
                i += 1
                if i % 100000 == 0:
                    print(str(i // 1000) + "K lines processed")
            fout.close()

    def __sample_dir(self, dir, ext):
        """Creates a local from a specific directory.

        :param dir: directory (relative to path_to_dbpedia)
        :param ext: file extensions considered
        """
        print("Processing directory " + dir + " ...")
        # make sure the dir exists under the output directory
        outdir = os.path.join(self.output_dir, dir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        # make a local of each file from that directory with the given extension
        for root, dirs, files in os.walk(
                os.path.join(self.path_to_dbpedia, dir)):
            print(root)
            for file in files:
                if file.endswith(ext):
                    self.__sample_file(dir, file)

    def create_sample(self):
        """Creates a local."""
        self.__load_sample_entities()
        self.__sample_dir("core-i18n/en/", ".ttl.bz2")
        self.__sample_dir("links/", ".nt.bz2")
示例#12
0
    def build_index(self, force=False):
        """Builds the index.

        :param force: True iff it is required to overwrite the index (i.e. by creating it by force); False by default.
        :type force: bool
        :return:
        """
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)
        prefix = URIPrefix()

        # For indexing types in bulk
        types_bulk = {}  # dict from type id to type(=doc)

        # process type2entity file
        last_type = None
        entities = []
        lines_counter = 0
        types_counter = 0
        with FileUtils.open_file_by_type(self.__type2entity_file) as f:
            for line in f:
                line = line.decode()  # o.w. line is made of bytes
                if not line.startswith("<"):  # bad-formed lines in dataset
                    continue
                subj, obj = line.rstrip().split()

                type = prefix.get_prefixed(subj)  # subject prefixed
                entity = prefix.get_prefixed(obj)

                # use only DBpedia Ontology native types (no bibo, foaf, schema, etc.)
                if not type.startswith(DBO_PREFIX):
                    continue

                if last_type is not None and type != last_type:
                    # moving to new type, so:
                    # create a doc for this type, with all the abstracts for its entities, and store it in a bulk
                    types_counter += 1
                    # PLOGGER.info("\n\tFound {}-th type: {}\t\t with # of entities: {}".format(types_counter,
                    #                                                                           last_type,
                    #                                                                           len(entities)))
                    types_bulk[last_type] = self.__make_type_doc(entities, last_type)
                    entities = []  # important to reset it

                    if types_counter % BULK_LEN == 0:  # index the bulk of BULK_LEN docs
                        self.__elastic.add_docs_bulk(types_bulk)
                        types_bulk.clear()  # NOTE: important to reset it
                        PLOGGER.info("\tIndexing a bulk of {} docs (types)... OK. "
                                     "{} types already indexed.".format(BULK_LEN, types_counter))

                last_type = type
                entities.append(entity)

                lines_counter += 1
                if lines_counter % 10000 == 0:
                    # PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))
                    pass
                pass

        # index the last type
        types_counter += 1

        PLOGGER.info("\n\tFound {}-th (last) type: {}\t\t with # of entities: {}".format(types_counter, last_type,
                                                                                         len(entities)))

        types_bulk[last_type] = self.__make_type_doc(entities, last_type)
        self.__elastic.add_docs_bulk(types_bulk)  # a tiny bulk :)
        # no need to reset neither entities nor types_bulk :P
        # PLOGGER.info("Indexing a bulk of {} docs (types)... OK.".format(BULK_LEN))

        PLOGGER.info("\n### Indexing all {} found docs (types)... Done.".format(types_counter))