Пример #1
0
def index(mapper, bulk_size=10000):
    """Index the sentences of ``WP_ST_F`` into the ``WP_ST_INDEX_ID`` index.

    Each line of the file becomes one document. Every ``[A|B]`` (or ``[A]``)
    mention in a line is replaced by the ID the mapper returns for person
    ``A`` (underscores turned into spaces); the profession IDs of all
    mentioned persons are stored, de-duplicated, next to the sentence.
    The index is created with ``force=True`` before indexing starts.

    :param mapper: object providing ``get_id_from_person(name)`` and
        ``get_id_from_prof(prof)``
    :param bulk_size: number of sentences sent to Elastic per bulk request
    """
    pres_prof_mapping = gen_mappings()
    # Raw string: "\[" is an invalid escape sequence in a normal literal.
    mention_pattern = re.compile(r"\[(.*?)\]")

    # The file is opened before the index is touched (as before), and the
    # context manager guarantees it is closed even if indexing fails.
    with open(WP_ST_F, "r") as file:
        mappings = {
            "content": Elastic.analyzed_field(),
            "professions": Elastic.notanalyzed_field()
        }
        elastic = Elastic(WP_ST_INDEX_ID)
        elastic.create_index(mappings, force=True)

        doc_id = 0
        docs = {}
        for line in file:
            doc_id += 1
            profs = []
            while "[" in line:  # replace [A|B] with the ID of A
                match = mention_pattern.search(line)
                if match is None:
                    # stray "[" without a closing "]": nothing left to replace
                    break
                mention = match.group(1)
                name = mention.split("|")[0].replace("_", " ")
                entity_id = mapper.get_id_from_person(name)
                profs += [mapper.get_id_from_prof(prof)
                          for prof in pres_prof_mapping[name]]
                line = line.replace("[" + mention + "]", entity_id)
            docs[doc_id] = {"content": line, "professions": list(set(profs))}
            if len(docs) == bulk_size:  # flush a full bulk into elastic
                elastic.add_docs_bulk(docs)
                docs = {}
                print(doc_id / 1000, "K documents indexed.")

    # index the last (possibly partial) batch of sentences
    elastic.add_docs_bulk(docs)
Пример #2
0
 def get_mappings(self):
     """Build the index mappings, using an analyzed field for every entry.

     The mapped fields are the catch-all field, "abstract", and each field
     listed in ``self._fsdm_fields``.

     :return: dict from field name to its Elastic field mapping
     """
     field_names = [Elastic.FIELD_CATCHALL, "abstract"]
     field_names.extend(self._fsdm_fields)
     return {name: Elastic.analyzed_field() for name in field_names}
Пример #3
0
    def __init__(self, index_name, association_file, assoc_mode, retr_model,
                 retr_params, num_docs=None, field="content",
                 run_id="fusion", num=100):
        """Set up the scorer and configure the index similarity.

        :param index_name: name of index
        :param association_file: document-object association file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_model: similarity model handed to
            ``Elastic.update_similarity``
        :param retr_params: parameters of the similarity model
        :param num_docs: stored as ``_num_docs``; None by default
        :param field: field to be searched
        :param run_id: identifier of the run
        :param num: stored as ``_num``
        """
        # Index access; the similarity is updated as soon as the client exists.
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)
        self._model = retr_model
        self._params = retr_params
        self._elastic.update_similarity(self._model, self._params)

        # Retrieval settings.
        self._field = field
        self._num_docs = num_docs
        self._num = num

        # Document-object associations (the dicts start out empty).
        self._assoc_mode = assoc_mode
        self.association_file = association_file
        self.assoc_doc = {}
        self.assoc_obj = {}

        self.run_id = run_id
    def __init__(self, index_name, association_file, object_length_file,
                 assoc_mode, retr_params, field="content", run_id="fusion"):
        """Set up the scorer, reading BM25 parameters and index statistics.

        :param index_name: name of index
        :param association_file: document-object association file
        :param object_length_file: object length file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_params: BM25 parameter dict ("k1" defaults to 1.2,
            "b" defaults to 0.75)
        :param field: field to be searched
        :param run_id: identifier of the run
        """
        # Index access.
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)

        # BM25 parameters, with defaults when a key is absent.
        self._k1 = retr_params.get("k1", 1.2)
        self._b = retr_params.get("b", 0.75)

        # Length and collection statistics needed for scoring.
        self._field = field
        self._o_l = object_length(object_length_file)
        self._collection_length = self._elastic.coll_length(self._field)
        self._N = self._elastic.num_docs()

        # Document-object associations (the dicts start out empty).
        self._assoc_mode = assoc_mode
        self.association_file = association_file
        self.assoc_doc = {}
        self.assoc_obj = {}

        self.run_id = run_id
Пример #5
0
    def get_mappings(self):
        """Build the index mappings with not-analyzed searchable fields.

        Covers the catch-all field, every field in ``self._fsdm_fields`` and,
        after calling ``self.get_top_fields()``, every top field.

        :return: dict from field name to its Elastic field mapping
        """
        mappings = {
            name: Elastic.notanalyzed_searchable_field()
            for name in [Elastic.FIELD_CATCHALL, *self._fsdm_fields]
        }

        # The top fields are only read after get_top_fields() has been called.
        self.get_top_fields()
        for name in self.__top_fields:
            mappings[name] = Elastic.notanalyzed_searchable_field()

        return mappings
Пример #6
0
    def build(self, callback_get_doc_content, bulk_size=1000):
        """Build the index from the mongo collection, adding documents in bulks.

        The index is created by force. Documents for which the callback
        returns None are skipped and not counted.

        :param callback_get_doc_content: function that takes an unescaped
            mongo document and returns the content to index (or None to skip)
        :param bulk_size: number of documents added to the index per bulk
        """
        PLOGGER.info("Building " + self.__index_name + " ...")
        elastic = Elastic(self.__index_name)
        elastic.create_index(self.__mappings, model=self.__model, force=True)

        num_indexed = 0
        pending = {}
        for mongo_doc in self.__mongo.find_all(no_timeout=True):
            # the ID is read before the document is unescaped, as originally
            doc_id = Mongo.unescape(mongo_doc[Mongo.ID_FIELD])
            content = callback_get_doc_content(Mongo.unescape_doc(mongo_doc))
            if content is not None:
                pending[doc_id] = content
                num_indexed += 1
                if num_indexed % bulk_size == 0:
                    elastic.add_docs_bulk(pending)
                    pending = {}
                    PLOGGER.info(
                        "{}K documents indexed".format(num_indexed / 1000))

        # whatever is left over goes in as the final bulk
        elastic.add_docs_bulk(pending)
        PLOGGER.info(
            "Finished indexing ({} documents in total)".format(num_indexed))
Пример #7
0
    def build_index(self, force=False):
        """Create the type index and add one document per type.

        Documents are added one at a time rather than in bulk.

        :param force: passed on to ``create_index``; True to create the
            index by force. False by default.
        :type force: bool
        :return: None
        """
        index_name = self.__index_name
        PLOGGER.info("Building type index {}".format(index_name))
        self.__elastic = Elastic(index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for name in self.__types_entities:
            PLOGGER.info("  Adding {} ...".format(name))
            type_doc = self.__make_type_doc(name)
            self.__elastic.add_doc(name, type_doc)

        PLOGGER.info("  Done.")
Пример #8
0
    def __init__(self, index_name, association_file, assoc_mode, retr_params,
                 run_id="fusion", field="content", num=100):
        """Set up the scorer and read the collection length of the field.

        :param index_name: name of index
        :param association_file: document-object association file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_params: parameter dict; "lambda" is the smoothing
            parameter (defaults to 0.1)
        :param run_id: identifier of the run
        :param field: field to be searched
        :param num: stored as ``_num``
        """
        # Index access.
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)

        # Smoothing parameter and collection statistics.
        self._lambda = retr_params.get("lambda", 0.1)
        self._field = field
        self._collection_length = self._elastic.coll_length(self._field)

        # Document-object associations (the dicts start out empty).
        self._assoc_mode = assoc_mode
        self.association_file = association_file
        self.assoc_doc = {}
        self.assoc_obj = {}

        self._num = num
        self.run_id = run_id
Пример #9
0
def main():
    """Create the "toy_index" index and bulk-add five small documents."""
    # (title, content) pairs; document IDs are assigned 1..5 in this order.
    songs = [
        ("Rap God",
         "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"),
        ("Lose Yourself",
         "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"),
        ("Love The Way You Lie",
         "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"),
        # the content of this document is a list of two strings
        ("The Monster",
         ["gonna gonna I'm friends with the monster",
          "That's under my bed Get along with the voices inside of my head"]),
        ("Beautiful",
         "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"),
    ]
    docs = {
        doc_id: {"title": title, "content": content}
        for doc_id, (title, content) in enumerate(songs, start=1)
    }

    mappings = {
        field: Elastic.analyzed_field() for field in ("title", "content")
    }

    elastic = Elastic("toy_index")
    elastic.create_index(mappings, force=True)
    elastic.add_docs_bulk(docs)
    print("index has been built")
Пример #10
0
class EarlyFusionScorer(FusionScorer):
    """Early-fusion object scorer with a lambda-smoothed term model.

    For every query term, the term probabilities of the retrieved documents
    are fused per object (weighted by the association mode) and interpolated
    with the collection probability of the term.
    """

    def __init__(self, index_name, association_file, assoc_mode, retr_params, run_id="fusion", field="content",
                 num=100):
        """

        :param index_name: name of index
        :param association_file: document-object association file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_params: parameter dict; "lambda" is the smoothing
            parameter (defaults to 0.1)
        :param run_id: identifier of the run
        :param field: field to be searched
        :param num: number of documents requested from the search
        """
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)
        self._lambda = retr_params.get("lambda", 0.1)
        self._field = field
        self._collection_length = self._elastic.coll_length(self._field)
        self._assoc_mode = assoc_mode
        self._num = num
        self.association_file = association_file
        self.assoc_doc = {}
        self.assoc_obj = {}
        self.run_id = run_id

    def score_query(self, query):
        """
        Scores a given query.

        Each object in ``self.assoc_obj`` receives the sum, over the query
        terms, of ``log((1 - lambda) * P(t|o) + lambda * P(t|C)) * f(t,q)``.
        Terms without a positive collection frequency are ignored.

        :param query: query to be searched
        :return: RetrievalResults wrapping the object -> score dict (pqo)
        """
        # retrieving documents
        aquery = self._elastic.analyze_query(query)
        pr = self._elastic.search(aquery, self._field, num=self._num)
        q = self.parse(aquery)

        # scoring objects, i.e., computing P(q|o)
        pqo = {}
        for t, ftq in Counter(q).items():
            # Scores each query term and sums up, i.e., computing P(t|o)

            # Gets term frequency in collections
            term = stemmer.stemWords(t.split())[0]
            try:
                ftc = self._elastic.coll_term_freq(term, self._field)
            except Exception:  # lookup failed; treat the term as unknown
                ftc = None
            if not ftc:
                # None (unknown term) or 0: a zero collection probability
                # would make math.log() below fail for unmatched objects.
                print("Ignore term", t)
                continue

            ptc = ftc / self._collection_length

            # Fuses ptd for each object
            ptd_fused = {}
            for doc_id in pr.keys():
                if doc_id not in self.assoc_doc:
                    continue
                try:
                    ftd = self._elastic.term_freq(doc_id, term, self._field)
                except Exception:  # the content of doc is empty
                    ftd = 0
                doc_length = self._elastic.doc_length(doc_id, self._field)
                # an empty document contributes probability 0 instead of
                # raising ZeroDivisionError
                ptd = ftd / doc_length if doc_length else 0
                for object_id in self.assoc_doc[doc_id]:
                    if self._assoc_mode == FusionScorer.ASSOC_MODE_BINARY:
                        w_do = 1
                    elif self._assoc_mode == FusionScorer.ASSOC_MODE_UNIFORM:
                        w_do = 1 / len(self.assoc_obj[object_id])
                    else:
                        w_do = 0  # this should never happen
                    ptd_fused[object_id] = ptd_fused.get(object_id, 0) + ptd * w_do

            # Adds pto to pqo
            for object_id in self.assoc_obj.keys():
                fptd = ptd_fused.get(object_id, 0)
                pto = math.log((1 - self._lambda) * fptd + self._lambda * ptc) * ftq
                pqo[object_id] = pqo.get(object_id, 0) + pto

        return RetrievalResults(pqo)
Пример #11
0
class IndexerDBpediaTypes(object):
    """Builds an Elastic index holding one document per DBpedia type.

    The document of a type is the concatenation of the short abstracts of
    the entities assigned to that type. Abstracts and type assignments are
    loaded into memory when the indexer is constructed.
    """
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        """Read the configuration and load abstracts and type assignments.

        :param config: dict with the keys "index_name" and
            "dbpedia_files_path"
        """
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]
        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)
        # Entity abstract and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        """Name of the index."""
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a ttl file and returns subject and object pair.

        It is used for parsing DBpedia abstracts and entity types.
        The subject is always prefixed.
        For object URIs, it is returned prefixed if from DBpedia otherwise
        None (i.e., types); literal objects are always returned, UTF-8
        encoded as bytes (i.e., abstracts).

        :return: (subject, object), or (None, None) if the line could not be
            parsed as a triple
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj = None
        if isinstance(self.__triple.object(), URIRef):
            if self.__triple.object().startswith(
                    "http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(self.__triple.object())
        else:
            obj = self.__triple.object().encode("utf-8")

        return subj, obj

    def __load_entity_abstracts(self):
        """Loads the entity abstracts file into ``self.__entity_abstracts``."""
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            # Only non-empty literal objects (bytes) are abstracts; a
            # prefixed URI string would later fail on .decode().
            if isinstance(abstract, bytes) and abstract:
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")

    def __load_entity_types(self):
        """Loads the entity type files into ``self.__types_entities``."""
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if not isinstance(entity_type, str):
                    # Likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info("  Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info("  Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                # only accepted assignments are counted
                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info("  {}K lines processed".format(num_lines //
                                                                1000))
            PLOGGER.info("  Done.")

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts.

        :param type_name: type whose entities' abstracts are concatenated
        :return: dict with the single key "content"
        """
        content = "\n".join([
            self.__entity_abstracts.get(e, b"").decode("utf-8")
            for e in self.__types_entities[type_name]
        ])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [
                self.__types_entities[type_name][i] for i in sample(
                    range(num_entities), amount_abstracts_to_sample)
            ]

            # Collect the pieces and join once instead of rebuilding the
            # whole string per abstract. The leading "" reproduces the
            # newline that precedes the first abstract.
            parts = [""]
            size = 0
            for entity in entities_sample:
                abstract = self.__entity_abstracts.get(
                    entity, b"").decode("utf-8")
                candidate_size = size + 1 + len(abstract)
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if candidate_size > MAX_BULKING_DOC_SIZE:
                    break
                parts.append(abstract)
                size = candidate_size
            content = "\n".join(parts)

        return {"content": content}

    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: True iff it is required to overwrite the index (i.e. by
        creating it by force); False by default.
        :type force: bool
        :return:
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for type_name in self.__types_entities:
            PLOGGER.info("  Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)

        PLOGGER.info("  Done.")
Пример #12
0
from nordlys.core.retrieval.elastic import Elastic

# Compare the top-3 scores of the same query under two similarity models.
index_name = "cerc-expert"
query = ["climate", "change"]

# First run: "LMJelinekMercer" similarity with lambda = 0.1.
model = "LMJelinekMercer"
params = {"lambda": 0.1}
elas = Elastic(index_name)
elas.update_similarity(model, params)
pr = elas.search(query, "content")['hits']
for _rank in range(3):
    print(pr[_rank]['_score'])

# Second run: "BM25" similarity with k1 = 1.2 and b = 0.75.
mode1 = "BM25"
params1 = {"k1": 1.2, "b": 0.75}
elas1 = Elastic(index_name)
elas1.update_similarity(mode1, params1)
# NOTE(review): the search goes through `elas`, not `elas1`; both wrap the
# same index name, so this is presumably equivalent - confirm.
pr1 = elas.search(query, "content")['hits']
for _rank in range(3):
    print(pr1[_rank]['_score'])
Пример #13
0
class Statistics(object):
    """Term and length statistics read from Elastic term vectors.

    Only the "title" and "content" fields are supported; every method
    returns None for any other field.

    NOTE(review): in the Elasticsearch term vectors API, ``sum_ttf`` is a
    field-level statistic rather than a per-document length - confirm that
    ``DL`` and ``CL`` return what their callers expect.
    """
    index_name = "blog_test6"
    elastic = Elastic(index_name)
    _FIELDS = ("title", "content")  # the only fields handled below

    def __init__(self):
        self.index_name = self.index_name

    @staticmethod
    def _stem(term):
        """Return the stem of the first whitespace-separated token of term."""
        return stemmer.stemWords(term.split())[0]

    @staticmethod
    def _term_freq(termvector, field, stemmed_term):
        """Return the term's frequency in a term-vector response, 0 if absent."""
        try:
            return termvector['term_vectors'][field]['terms'][stemmed_term][
                'term_freq']
        except (KeyError, TypeError):  # field/term missing or no response
            return 0

    def _search_hits(self, term):
        """Return the hits (at most 100 are requested) of a search for term."""
        return self.elastic.search_all(self.index_name,
                                       term,
                                       fields_return="",
                                       start=0,
                                       num=100)['hits']

    def TF(self, term, doc_id, field):
        """Return the frequency of term in the given field of a document.

        :param term: term to look up (stemmed before the lookup)
        :param doc_id: document ID
        :param field: "title" or "content"
        :return: term frequency, 0 if the term is not found
        """
        stat = self.elastic.get_termvector_all(docid=doc_id)
        stemmed = self._stem(term)
        if field not in self._FIELDS:
            return None
        return self._term_freq(stat, field, stemmed)

    def DL(self, doc_id, field):
        """Return the ``sum_ttf`` field statistic of a document's term vector.

        :param doc_id: document ID
        :param field: "title" or "content"
        """
        stat = self.elastic.get_termvector_all(docid=doc_id)
        if field not in self._FIELDS:
            return None
        return stat['term_vectors'][field]['field_statistics']['sum_ttf']

    def TF_C(self, term, field):
        """Return the frequency of term summed over the search hits for it.

        :param term: term to search for and count
        :param field: "title" or "content"
        :return: summed term frequency over the retrieved hits
        """
        hits = self._search_hits(term)
        if field not in self._FIELDS:
            return None
        if not hits:
            return 0
        stemmed = self._stem(term)  # same for every hit, so computed once
        total = 0
        for item in hits:
            record = self.elastic.get_termvector_all(docid=item['_id'])
            total += self._term_freq(record, field, stemmed)
        return total

    def CL(self, term, field):
        """Return ``sum_ttf`` of the field summed over the search hits for term.

        Only the requested field is read, so a hit lacking the other field
        no longer raises KeyError.

        :param term: term to search for
        :param field: "title" or "content"
        """
        hits = self._search_hits(term)
        if field not in self._FIELDS:
            return None
        total = 0
        for item in hits:
            record = self.elastic.get_termvector_all(docid=item['_id'])
            total += record['term_vectors'][field]['field_statistics'][
                'sum_ttf']
        return total
class EarlyFusionScorer(FusionScorer):
    """Early-fusion object scorer using a BM25-style scoring formula.

    Term frequencies of the retrieved documents are fused per object
    (weighted by the association mode) and plugged into a BM25-style term
    score that is normalised by the object length.
    """

    def __init__(self,
                 index_name,
                 association_file,
                 object_length_file,
                 assoc_mode,
                 retr_params,
                 field="content",
                 run_id="fusion"):
        """

        :param index_name: name of index
        :param association_file: document-object association file
        :param object_length_file: object length file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_params: BM25 parameter dict ("k1" defaults to 1.2,
            "b" defaults to 0.75)
        :param field: field to be searched
        :param run_id: identifier of the run
        """
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)
        self._k1 = retr_params.get("k1", 1.2)
        self._b = retr_params.get("b", 0.75)
        self._field = field
        self._o_l = object_length(object_length_file)
        self._collection_length = self._elastic.coll_length(self._field)
        self._N = self._elastic.num_docs()
        self._assoc_mode = assoc_mode
        self.association_file = association_file
        self.assoc_doc = {}
        self.assoc_obj = {}
        self.run_id = run_id

    def score_query(self, query):
        """
        Scores a given query.

        :param query: query to be searched
        :return: RetrievalResults wrapping the object -> score dict (pqo);
            empty when no objects are associated
        """
        if not self.assoc_obj:
            # no objects to score; also avoids dividing by zero for avg_ol
            return RetrievalResults({})

        aquery = self._elastic.analyze_query(query)
        pr = self._elastic.search(aquery, self._field)
        avg_ol = self._collection_length / len(self.assoc_obj)
        q = self.parse(aquery)

        # Scoring objects, i.e., computing P(q|o)
        pqo = {}
        for t, ftq in Counter(q).items():
            # Scores each query term and sums up, i.e., computing P(t|o)

            # Retrieving documents and gets IDF
            # NOTE(review): n is the number of results this search returns,
            # which may be capped by the search's default result size -
            # confirm it really is the document frequency of t.
            n = len(self._elastic.search(t, self._field))
            if n == 0:
                continue
            idf = math.log((self._N - n + 0.5) / (n + 0.5))

            # Fuses f(t,o) for each object
            term = stemmer.stemWords(t.split())[0]
            ftd_fused = {}
            for doc_id in pr.keys():
                if doc_id not in self.assoc_doc:
                    continue
                try:
                    ftd = self._elastic.term_freq(doc_id, term, self._field)
                except Exception:  # doc without content
                    ftd = 0
                for object_id in self.assoc_doc[doc_id]:
                    if self._assoc_mode == FusionScorer.ASSOC_MODE_BINARY:
                        w_do = 1
                    elif self._assoc_mode == FusionScorer.ASSOC_MODE_UNIFORM:
                        w_do = 1 / len(self.assoc_obj[object_id])
                    else:
                        w_do = 0  # this should never happen
                    ftd_fused[object_id] = ftd_fused.get(object_id,
                                                         0) + ftd * w_do

            # Add pto into pqo
            for object_id in self.assoc_obj.keys():
                ol = int(self._o_l[object_id])
                fftd = ftd_fused.get(object_id, 0)
                score = (fftd * (self._k1 + 1)) / (
                    fftd + self._k1 * (1 - self._b + self._b * ol / avg_ol))
                pqo[object_id] = pqo.get(object_id, 0) + idf * score

        return RetrievalResults(pqo)
Пример #15
0
class IndexerDBpediaTypes(object):
    """Builds an Elastic index holding one document per DBpedia type.

    The document of a type is the concatenation of the short abstracts of
    its entities, as listed in a type-to-entity file in which the lines of
    one type are consecutive.
    """
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        # ID_KEY: Elastic.notanalyzed_field(),
        CONTENT_KEY: Elastic.analyzed_field(),
    }

    def __init__(self, config):
        """Read the configuration and load the entity abstracts into memory.

        :param config: dict with the keys "index_name", "type2entity_file"
            and "entity_abstracts_file", and optionally "model"
        """
        self.__elastic = None
        self.__config = config
        # NOTE(review): the model is stored but never handed to
        # create_index() in this class - confirm whether that is intended.
        self.__model = config.get("model", Elastic.BM25)
        self.__index_name = config["index_name"]
        self.__type2entity_file = config["type2entity_file"]
        self.__entity_abstracts = {}
        self.__load_entity_abstracts(config["entity_abstracts_file"])

    @property
    def name(self):
        """Name of the index."""
        return self.__index_name

    def __load_entity_abstracts(self, filename):
        """Loads the abstracts file into ``self.__entity_abstracts``.

        Values are stored as UTF-8 encoded bytes, keyed by the prefixed
        subject.

        :param filename: path of the entity abstracts file
        """
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            # Bytes (not str) default, so every stored value supports the
            # .decode() call made when the type document is assembled.
            obj = b""
            if not isinstance(t.object(), URIRef):
                obj = t.object().encode("utf-8")
                if len(obj) == 0:
                    continue  # skip empty objects
            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")

    def __make_type_doc(self, entities, last_type):
        """Gets the document representation of a type to be indexed, from its entity short abstracts.

        :param entities: entities of the type
        :param last_type: name of the type (used for logging only)
        :return: dict with the single key CONTENT_KEY
        """
        content = ABSTRACTS_SEPARATOR.join([self.__entity_abstracts.get(e, b"").decode("utf-8")
                                            for e in entities])

        if len(content) > MAX_BULKING_DOC_SIZE:

            PLOGGER.info("Type {} has content larger than allowed: {}.".format(last_type, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t. Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            amount_abstracts_to_sample = min(floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN), len(entities))
            entities_sample = [entities[i] for i in sample(range(len(entities)), amount_abstracts_to_sample)]

            # Collect the pieces and join once instead of rebuilding the whole string per abstract.
            # The leading "" reproduces the separator that precedes the first abstract.
            parts = [""]
            size = 0
            separator_len = len(ABSTRACTS_SEPARATOR)
            for entity in entities_sample:
                abstract = self.__entity_abstracts.get(entity, b"").decode("utf-8")
                candidate_size = size + separator_len + len(abstract)
                # we add an abstract only if by doing so it will not exceed MAX_BULKING_DOC_SIZE
                if candidate_size > MAX_BULKING_DOC_SIZE:
                    break
                parts.append(abstract)
                size = candidate_size
            content = ABSTRACTS_SEPARATOR.join(parts)

        return {CONTENT_KEY: content}

    def build_index(self, force=False):
        """Builds the index.

        Types are indexed in bulks of BULK_LEN documents; the last type is
        indexed in a final bulk. Nothing is indexed when the file contains
        no DBpedia Ontology type.

        :param force: True iff it is required to overwrite the index (i.e. by creating it by force); False by default.
        :type force: bool
        :return:
        """
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)
        prefix = URIPrefix()

        # For indexing types in bulk
        types_bulk = {}  # dict from type id to type(=doc)

        # process type2entity file
        last_type = None
        entities = []
        types_counter = 0
        with FileUtils.open_file_by_type(self.__type2entity_file) as f:
            for line in f:
                line = line.decode()  # o.w. line is made of bytes
                if not line.startswith("<"):  # bad-formed lines in dataset
                    continue
                subj, obj = line.rstrip().split()

                type_name = prefix.get_prefixed(subj)  # subject prefixed
                entity = prefix.get_prefixed(obj)

                # use only DBpedia Ontology native types (no bibo, foaf, schema, etc.)
                if not type_name.startswith(DBO_PREFIX):
                    continue

                if last_type is not None and type_name != last_type:
                    # moving to new type, so:
                    # create a doc for this type, with all the abstracts for its entities, and store it in a bulk
                    types_counter += 1
                    types_bulk[last_type] = self.__make_type_doc(entities, last_type)
                    entities = []  # important to reset it

                    if types_counter % BULK_LEN == 0:  # index the bulk of BULK_LEN docs
                        self.__elastic.add_docs_bulk(types_bulk)
                        types_bulk.clear()  # NOTE: important to reset it
                        PLOGGER.info("\tIndexing a bulk of {} docs (types)... OK. "
                                     "{} types already indexed.".format(BULK_LEN, types_counter))

                last_type = type_name
                entities.append(entity)

        # index the last type; skipped when no type was found at all, so no document with a None id is created
        if last_type is not None:
            types_counter += 1

            PLOGGER.info("\n\tFound {}-th (last) type: {}\t\t with # of entities: {}".format(types_counter, last_type,
                                                                                             len(entities)))

            types_bulk[last_type] = self.__make_type_doc(entities, last_type)
            self.__elastic.add_docs_bulk(types_bulk)  # a tiny bulk :)

        PLOGGER.info("\n### Indexing all {} found docs (types)... Done.".format(types_counter))
Пример #16
0
    def build_index(self, force=False):
        """Builds the index.

        Reads the type-to-entity file, in which the lines of one type are
        expected to be consecutive, and indexes one document per DBpedia
        Ontology type in bulks of BULK_LEN documents. Nothing is indexed
        when the file contains no such type.

        :param force: True iff it is required to overwrite the index (i.e. by creating it by force); False by default.
        :type force: bool
        :return:
        """
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)
        prefix = URIPrefix()

        # For indexing types in bulk
        types_bulk = {}  # dict from type id to type(=doc)

        # process type2entity file
        last_type = None
        entities = []
        types_counter = 0
        with FileUtils.open_file_by_type(self.__type2entity_file) as f:
            for line in f:
                line = line.decode()  # o.w. line is made of bytes
                if not line.startswith("<"):  # bad-formed lines in dataset
                    continue
                subj, obj = line.rstrip().split()

                # named type_name so the builtin `type` is not shadowed
                type_name = prefix.get_prefixed(subj)  # subject prefixed
                entity = prefix.get_prefixed(obj)

                # use only DBpedia Ontology native types (no bibo, foaf, schema, etc.)
                if not type_name.startswith(DBO_PREFIX):
                    continue

                if last_type is not None and type_name != last_type:
                    # moving to new type, so:
                    # create a doc for this type, with all the abstracts for its entities, and store it in a bulk
                    types_counter += 1
                    types_bulk[last_type] = self.__make_type_doc(entities, last_type)
                    entities = []  # important to reset it

                    if types_counter % BULK_LEN == 0:  # index the bulk of BULK_LEN docs
                        self.__elastic.add_docs_bulk(types_bulk)
                        types_bulk.clear()  # NOTE: important to reset it
                        PLOGGER.info("\tIndexing a bulk of {} docs (types)... OK. "
                                     "{} types already indexed.".format(BULK_LEN, types_counter))

                last_type = type_name
                entities.append(entity)

        # index the last type; skipped when no type was found at all, so no document with a None id is created
        if last_type is not None:
            types_counter += 1

            PLOGGER.info("\n\tFound {}-th (last) type: {}\t\t with # of entities: {}".format(types_counter, last_type,
                                                                                             len(entities)))

            types_bulk[last_type] = self.__make_type_doc(entities, last_type)
            self.__elastic.add_docs_bulk(types_bulk)  # a tiny bulk :)

        PLOGGER.info("\n### Indexing all {} found docs (types)... Done.".format(types_counter))
Пример #17
0
class LateFusionScorer(FusionScorer):
    """Late-fusion scorer: ranks objects by aggregating the retrieval scores
    of the documents associated with them.

    For each retrieved document, its score is propagated to every object
    listed for that document in ``self.assoc_doc``, weighted by the
    document-object association weight selected through ``assoc_mode``.
    """

    def __init__(self,
                 index_name,
                 association_file,
                 assoc_mode,
                 retr_model,
                 retr_params,
                 num_docs=None,
                 field="content",
                 run_id="fusion",
                 num=100):
        """

        :param index_name: name of index
        :param association_file: document-object association file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_model: retrieval (similarity) model, passed to
            Elastic.update_similarity
        :param retr_params: parameter in similarity method
        :param num_docs: number of top-ranked documents to fuse; None means
            all retrieved documents are used
        :param field: index field to search in
        :param run_id: identifier of the run
        :param num: number of documents requested from the index per query
        """
        self._index_name = index_name
        self._field = field
        self._num_docs = num_docs
        self._elastic = Elastic(self._index_name)
        self._model = retr_model
        self._params = retr_params
        # NOTE: this reconfigures the similarity of the index at construction time.
        self._elastic.update_similarity(self._model, self._params)
        self.association_file = association_file
        # Both association maps start empty and are not filled in this class;
        # score_query reads assoc_doc as doc_id -> iterable of object ids and
        # assoc_obj as object_id -> sized collection.
        # NOTE(review): presumably populated from association_file by a loader
        # defined elsewhere (e.g. in FusionScorer) - confirm.
        self.assoc_doc = {}
        self.assoc_obj = {}
        self.run_id = run_id
        self._assoc_mode = assoc_mode
        self._num = num

    def score_query(self, query):
        """
        Scores a given query.

        :param query: query to be searched
        :return: RetrievalResults built from the pqo dict (object id -> score)
        """
        # retrieving documents
        aquery = self._elastic.analyze_query(query)  # analyzed query
        res = self._elastic.search(aquery, self._field, num=self._num)

        # scoring objects, i.e., computing P(q|o)
        pqo = {}
        for i, doc_id in enumerate(res.keys()):
            # Consider only the top num_docs documents. The index i is 0-based,
            # so exactly num_docs documents have been processed once
            # i == num_docs (the previous "i + 1 == num_docs" check dropped the
            # last of them and returned nothing for num_docs=1).
            if self._num_docs is not None and i >= self._num_docs:
                break
            if doc_id not in self.assoc_doc:
                continue  # document has no associated objects
            doc_score = res[doc_id]
            for object_id in self.assoc_doc[doc_id]:
                if self._assoc_mode == FusionScorer.ASSOC_MODE_BINARY:
                    w_do = 1
                elif self._assoc_mode == FusionScorer.ASSOC_MODE_UNIFORM:
                    # weight is spread evenly over the entries stored for the object
                    w_do = 1 / len(self.assoc_obj[object_id])
                else:
                    w_do = 0  # this should never happen
                pqo[object_id] = pqo.get(object_id, 0) + doc_score * w_do

        return RetrievalResults(pqo)