Example #3
def __get_field_value(self, value, only_uris=False):
    """Converts a MongoDB field value to indexable value(s) by resolving URIs.
    The input may be a string or a list; the return value has the same type.
    """
    if type(value) is list:
        nval = []  # holds resolved values
        for v in value:
            if not only_uris:
                nval.append(Lucene.preprocess(self.__resolve_uri(v)))
            elif self.__is_uri(v):  # only_uris is already True here
                nval.append(v)
        return nval
    else:
        if not only_uris:
            return Lucene.preprocess(self.__resolve_uri(value))
        elif self.__is_uri(value):  # only_uris is already True here
            return value
    return None
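
For illustration, a hedged sketch of the two call shapes, assuming a __resolve_uri that maps "<dbpedia:New_York>" to its surface form and a preprocess step that lowercases (both the URI and the resolved name are hypothetical):

# Hypothetical inputs/outputs:
#   self.__get_field_value("<dbpedia:New_York>")                  -> "new york"
#   self.__get_field_value(["<dbpedia:New_York>", "plain text"])  -> ["new york", "plain text"]
#   self.__get_field_value(["<dbpedia:New_York>", "plain text"],
#                          only_uris=True)                        -> ["<dbpedia:New_York>"]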
Example #5
def mentions(self):
    """Returns all mentions (among all annotations)."""
    if self.__mentions is None:
        self.__mentions = {}
        for interpretation in self.annotations['interpretations'].values():
            for mention, annot in interpretation['annots'].iteritems():
                if float(annot['score']) >= self.score_th:
                    analyzed_phrase = Lucene.preprocess(mention)
                    if (analyzed_phrase is not None) and (analyzed_phrase.strip() != ""):
                        self.__mentions[analyzed_phrase] = annot['score']
    return self.__mentions
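
The structure this method walks is not shown in the snippet; a minimal sketch of what self.annotations may look like, inferred only from the key accesses above (the mention strings and scores are hypothetical):

annotations = {
    'interpretations': {
        '0': {
            'annots': {
                'new york': {'score': '0.92'},
                'times square': {'score': '0.41'}
            }
        }
    }
}

With score_th = 0.5, only the preprocessed "new york" mention would be kept in __mentions.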
Example #6
def get_all_phrases(self):
    """Returns phrases for the ordered part of the model (query bigrams and multi-term mentions)."""
    all_phrases = set()
    for s_t in self.mentions:
        if len(s_t.split(" ")) > 1:
            all_phrases.add(s_t)
    analyzed_query = Lucene.preprocess(self.query)
    query_terms = analyzed_query.split(" ")
    for i in range(0, len(query_terms) - 1):
        bigram = " ".join([query_terms[i], query_terms[i + 1]])
        all_phrases.add(bigram)
    return all_phrases
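
A worked example, assuming Lucene.preprocess simply lowercases (the query and mentions are hypothetical):

# With
#   self.query    = "New York pizza"
#   self.mentions = {"new york": 0.9, "pizza": 0.8}
# the multi-term mentions contribute {"new york"} and the query bigrams are
# {"new york", "york pizza"}, so get_all_phrases() returns
# {"new york", "york pizza"}.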
Example #9
import argparse  # Lucene comes from the project's own wrapper module

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-index_dir", help="Indexes that should be merged")
    parser.add_argument("-merged_index_dir", help="Merged index directory")
    args = parser.parse_args()
    lucene = Lucene(args.merged_index_dir)
    lucene.open_writer()
    print "Merging indexes..."
    lucene.add_indexes(args.index_dir)
    lucene.close_writer()  # commit the merge before announcing it
    print "Indexes are now merged: " + args.merged_index_dir
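
A hypothetical invocation, assuming the snippet is saved as merge_indexes.py (both paths are placeholders):

python merge_indexes.py -index_dir /data/indexes/part_0 -merged_index_dir /data/indexes/merged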
Example #10
def get_p_q_d(self, query):
    """Returns the p(q|d) probabilities for the query."""
    # preprocess query
    query = Lucene.preprocess(query)
    # score collection, to determine a set of relevant documents
    res_first_pass = self._first_pass_scoring(self.lucene, query)
    # use LM from scorer.py to calculate p(q|d)
    scorer = Scorer.get_scorer(self.config['model'], self.lucene, query, self.config)
    p_q_d = self._second_pass_scoring(res_first_pass, scorer)
    return p_q_d
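
To make the quantity concrete: under a query likelihood model, p(q|d) is typically a product of smoothed per-term probabilities. A minimal, self-contained sketch with Jelinek-Mercer smoothing follows; it only illustrates the idea and is not the Scorer implementation used above.

import math

def log_p_q_d(query_terms, p_t_d, p_t_c, lambd=0.1):
    """log p(q|d) = sum_t log((1 - lambd) * p(t|d) + lambd * p(t|C))"""
    return sum(math.log((1 - lambd) * p_t_d.get(t, 0.0) + lambd * p_t_c[t])
               for t in query_terms)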
Example #11
class Indexer(object):
    def __init__(self, output_dir):
        self.contents = None
        self.lucene = Lucene(output_dir)
        self.lucene.open_writer()

    def __add_to_contents(self, field_name, field_value, field_type):
        """Adds field to document contents."""
        self.contents.append({'field_name': field_name,
                              'field_value': field_value,
                              'field_type': field_type})

    def index_file(self, record_id, replaced_annotated_record, cleaned_record, entities_record):
        """Builds the index entry for a single record.
        :param record_id: TREC-ID
        :param replaced_annotated_record: Cleaned record with entity mentions replaced by their Freebase-ID
        :param cleaned_record: Cleaned record (removed HTML-tags, etc.)
        :param entities_record: Entities annotated in the record
        """
        self.contents = []
        self.__add_to_contents(Lucene.FIELDNAME_ID, record_id, Lucene.FIELDTYPE_ID)
        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, cleaned_record, Lucene.FIELDTYPE_TEXT_TVP)
        self.__add_to_contents("contents_annotated", replaced_annotated_record, Lucene.FIELDTYPE_TEXT_NTVP)
        self.__add_to_contents("entities", entities_record, Lucene.FIELDTYPE_TEXT_NTVP)
        self.lucene.add_document(self.contents)

    def index_files(self, results):
        """
        Call index_file() on each record in results
        :param results: List of dictionaries
        """
        
        for warc_file in results:
            # Annotation .tsv is empty
            if warc_file is False:
                continue
            for record in warc_file:
                replaced_annotated_record = self.lucene.preprocess(record['replaced_record'])
                cleaned_record = self.lucene.preprocess(record['cleaned_record'])
                self.index_file(record['record_id'], replaced_annotated_record, cleaned_record, record['entities_record'])
        self.lucene.close_writer()
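
A hypothetical driver for the class above; the output path, record ID, and Freebase ID are placeholders, and the nested-list shape mirrors what index_files() iterates over:

indexer = Indexer("/data/clueweb_index")
records = [[{'record_id': "clueweb12-0000tw-00-00000",
             'replaced_record': "visit /m/02_286 today",
             'cleaned_record': "visit new york today",
             'entities_record': "/m/02_286"}]]
indexer.index_files(records)  # also closes the writer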
Example #12
class RetrievalELR(Retrieval):
    def __init__(self, model, query_file, annot_file, el_th=None, lambd=None, n_fields=None):
        config = {'model': model,
                  'index_dir': TERM_INDEX_DIR,
                  'query_file': query_file,
                  'lambda': lambd,
                  'th': el_th,
                  'n_fields': n_fields,
                  'first_pass_num_docs': 1000,
                  'num_docs': 100,
                  'fields': None}

        lambd_str = "_lambda" + "_".join([str(l) for l in lambd]) if lambd is not None else ""
        th_str = "_th" + str(el_th) if el_th is not None else ""
        fields_str = str(n_fields) if n_fields is not None else ""
        run_id = model + fields_str + th_str + lambd_str
        config['run_id'] = run_id
        config['output_file'] = OUTPUT_DIR + "/" + run_id + ".treceval"
        super(RetrievalELR, self).__init__(config)

        self.annot_file = annot_file

    def _load_query_annotations(self):
        """Loads field annotation file."""
        self.query_annotations = json.load(open(self.annot_file))

    def _open_index(self):
        self.lucene_term = Lucene(TERM_INDEX_DIR)
        self.lucene_uri = Lucene(URI_INDEX_DIR)
        self.lucene_term.open_searcher()
        self.lucene_uri.open_searcher()

    def _close_index(self):
        self.lucene_term.close_reader()
        self.lucene_uri.close_reader()

    def _second_pass_scoring(self, res1, scorer):
        """
        Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        print "\tSecond pass scoring... "
        results = RetrievalResults()
        for doc_id, orig_score in res1.get_scores_sorted():
            score = scorer.score_doc(doc_id)
            results.append(doc_id, score)
        print "done"
        return results

    def retrieve(self, store_json=True):
        """Scores queries and outputs results."""
        self._open_index()
        self._load_queries()
        self._load_query_annotations()

        # init output file
        if os.path.exists(self.config['output_file']):
            os.remove(self.config['output_file'])
        out = open(self.config['output_file'], "w")
        print "Number of queries:", len(self.queries)

        for qid in sorted(self.queries):
            query = Lucene.preprocess(self.queries[qid])
            print "scoring [" + qid + "] " + query
            query_annot = QueryAnnot(self.query_annotations[qid], self.config['th'], qid=qid)

            # score documents
            res1 = self._first_pass_scoring(self.lucene_term, query)
            scorer = ScorerMRF.get_scorer(self.lucene_term, self.lucene_uri, self.config, query_annot)
            results = self._second_pass_scoring(res1, scorer)

            # write results to output file
            results.write_trec_format(qid, self.config['run_id'], out, self.config['num_docs'])

        out.close()
        self._close_index()

        print "Output results: " + self.config['output_file']
Example #15
def T(self):
    """Returns all query terms."""
    if self.__T is None:
        analyzed_query = Lucene.preprocess(self.query)
        self.__T = analyzed_query.split(" ")
    return self.__T
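
# Assuming Lucene.preprocess lowercases and strips punctuation,
# self.query = "Barack Obama's parents" would give
# ["barack", "obama", "parents"] (computed once, then cached in __T).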
Example #18
    def build_index(self, index_config, only_uris=False, max_shingle_size=None):
        """Builds index.

        :param index_config: index configuration
        """
        lucene = Lucene(index_config['index_dir'], max_shingle_size)
        lucene.open_writer()  # generated shingle analyzer if the param is not None

        fieldtype_tv = Lucene.FIELDTYPE_ID_TV if only_uris else Lucene.FIELDTYPE_TEXT_TV
        fieldtype_tvp = Lucene.FIELDTYPE_ID_TV if only_uris else Lucene.FIELDTYPE_TEXT_TVP
        fieldtype_id = Lucene.FIELDTYPE_ID_TV if only_uris else Lucene.FIELDTYPE_ID

        # iterate through MongoDB contents
        i = 0
        for mdoc in self.mongo.find_all():

            # this is just to speed up things a bit
            # we can skip the document right away if the ID does not start
            # with "<dbpedia:"
            if not mdoc[Mongo.ID_FIELD].startswith("<dbpedia:"):
                continue

            # get back document from mongo with keys and _id field unescaped
            doc = self.mongo.get_doc(mdoc)

            # check must_have fields
            skip_doc = False
            for f, v in index_config['fields'].iteritems():
                if ("must_have" in v) and (v['must_have']) and (f not in doc):
                    skip_doc = True
                    break

            if skip_doc:
                continue

            # doc contents is represented as a list of fields
            # (mind that fields are multi-valued)
            self.contents = []

            # each predicate to a separate field
            for f in doc:
                if f == Mongo.ID_FIELD:  # id is special
                    self.__add_to_contents(Lucene.FIELDNAME_ID, doc[f], fieldtype_id)
                if f not in index_config['ignore']:
                    # get resolved field value(s) -- note that it might be a list
                    field_value = self.__get_field_value(doc[f], only_uris)
                    # ignore empty fields
                    if (field_value is None) or (field_value == []):
                        continue

                    to_catchall_content = bool(index_config['catchall_all'])

                    if f in index_config['fields']:
                        self.__add_to_contents(f, field_value, fieldtype_tvp)

                        # fields in index_config['fields'] are always added to catch-all content
                        to_catchall_content = True

                        # copy field value to other field(s)
                        # (copying is without term positions)
                        if "copy_to" in index_config['fields'][f]:
                            for f2 in index_config['fields'][f]['copy_to']:
                                self.__add_to_contents(f2, field_value, fieldtype_tv)

                    # copy field value to catch-all content field
                    # (copying is without term positions)
                    if to_catchall_content:
                        self.__add_to_contents(Lucene.FIELDNAME_CONTENTS, field_value, fieldtype_tv)

            # add document to index
            lucene.add_document(self.contents)

            i += 1
            if i % 1000 == 0:
                print str(i / 1000) + "K documents indexed"
        # close Lucene index
        lucene.close_writer()

        print "Finished indexing (" + str(i) + " documents in total)"