import json

# ElasticCache, FileUtils, Instance, Instances, ScorerELR, get_mapping_query,
# load_annot, load_run, and trec_format are assumed to come from the enclosing
# package.


def main(args):
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])
    queries = json.load(open(config["query_file"], "r"))
    mappings = json.load(open(config["mapping_file"], "r"))
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])
    instances = Instances()

    # Scores the documents of each query and collects the results
    out_file = open(config["output_file"], "w")
    qid_int = 0
    for qid, query in sorted(queries.items()):
        print("Scoring", qid, "...")
        results, libsvm_str = {}, ""
        query_len = len(elastic_term.analyze_query(query).split())
        scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid], query_len, lambdas)
        for doc_id, p_T_d in sorted(run[qid].items()):
            query_mappings = get_mapping_query(annots[qid], mappings)
            p_E_d = scorer.score_doc(doc_id, query_mappings)
            properties = {"doc_id": doc_id, "query": query, "qid": qid, "qid_int": qid_int}
            features = {"p_T_d": p_T_d, "p_E_d": p_E_d}
            ins = Instance(qid + "_" + doc_id, features=features, properties=properties)
            instances.add_instance(ins)
            # libsvm_str += ins.to_libsvm(qid_prop="qid_int")
            results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)
        qid_int += 1

        # Writes results in TREC format
        out_str = trec_format(results, qid, "elr")
        out_file.write(out_str)
    out_file.close()
    print("Output file:", config["output_file"])

    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
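# Hedged entry-point sketch: main() above only requires an object with a
# .config attribute; the argparse wiring below is illustrative, though the
# config keys named in the help string are the ones main() actually reads.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="path to a JSON config file with keys: "
                        "text_index, uri_index, lambdas, query_file, mapping_file, "
                        "annot_file, run_file, output_file, json_file")
    main(parser.parse_args())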
# ElasticCache, FileUtils, and WP_ST_INDEX_ID are assumed to come from the
# enclosing package.


class FeaturesTermStats(object):
    CONTENT_FIELD = "content"
    STOPWORDS = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
                 "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they",
                 "this", "to", "was", "will", "with"]

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def __count_cooccurrences(self, person_id, phrase):
        """Counts sentences matching both the person ID and the analyzed phrase."""
        body = {"query": {"bool": {"must": [
            {"match": {"content": person_id}},
            {"match_phrase": {"content": self.__elastic.analyze_query(phrase)}}]}}}
        return len(self.__elastic.search_complex(body, self.CONTENT_FIELD, num=10000).keys())

    def get_per_nat_tf(self, person_id, nats):
        r"""Computes freqPerNat: \frac{|\{s : pe \in s, nt \in s\}|}{|S(pe)|}

        :param person_id: person entity ID
        :param nats: nationality noun and adjective, e.g. ["Germany", "German"]
        :return: freqPerNat for the noun and for the adjective
        """
        body = {"query": {"bool": {"must": {"term": {"content": person_id}}}}}
        doc_ids = self.__elastic.search_complex(body, self.CONTENT_FIELD, num=10000).keys()
        n_s_pe = len(doc_ids)  # number of sentences containing the person
        n_co_noun = self.__count_cooccurrences(person_id, nats[0])
        n_co_adj = self.__count_cooccurrences(person_id, nats[1])
        if n_s_pe == 0:
            return 0.0, 0.0
        return n_co_noun / n_s_pe, n_co_adj / n_s_pe

    def generate_features(self, kb_file, output_file1, output_file2):
        """Generates freq-person-nationality features."""
        fout1 = open(output_file1, "w")
        fout2 = open(output_file2, "w")
        # Writes TSV headers
        fout1.write("\t".join(["person", "nationality", "freq_person_nationality_noun"]) + "\n")
        fout2.write("\t".join(["person", "nationality", "freq_person_nationality_adj"]) + "\n")
        with FileUtils.open_file_by_type(kb_file) as kb_f:
            line_count = 1
            for line in kb_f:
                print(line_count)
                line_count += 1
                person_id, nat_id, noun, adj = line.strip().split("\t")
                fpn_noun, fpn_adj = self.get_per_nat_tf(person_id, [noun, adj])
                fout1.write("\t".join([person_id, nat_id, str(fpn_noun)]) + "\n")
                fout2.write("\t".join([person_id, nat_id, str(fpn_adj)]) + "\n")
        fout1.close()
        fout2.close()
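# Hedged usage sketch for FeaturesTermStats: the index defaults to
# WP_ST_INDEX_ID as above; the file paths below are placeholders, and the KB
# file is assumed to be a TSV with one
# "person_id<TAB>nationality_id<TAB>noun<TAB>adj" row per line, matching the
# unpacking in generate_features().
if __name__ == "__main__":
    stats = FeaturesTermStats()
    stats.generate_features("data/person_nationality.tsv",
                            "output/freq_per_nat_noun.tsv",
                            "output/freq_per_nat_adj.tsv")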
import json
import sys

# Elastic, ElasticCache, Scorer, and ScorerLM are assumed to come from the
# enclosing package.


class Retrieval(object):
    """Loads config file, checks params, and sets default values.

    :param config: retrieval config (JSON config file or a dictionary) of the shape:

    ::

        {
            "index_name": name of the index,
            "first_pass": {
                "num_docs": number of documents in first-pass scoring (default: 1000)
                "field": field used in first-pass retrieval (default: Elastic.FIELD_CATCHALL)
                "fields_return": comma-separated list of fields to return for each hit (default: "")
            },
            "num_docs": number of documents to return (default: 100)
            "start": starting offset for ranked documents (default: 0)
            "model": name of retrieval model; accepted values: [lm, mlm, prms] (default: lm)
            "field": field name for LM (default: catchall)
            "fields": list of fields for PRMS (default: [catchall])
            "field_weights": dictionary with fields and corresponding weights for MLM (default: {catchall: 1})
            "smoothing_method": accepted values: [jm, dirichlet] (default: dirichlet)
            "smoothing_param": value of lambda or mu; accepted values: [float or "avg_len"]
                (jm default: 0.1, dirichlet default: 2000)
            "query_file": name of query file (JSON),
            "output_file": name of output file,
            "run_id": run id for TREC output
        }
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)
        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")
            # Checks first-pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("num_docs", None) is None:
                config["first_pass"]["num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25
            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100
            if config.get("model", None) is None:
                config["model"] = None
            if config.get("field", None) is None:
                config["field"] = Elastic.FIELD_CATCHALL
            if config.get("fields", None) is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            if config.get("field_weights", None) is None:
                config["field_weights"] = {Elastic.FIELD_CATCHALL: 1}
            if config["model"] in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        print("\tFirst pass scoring... ")
        # todo: add support for other similarities, e.g., a boosted multi-field query:
        # body = {"query": {"bool": {"should": [
        #     {"match": {"catchall": {"query": analyzed_query}}},
        #     {"match": {"names": {"query": analyzed_query, "boost": 3}}}]}}}
        # self.__elastic.update_similarity(self.__first_pass_model, self.__first_pass_model_params)
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field, num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        # res1 = self.__elastic.search_complex(body=body, num=self.__first_pass_num_docs,
        #                                      fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first-pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        print("\tSecond pass scoring... ")
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id),
                            "fields": res1[doc_id].get("fields", {})}
        print("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)
        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model is None:
            return res1
        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))
        out = open(self.__output_file, "w")  # (re)initializes the output file
        # Retrieves documents for each query
        for query_id in sorted(queries):
            print("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        print("Output file:", self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format."""
        out_str = ""
        rank = 1
        # Result values are dicts holding a "score" key (see _second_pass_scoring)
        for doc_id, res in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" + str(res["score"]) \
                       + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
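# Minimal usage sketch for Retrieval: "toy_index" and the file paths are
# placeholders, not names from the source; the keys and values mirror the
# defaults documented in the class docstring above.
example_config = {
    "index_name": "toy_index",          # placeholder index name
    "first_pass": {"num_docs": 1000},
    "model": "lm",
    "smoothing_method": "dirichlet",
    "smoothing_param": 2000,
    "query_file": "data/queries.json",  # placeholder path
    "output_file": "output/lm.run",     # placeholder path
    "run_id": "lm",
}
# Retrieval(example_config).batch_retrieval()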
import json
import sys

# Elastic, ElasticCache, Scorer, ScorerLM, and PLOGGER are assumed to come
# from the enclosing package.


class Retrieval(object):
    """Loads config file, checks params, and sets default values."""
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["1st_num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)
        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")
            # Checks first-pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("1st_num_docs", None) is None:
                config["first_pass"]["1st_num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25
            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100
            if config.get("model", None) in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
            # Sets per-model defaults for the "fields" parameter
            if config.get("model", None) == "lm":
                if config.get("fields", None) is None:
                    config["fields"] = Elastic.FIELD_CATCHALL
            if config.get("model", None) == "mlm":
                if config.get("fields", None) is None:
                    config["fields"] = {"similar_entity_names": 0.2, "catchall": 0.8}
            if config.get("model", None) == "prms":
                if config.get("fields", None) is None:
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            PLOGGER.error("Error in config file: " + str(e))
            sys.exit(1)

    def __get_fields(self):
        """Returns the names of all fields that will be used in the retrieval model."""
        fields = []
        if isinstance(self.__config["fields"], str):
            fields.append(self.__config["fields"])
        elif isinstance(self.__config["fields"], dict):
            fields = list(self.__config["fields"].keys())
        else:
            fields = self.__config["fields"]
        return fields

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tFirst pass scoring... ")
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field, num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first-pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tSecond pass scoring... ")
        # Bulk-fetches term vectors for the candidate documents (presumably to
        # warm the ElasticCache before per-document scoring)
        for field in self.__get_fields():
            self.__elastic.multi_termvector(list(res1.keys()), field)
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id),
                            "fields": res1[doc_id].get("fields", {})}
        PLOGGER.debug("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)
        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model == "bm25":
            return res1
        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))
        out = open(self.__output_file, "w")  # (re)initializes the output file
        # Retrieves documents for each query
        for query_id in sorted(queries):
            PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        PLOGGER.info("Output file:" + self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format."""
        out_str = ""
        rank = 1
        for doc_id, res in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" + str(res["score"]) \
                       + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str