def __entity_centric(self, query):
    """Entity-centric TTI.

    :param query: query string
    :type query: str
    """
    types = dict()  # to be returned

    # Set the configurations
    model = self.__config.get("model", TTI_MODEL_BM25)
    ec_cutoff = self.__config.get("ec_cutoff", DEFAULT_TTI_EC_K_CUTOFF)
    self.__ec_retr_config = dict()
    for param in ["smoothing_method", "smoothing_param"]:
        if self.__config.get(param, None) is not None:
            self.__ec_retr_config[param] = self.__config.get(param)

    # Perform EC TTI using late fusion support
    late_fusion_scorer = LateFusionScorer(
        self.__config["index"], model, self.__ec_retr_config,
        num_docs=ec_cutoff, field="catchall",
        run_id=self.__config["run_id"],
        num_objs=self.__config["num_docs"])
    ret_res = late_fusion_scorer.score_query(
        query, assoc_fun=self.__entity_centric_mapper)
    for doc_id, score in ret_res.get_scores_sorted():
        types[doc_id] = {"score": score}
    PLOGGER.info("done")

    return types
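
# The entity-centric method above delegates the actual type-score aggregation
# to LateFusionScorer. The standalone sketch below only illustrates the
# underlying idea: retrieve top-k entities for the query, map each entity to
# its types, and sum the scores per type. The names `retrieved_entities` and
# `entity_types` are hypothetical stand-ins, not part of the nordlys API.
from collections import defaultdict


def entity_centric_type_scores(retrieved_entities, entity_types):
    """Aggregates entity retrieval scores into type scores (illustrative).

    :param retrieved_entities: list of (entity_id, score) pairs
    :param entity_types: dict mapping entity_id -> list of type IDs
    :return: dict mapping type ID -> aggregated score
    """
    type_scores = defaultdict(float)
    for entity_id, score in retrieved_entities:
        for type_id in entity_types.get(entity_id, []):
            type_scores[type_id] += score  # late fusion by score summation
    return dict(type_scores)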
def to_str(self, file_name=None):
    """Converts instances to string and writes them to the given file.

    :param file_name: output file (if None, only the string is returned)
    :return: string format of instances
    """
    out_file = None
    if file_name is not None:
        open(file_name, "w").close()  # cleans previous contents
        out_file = open(file_name, "a")
    counter = 0
    out = ""
    for ins in self.get_all():
        out += ins.to_str() + "\n"
        counter += 1
        # append instances to the file in chunks of 1000
        if (counter % 1000) == 0:
            # print "Converting is done until instance " + str(ins.id)
            if out_file is not None:
                out_file.write(out)
                out = ""
    if out_file is not None:
        out_file.write(out)
        PLOGGER.info("String output:\t" + file_name)
        return None
    return out
def batch_linking(self):
    """Scores queries in a batch and outputs results."""
    results = {}

    if self.__config["step"] == "linking":
        queries = json.load(open(self.__query_file))
        for qid in sorted(queries):
            results[qid] = self.link(queries[qid], qid)
        json.dump(results, open(self.__output_file, "w"), indent=4,
                  sort_keys=True)

    # only ranking step
    if self.__config["step"] == "ranking":
        queries = json.load(open(self.__query_file))
        for qid in sorted(queries):
            linker = self.__get_linker(Query(queries[qid], qid))
            results[qid] = linker.rank_ens()
        ranked_inss = Instances(
            sum([inss.get_all() for inss in results.values()], []))
        ranked_inss.to_json(self.__output_file)

    # only disambiguation step
    if self.__config["step"] == "disambiguation":
        inss = Instances.from_json(self.__config["test_set"])
        inss_by_query = inss.group_by_property("qid")
        for qid, q_inss in sorted(inss_by_query.items()):
            linker = self.__get_linker("")
            results[qid] = linker.disambiguate(Instances(q_inss))
        to_elq_eval(results, self.__output_file)

    PLOGGER.info("Output file: " + self.__output_file)
def __type_centric(self, query):
    """Type-centric TTI.

    :param query: query string
    :type query: str
    """
    types = dict()
    model = self.__config.get("model", TTI_MODEL_BM25)
    elastic = ElasticCache(
        self.__tc_config.get("index", DEFAULT_TTI_TC_INDEX))

    if model == TTI_MODEL_BM25:
        PLOGGER.info("TTI, TC, BM25")
        self.__tc_config["model"] = "bm25"
        # scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
        types = Retrieval(self.__tc_config).retrieve(query)

    elif model == TTI_MODEL_LM:
        PLOGGER.debug("TTI, TC, LM")
        self.__tc_config["model"] = "lm"  # Needed for 2nd-pass
        self.__tc_config["field"] = "content"  # Needed for 2nd-pass
        self.__tc_config["second_pass"] = {"field": "content"}
        for param in ["smoothing_method", "smoothing_param"]:
            if self.__config.get(param, None) is not None:
                self.__tc_config["second_pass"][param] = \
                    self.__config.get(param)
        scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
        types = Retrieval(self.__tc_config).retrieve(query, scorer)

    PLOGGER.info(types)
    return types
def __make_type_doc(self, type_name):
    """Gets the document representation of a type to be indexed, from its
    entity short abstracts."""
    content = "\n".join([
        self.__entity_abstracts.get(e, b"").decode("utf-8")
        for e in self.__types_entities[type_name]
    ])

    if len(content) > MAX_BULKING_DOC_SIZE:
        PLOGGER.info("Type {} has content larger than allowed: {}.".format(
            type_name, len(content)))

        # we randomly sample a subset of Y entity abstracts, s.t.
        # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
        num_entities = len(self.__types_entities[type_name])
        amount_abstracts_to_sample = min(
            floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
            num_entities)
        entities_sample = [
            self.__types_entities[type_name][i]
            for i in sample(range(num_entities), amount_abstracts_to_sample)
        ]
        content = ""  # reset content
        for entity in entities_sample:
            new_content_candidate = "\n".join([
                content,
                self.__entity_abstracts.get(entity, b"").decode("utf-8")
            ])
            # we add an abstract only if by doing so it will not exceed
            # MAX_BULKING_DOC_SIZE
            if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                break
            content = new_content_candidate

    return {"content": content}
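
# To make the size-capped sampling in __make_type_doc easier to follow, here
# is a minimal, dependency-free sketch of the same idea: visit abstracts in a
# random order and keep concatenating them until adding one more would exceed
# a size budget. The constant and names below are illustrative only, not the
# ones used by the indexer.
from random import sample

MAX_DOC_SIZE = 1000  # illustrative cap, in characters


def sample_abstracts(abstracts, max_size=MAX_DOC_SIZE):
    """Concatenates randomly sampled abstracts up to a size limit."""
    content = ""
    for i in sample(range(len(abstracts)), len(abstracts)):
        candidate = "\n".join([content, abstracts[i]])
        if len(candidate) > max_size:  # stop before exceeding the budget
            break
        content = candidate
    return content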
def batch_identification(self):
    """Annotates, in a batch, queries with identified target types, and
    outputs results."""
    queries = json.load(FileUtils.open_file_by_type(self.__query_file))

    f_trec_out = None
    if "trec_output_file" in self.__config:  # for TREC-formatted output
        f_trec_out = FileUtils.open_file_by_type(
            self.__config["trec_output_file"], mode="w")

    results = dict()
    for query_id in sorted(queries):
        PLOGGER.info("Identifying target types for [{}] {}".format(
            query_id, queries[query_id]))
        results[query_id] = self.identify(queries[query_id])

        # Output resulting scores in TREC format if required
        if f_trec_out:
            type_to_score = dict()
            for d in results.get(query_id, {}).get("results", {}).values():
                type_to_score[d["type"]] = d["score"]
            ret_res = RetrievalResults(type_to_score)
            ret_res.write_trec_format(query_id, self.__config["run_id"],
                                      f_trec_out,
                                      max_rank=self.__config["num_docs"])

    json.dump(results,
              FileUtils.open_file_by_type(self.__output_file, mode="w"),
              indent=4, sort_keys=True)
    PLOGGER.info("Output file: {}".format(self.__output_file))

    if f_trec_out:
        f_trec_out.close()
def __sample_file(self, dir, file):
    """Creates a local sample from a specific file in a given directory.

    :param dir: directory (relative to path_to_dbpedia)
    :param file: file name
    """
    t = Triple()
    p = NTriplesParser(t)
    infile = os.path.join(self.path_to_dbpedia, dir, file)
    outfile = os.path.join(self.output_dir, dir, file)
    PLOGGER.info("Processing file " + file + " ...")
    i = 0
    with FileUtils.open_file_by_type(infile) as fin:
        # output file will be of the same type as the input
        fout = FileUtils.open_file_by_type(outfile, mode="w")
        for line in fin:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue
            subj = self.prefix.get_prefixed(t.subject())  # prefixing subject
            if subj in self.sample_entities:
                fout.write(line)
            i += 1
            if i % 100000 == 0:
                PLOGGER.info(str(i // 1000) + "K lines processed")
        fout.close()
def to_treceval(self, file_name, qid_prop="qid", docid_prop="en_id"):
    """Generates a TREC-style run file.

    If an entity is ranked more than once for the same query, the one with
    the higher score is kept.

    :param file_name: file to write the TREC run to
    :param qid_prop: name of instance property to be used as query ID (1st column)
    :param docid_prop: name of instance property to be used as document ID (3rd column)
    """
    unique_entries = defaultdict(dict)
    # sort and rank entities
    for ins in self.get_all():
        if ins.score is not None:
            qid = ins.get_property(qid_prop)
            doc_id = ins.get_property(docid_prop)
            score = unique_entries.get(qid, {}).get(doc_id, None)
            if (score is None) or (score < ins.score):
                unique_entries[qid][doc_id] = ins.score

    out_str = ""
    for qid, docs in sorted(unique_entries.items()):
        rank = 1
        for doc_id, score in sorted(docs.items(), key=lambda x: x[1],
                                    reverse=True):
            out_str += (qid + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" +
                        "{0:.5f}".format(score) + "\tnordlys\n")
            rank += 1
    open(file_name, "w").write(out_str)
    PLOGGER.info("Trec-eval output:\t" + file_name)
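
# For reference, each line that to_treceval writes follows the standard
# six-column TREC run format: query ID, "Q0", document ID, rank, score, and
# run ID. A minimal sketch of building one such line, with made-up IDs and a
# made-up score:
qid, doc_id, rank, score = "q1", "<dbpedia:Audi_A4>", 1, 0.85213
line = (qid + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" +
        "{0:.5f}".format(score) + "\tnordlys\n")
# -> "q1\tQ0\t<dbpedia:Audi_A4>\t1\t0.85213\tnordlys\n"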
def __add_file(self, tsv_filename):
    """Adds name variants from an FACC tsv file."""
    PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
    infile = open(tsv_filename, "r")
    for line in infile:
        f = line.rstrip().split("\t")
        self.__add_surface_form(f[0], f[1], int(f[2]))
    infile.close()
def load_kb_snapshot(kb_file):
    """Loads DBpedia snapshot of proper name entities (used for entity
    linking)."""
    if config.KB_SNAPSHOT is None:
        PLOGGER.info("Loading KB snapshot of proper named entities ...")
        kb_snapshot = set()
        with open(kb_file, "r") as f:
            for line in f:
                kb_snapshot.add(line.strip())
        config.KB_SNAPSHOT = kb_snapshot
def build(self):
    """Builds surface form collection from FACC annotations."""
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()
    for path, dirs, files in os.walk(self.__path):
        for fn in files:
            if fn.endswith(".tsv"):
                self.__add_file(os.path.join(path, fn))
    PLOGGER.info("Collection " + self.__collection + " is built.")
def main(args):
    run = TrecRun(args.run_file)
    if args.operation == "stat":
        run.print_stat()
    elif args.operation == "filter":
        if len(args.doc_ids_file) == 0 or len(args.output_file) == 0:
            PLOGGER.info("doc_ids_file or output_file missing")
        else:
            run.filter(args.doc_ids_file, args.output_file)
def main(args):
    config = FileUtils.load_config(args.config)
    type2entity_file = os.path.expanduser(
        config.get("type2entity_file", ""))
    entity_abstracts_file = os.path.expanduser(
        config.get("entity_abstracts_file", ""))
    if (not os.path.isfile(type2entity_file)) or (
            not os.path.isfile(entity_abstracts_file)):
        exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
    PLOGGER.info("Index build: <{}>".format(indexer.name))
def to_elq_eval(annotations, output_file):
    """Writes entity annotations in the ELQ evaluation format.

    :param annotations: {qid: [{"mention": xx, "entity": yy, "score": zz}, ...], ...}
    :param output_file: file to write the annotations to
    """
    out_str = ""
    for qid, q_annots in sorted(annotations.items()):
        for annot in q_annots:
            out_str += qid + "\t1\t" + annot["entity"] + "\n"
    open(output_file, "w").write(out_str)
    PLOGGER.info("ELQ evaluation file: " + output_file)
def get_top_term(self, en, n):
    """Returns the top-n fields with the highest document frequency for the
    given entity ID."""
    doc_freq = {}
    if self.DEBUG:
        PLOGGER.info("Entity:[" + en + "]")
    for field in self.fields:
        df = self.elastic.doc_freq(en, field)
        if df > 0:
            doc_freq[field] = df
    top_fields = self.__get_top_n(doc_freq, n)
    return top_fields
def output(self, instances):
    """Writes results to the output file.

    :param instances: Instances object
    """
    with open(self.__config["output_file"], "w") as f:
        f.write("id\tscore\n")  # output to file
        PLOGGER.info("id\ttarget\tscore\n")
        for ins in instances.get_all():
            f.write(ins.id + "\t" + "{0:.5f}".format(ins.score) + "\n")
    PLOGGER.info("Output saved in: " + self.__config["output_file"])
def build(self):
    """Builds word2vec collection from the GoogleNews 300-dim pre-trained
    corpus."""
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()

    infile = FileUtils.open_file_by_type(self.__w2v_fname)
    i = 0
    for line in infile:
        term, vector = self.__parse_line(line)
        self.__mongo.add(term, {'vector': vector})
        i += 1
        if i % 1000 == 0:
            PLOGGER.info(str(i // 1000) + "K lines are loaded.")
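
# __parse_line is not shown above. The sketch below assumes the word2vec file
# is in the plain-text format "term v1 v2 ... v300" (one term per line), which
# is what iterating over lines suggests; this is an assumption, not the actual
# nordlys implementation.
def parse_w2v_line(line):
    """Splits a text-format word2vec line into (term, vector) -- illustrative."""
    if isinstance(line, bytes):
        line = line.decode("utf-8")
    parts = line.rstrip().split(" ")
    term = parts[0]
    vector = [float(v) for v in parts[1:]]
    return term, vector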
def build_collection(self, mappings):
    """Builds Mongo collection."""
    mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    mongo.drop()

    predicate = "!<owl:sameAs>"
    i = 0
    for fb_id, dbp_ids in mappings.items():
        for dbp_id in dbp_ids:
            mongo.append_set(fb_id, predicate, [dbp_id])
        i += 1
        if i % 1000 == 0:
            PLOGGER.info(str(i // 1000) + "K entities are added!")
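
# As the loop above suggests, the mappings argument maps each Freebase ID to
# one or more DBpedia IDs, stored under the inverse owl:sameAs predicate. A
# made-up example of its shape (the IDs are purely illustrative):
example_mappings = {
    "<fb:m.0abc12>": ["<dbpedia:Example_Entity>"],
    "<fb:m.0def34>": ["<dbpedia:Another_Entity>",
                      "<dbpedia:Another_Entity_(film)>"],
}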
def main(args):
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")

    # Check DBpedia files
    PLOGGER.info("Checking needed DBpedia files under {}".format(dbpedia_path))
    for fname in [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES:
        if os.path.isfile(os.sep.join([dbpedia_path, fname])):
            PLOGGER.info(" - {}: OK".format(fname))
        else:
            PLOGGER.error(" - {}: Missing".format(fname))
            exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
def build_collection(self):
    """Adds all name variants from DBpedia."""
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()

    # iterate through all DBpedia entities
    i = 0
    for mdoc in self.__mongo_dbpedia.find_all():
        entity = EntityUtils(Mongo.unescape_doc(mdoc))

        # skip entities without a name
        if not entity.has_name():
            continue
        surface_form = entity.get_name()

        # the entity is a redirect page
        if entity.is_redirect():
            entity_id = entity.get_predicate(
                EntityUtils.PREDICATE_REDIRECT)[0]
            self.__add_surface_form(surface_form,
                                    EntityUtils.PREDICATE_REDIRECT,
                                    entity_id)

        # the entity is a disambiguation page
        if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
            entity_ids = entity.get_predicate(
                EntityUtils.PREDICATE_DISAMBIGUATE)
            for entity_id in entity_ids:
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_DISAMBIGUATE,
                                        entity_id)

        # the entity is not a redirect/disambiguation page and has a name
        # and an abstract
        if entity.is_entity():
            entity_id = entity.get_id()
            # adds the entity name
            self.__add_surface_form(surface_form,
                                    EntityUtils.PREDICATE_NAME,
                                    entity_id)
            # adds other entity names
            foaf_name_predicate = "<foaf:name>"
            if entity.has_predicate(foaf_name_predicate):
                for surface_form in entity.get_predicate(
                        foaf_name_predicate):
                    self.__add_surface_form(surface_form,
                                            foaf_name_predicate,
                                            entity_id)

        i += 1
        if i % 1000 == 0:
            PLOGGER.info(str(i // 1000) + "K entities processed")
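
# __add_surface_form is not shown above. Purely as an illustration of the data
# this collection accumulates, the sketch below keeps an in-memory mapping of
# surface form -> predicate -> set of entity IDs. The Mongo-backed
# implementation (and any normalization such as lowercasing) may differ; this
# is only a stand-in.
from collections import defaultdict

surface_forms = defaultdict(lambda: defaultdict(set))


def add_surface_form(surface_form, predicate, entity_id):
    """Records that entity_id can be referred to by surface_form."""
    surface_forms[surface_form.lower()][predicate].add(entity_id)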
def to_json(self, json_file=None):
    """Converts all instances to JSON and writes it to the file.

    :param json_file: (string)
    :return: JSON dump of all instances
    """
    inss_json = {}
    for ins in self.get_all():
        inss_json.update(ins.to_json())
    if json_file is not None:
        # print "Writing JSON format of instances ..."
        out = open(json_file, "w")
        json.dump(inss_json, out, indent=4, sort_keys=True)
        PLOGGER.info("JSON output:\t" + json_file)
    return inss_json
def main(args):
    config = FileUtils.load_config(args.config)
    if "_uri" not in config["index_name"]:
        PLOGGER.error("index name might not be correct, please check again!")
        exit(0)

    if "fields_file" not in config:
        fields_count = compute_field_counts()
    else:
        fields_count = json.load(open(config["fields_file"]))

    indexer = IndexerDBpediaURI(config, fields_count)
    indexer.build()
    PLOGGER.info("Index build: " + config["index_name"])
def from_json(cls, json_file):
    """Loads instances from a JSON file.

    :param json_file: (string)
    :return: Instances object
    """
    PLOGGER.info("Reading JSON file " + json_file + " ...")
    json_data = open(json_file)
    data = json.load(json_data)

    instance_list = []
    # read instances
    for ins_id, fields in data.items():
        instance = Instance.from_json(ins_id, fields)
        instance_list.append(instance)
    return cls(instance_list)
def to_elq_eval(annotations, output_file):
    """Writes entity annotations in the ELQ evaluation format.

    :param annotations: {qid: {"results": [{"mention": xx, "entity": yy, "score": zz}, ...], ...}, ...}
    :param output_file: file to write the annotations to
    """
    uniq_annots = set()
    out_str = ""
    for qid, q_annots in sorted(annotations.items()):
        for annot in q_annots["results"]:
            if (qid, annot["entity"]) not in uniq_annots:
                out_str += (qid + "\t" + str(annot["score"]) + "\t" +
                            annot["entity"] + "\n")
                uniq_annots.add((qid, annot["entity"]))
    open(output_file, "w").write(out_str)
    PLOGGER.info("ELQ evaluation file: " + output_file)
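
# For reference, to_elq_eval writes one tab-separated line per unique
# (query, entity) pair: query ID, score, entity ID. A made-up example of two
# such output lines (the IDs and scores are illustrative only):
example_elq_lines = (
    "q-42\t2.315\t<dbpedia:Example_Entity>\n"
    "q-42\t1.042\t<dbpedia:Another_Entity>\n"
)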
def batch_retrieval(self):
    """Scores queries in a batch and outputs results."""
    queries = json.load(open(self.__query_file))

    # init output file
    open(self.__output_file, "w").write("")
    out = open(self.__output_file, "w")

    # retrieve documents for each query
    for query_id in sorted(queries):
        PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
        results = self.retrieve(queries[query_id])
        out.write(self.trec_format(results, query_id, self.__num_docs))
    out.close()
    PLOGGER.info("Output file: " + self.__output_file)
def main(args):
    qrels = TrecQrels(args.qrels_file)
    if args.operation == CHOICE_STAT:
        qrels.print_stat()
    elif args.operation == CHOICE_FILTER_DOCS:
        if len(args.doc_ids_file) == 0 or len(args.output_file) == 0:
            PLOGGER.info("doc_ids_file or output_file missing")
        else:
            qrels.filter_by_doc_ids(args.doc_ids_file, args.output_file)
    elif args.operation == CHOICE_FILTER_QS:
        if len(args.query_ids_file) == 0 or len(args.output_file) == 0:
            PLOGGER.info("query_ids_file or output_file missing")
        else:
            qrels.filter_by_query_ids(args.query_ids_file, args.output_file)
def __sample_dir(self, dir, ext):
    """Creates a local sample from a specific directory.

    :param dir: directory (relative to path_to_dbpedia)
    :param ext: file extensions considered
    """
    PLOGGER.info("Processing directory " + dir + " ...")
    # make sure the dir exists under the output directory
    outdir = os.path.join(self.output_dir, dir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # make a local sample of each file from that directory with the given
    # extension
    for root, dirs, files in os.walk(os.path.join(self.path_to_dbpedia, dir)):
        PLOGGER.info(root)
        for file in files:
            if file.endswith(ext):
                self.__sample_file(dir, file)
def main(args):
    inss = Instances()
    # we assume that the 1st column is always the ins_id (unique);
    # the list specifies which property or feature each column value should
    # be loaded to; columns with None are ignored

    # one file with properties
    inss.add_properties_from_tsv(args[0], ["sequence"])
    # one or more files with features
    inss.add_features_from_tsv(args[1], [
        "sentence_length", "article_length", "sentence_order",
        "predicate_tense"
    ])
    # inss.add_features_from_tsv(feat_file_2, ["feature4"])
    # inss.add_features_from_tsv(feat_file_3, ["feature5", "feature6"])
    # one file with the target value
    inss.add_target_from_tsv(args[2])

    PLOGGER.info(inss.to_str())
    inss.to_json("data/maff.json")
def print_doc(doc):
    """Prints a MongoDB document to the log, one line per predicate."""
    PLOGGER.info("_id: " + doc[Mongo.ID_FIELD])
    for key, value in doc.items():
        if key == Mongo.ID_FIELD:
            continue  # ignore the id key
        if type(value) is list:
            PLOGGER.info(key + ":")
            for v in value:
                PLOGGER.info("\t" + str(v))
        else:
            PLOGGER.info(key + ": " + str(value))
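
# A minimal usage sketch for print_doc, using a hand-made document shaped like
# the MongoDB entries above (the field names and values are illustrative):
example_doc = {
    "_id": "<dbpedia:Example_Entity>",
    "<rdfs:label>": "Example Entity",
    "<dbo:wikiPageWikiLink>": ["<dbpedia:Another_Entity>"],
}
# print_doc(example_doc) logs the _id first, then one line per scalar
# predicate and an indented line per list value.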
def link(self, query, qid=""):
    """Performs entity linking for the query.

    :param query: query string
    :param qid: query ID
    :return: annotated query
    """
    PLOGGER.info("Linking query " + qid + " [" + query + "]")
    q = Query(query, qid)
    linker = self.__get_linker(q)
    if self.__config["step"] == "ranking":
        res = linker.rank_ens()
    else:
        linked_ens = linker.link()
        res = {
            "query": q.raw_query,
            "processed_query": q.query,
            "results": linked_ens
        }
    return res