def __sample_file(self, dir, file):
    """Creates a local sample from a specific file in a given directory.

    :param dir: directory (relative to path_to_dbpedia)
    :param file: file name
    """
    t = Triple()
    p = NTriplesParser(t)
    infile = os.path.join(self.path_to_dbpedia, dir, file)
    outfile = os.path.join(self.output_dir, dir, file)
    print("Processing file " + file + " ...")
    i = 0
    with FileUtils.open_file_by_type(infile) as fin:
        # output file will be of the same type as the input
        fout = FileUtils.open_file_by_type(outfile, mode="w")
        for line in fin:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue
            subj = self.prefix.get_prefixed(t.subject())  # prefixing subject
            if subj in self.sample_entities:
                fout.write(line)
            i += 1
            if i % 100000 == 0:
                print(str(i // 1000) + "K lines processed")
        fout.close()
def batch_identification(self):
    """Annotates, in a batch, queries with identified target types, and outputs results."""
    queries = json.load(FileUtils.open_file_by_type(self.__query_file))

    f_trec_out = None
    if "trec_output_file" in self.__config:  # for TREC-formatted output
        f_trec_out = FileUtils.open_file_by_type(self.__config["trec_output_file"], mode="w")

    results = dict()
    for query_id in sorted(queries):
        PLOGGER.info("Identifying target types for [{}] {}".format(query_id, queries[query_id]))
        results[query_id] = self.identify(queries[query_id])

        # Output resulting scores in TREC format if required
        if f_trec_out:
            type_to_score = dict()
            for d in results.get(query_id, {}).get("results", {}).values():
                type_to_score[d["type"]] = d["score"]
            ret_res = RetrievalResults(type_to_score)
            ret_res.write_trec_format(query_id, self.__config["run_id"], f_trec_out,
                                      max_rank=self.__config["num_docs"])

    json.dump(results, FileUtils.open_file_by_type(self.__output_file, mode="w"),
              indent=4, sort_keys=True)
    PLOGGER.info("Output file: {}".format(self.__output_file))

    if f_trec_out:
        f_trec_out.close()
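# Note on the TREC output above: write_trec_format() belongs to RetrievalResults; assuming it
# emits the conventional six-column run format ("<query_id> Q0 <item> <rank> <score> <run_id>"),
# a minimal, self-contained stand-in for what gets written per query would be:
def _write_trec_lines_sketch(f_out, query_id, run_id, type_to_score, max_rank=10):
    """Hypothetical helper, for illustration only: ranks types by score and writes run lines."""
    ranked = sorted(type_to_score.items(), key=lambda ts: ts[1], reverse=True)[:max_rank]
    for rank, (type_id, score) in enumerate(ranked, start=1):
        f_out.write("{} Q0 {} {} {} {}\n".format(query_id, type_id, rank, score, run_id))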
def __init__(self):
    self.persons_ids = {}
    self.ids_persons = {}
    for line in FileUtils.read_file_as_list(PERSONS_IDS_F):
        person, id = line.split("\t", maxsplit=1)
        self.persons_ids[person] = id
        self.ids_persons[id] = person

    self.professions_ids = {}
    self.ids_professions = {}
    for line in FileUtils.read_file_as_list(PROFESSIONS_IDS_F):
        prof, id = line.split("\t", maxsplit=1)
        self.professions_ids[prof] = id
        self.ids_professions[id] = prof

    self.nationalities_ids = {}
    self.ids_nationalities = {}
    for line in FileUtils.read_file_as_list(NATIONALITIES_IDS_F):
        nation, id = line.split("\t", maxsplit=1)
        self.nationalities_ids[nation] = id
        self.ids_nationalities[id] = nation

    self.nationalities_countries = {}
    self.countries_nationalities = {}
    for line in FileUtils.read_file_as_list(COUNTRIES_NATIONALITIES_F):
        country, nation = line.split("\t", maxsplit=1)
        self.nationalities_countries[nation] = country
        self.countries_nationalities[country] = nation
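# The *_IDS_F and COUNTRIES_NATIONALITIES_F inputs are assumed to be two-column TSV files
# ("<name>\t<id>" and "<country>\t<nationality>", one pair per line); both directions are
# kept in memory so that lookups by name and by ID are O(1).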
def load_nordlys_config(file_name):
    """Loads a nordlys config file. If a local config file is provided, the global one is ignored."""
    config_path = os.sep.join([BASE_DIR, "config"])
    local_config = os.sep.join([config_path, "local", file_name])
    if os.path.exists(local_config):
        return FileUtils.load_config(local_config)
    else:
        return FileUtils.load_config(os.sep.join([config_path, file_name]))
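# Resolution order sketch (comment only; BASE_DIR comes from the surrounding module, and
# "elastic.json" is just an illustrative file name):
#
#   load_nordlys_config("elastic.json")
#     1. <BASE_DIR>/config/local/elastic.json   -- used if it exists
#     2. <BASE_DIR>/config/elastic.json         -- fallback otherwise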
def generate_features(self, kb_file, output_file1, output_file2):
    """Generates freq-person-nationality features."""
    fout1 = open(output_file1, "w")
    fout2 = open(output_file2, "w")

    # write tsv header
    header = ["person", "nationality", "freq_person_nationality_noun"]
    fout1.write("\t".join(header) + "\n")
    header = ["person", "nationality", "freq_person_nationality_adj"]
    fout2.write("\t".join(header) + "\n")

    with FileUtils.open_file_by_type(kb_file) as kb_f:
        line_count = 1
        for line in kb_f:
            print(line_count)
            line_count += 1
            person_id, nat_id, noun, adj = line.strip().split("\t")
            values_noun = [person_id, nat_id]
            values_adj = [person_id, nat_id]
            nats = [noun, adj]
            fpn_noun, fpn_adj = self.get_per_nat_tf(person_id, nats)
            values_noun.append(str(fpn_noun))
            values_adj.append(str(fpn_adj))
            fout1.write("\t".join(values_noun) + "\n")
            fout2.write("\t".join(values_adj) + "\n")

    fout1.close()
    fout2.close()
def __load_entity_abstracts(self, filename):
    prefix = URIPrefix()
    t = Triple()
    p = NTriplesParser(t)
    lines_counter = 0
    PLOGGER.info("Loading entity abstracts from {}".format(filename))
    for line in FileUtils.read_file_as_list(filename):
        # basic line parsing
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            p.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            continue
        if t.subject() is None:  # only if parsed as a triple
            continue

        # Subject and object identification
        subj = prefix.get_prefixed(t.subject())
        obj = ""
        if type(t.object()) is URIRef:
            # PLOGGER.error("Error: it is URIRef the parsed obj")
            pass
        else:
            obj = t.object().encode("utf-8")
        if len(obj) == 0:
            continue  # skip empty objects

        self.__entity_abstracts[subj] = obj

        lines_counter += 1
        if lines_counter % 10000 == 0:
            PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

    PLOGGER.info("\n### Loading entity abstracts... Done.")
def __load_items_stats(self, items_tfidf_fpath):
    """Loads pre-computed tf-idf stats for items.

    :param items_tfidf_fpath: path to the TSV file with the items' tf-idf stats.
    :return: dict from item to {term: weight}.
    """
    item_term_weights = {}
    for line in FileUtils.read_file_as_list(items_tfidf_fpath):
        item, term, _, _, weight = line.split("\t", maxsplit=4)
        if len(term) < 4:  # avoid short terms
            continue

        # some cleanings
        if term.startswith("_"):
            term = term[1:]
        if term.endswith("_"):
            term = term[:-1]
        if term.startswith(WP_PREFIX):  # it's a person name
            term = term.split(WP_PREFIX)[-1]  # remove prefix
            # capitalize every word
            term = "_".join([word[0].upper() + word[1:] for word in term.split("_")])

        item_d = item_term_weights.get(item, {})
        item_d[term] = float(weight)
        item_term_weights[item] = item_d

    return item_term_weights
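# Worked example of the name cleaning above, assuming WP_PREFIX is a plain string marker such
# as "wp_" (the real constant is defined elsewhere in the module):
#
#   >>> term = "wp_marie_curie"
#   >>> term = term.split("wp_")[-1]                                  # 'marie_curie'
#   >>> "_".join(w[0].upper() + w[1:] for w in term.split("_"))
#   'Marie_Curie'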
def main(args):
    s_t = time.time()  # start time

    config = FileUtils.load_config(args.config) if args.config != "" else get_config()
    r = Retrieval(config)
    r.batch_retrieval()

    e_t = time.time()  # end time
    print("Execution time(min):\t" + str((e_t - s_t) / 60) + "\n")
def main(args):
    config = FileUtils.load_config(args.config)
    el = EL(config, Entity())

    if args.query:
        res = el.link(args.query)
        pprint(res)
    else:
        el.batch_linking()
def main(args):
    config = FileUtils.load_config(args.config)
    tti = TTI(config)

    if args.query:
        res = tti.identify(args.query)
        pprint(res)
    else:
        tti.batch_identification()
def main(args):
    config = FileUtils.load_config(args.config)
    er = ER(config, ElasticCache(DBPEDIA_INDEX))

    if args.query:
        res = er.retrieve(args.query)
        pprint(res)
    else:
        er.batch_retrieval()
def main(args):
    config = FileUtils.load_config(args.config)
    er = ER(config)

    if args.query:
        res = er.retrieve(args.query)
        pprint(res)
    else:
        er.batch_retrieval()
def make_persons_fb_ids(persons_fpath):
    """Our person ID is the Freebase ID where the prefix 'm.' is replaced with 'fb_'.

    :param persons_fpath: 'persons'-named file path.
    """
    with open(PERSONS_IDS_F, "w") as f_out:
        for line in FileUtils.read_file_as_list(persons_fpath):
            person, raw_fb_id = line.split("\t", maxsplit=1)
            fb_id = "fb_" + raw_fb_id[2:]
            f_out.write("{}\t{}\n".format(person, fb_id))
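# Worked example of the ID rewrite above, with a made-up Freebase MID just to show the slicing:
#
#   >>> raw_fb_id = "m.012345"
#   >>> "fb_" + raw_fb_id[2:]
#   'fb_012345'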
def make_professions_kb_translation(dest_translation_fpath, persons_ids_fpath,
                                    professions_ids_fpath, person_items_fpath):
    """A person ID -to- item ID translation schema is convenient.

    :param dest_translation_fpath: destination file path of IDs translation.
    :param persons_ids_fpath: person IDs file path.
    :param professions_ids_fpath: profession IDs file path.
    :param person_items_fpath: path to the file with person items (a '.kb'-extension file).
    """
    persons_ids = {}
    for line in FileUtils.read_file_as_list(persons_ids_fpath):
        person, id = line.split("\t", maxsplit=1)
        persons_ids[person] = id

    professions_ids = {}
    for line in FileUtils.read_file_as_list(professions_ids_fpath):
        prof, id = line.split("\t", maxsplit=1)
        professions_ids[prof] = id

    person_items = {}
    for line in FileUtils.read_file_as_list(person_items_fpath):
        person, item = line.split("\t", maxsplit=1)
        # setdefault keeps the list in the dict; a plain get() would drop the appended items
        person_items.setdefault(person, []).append(item)

    translations = {}
    for person, items in person_items.items():
        if person not in persons_ids:
            continue
        person_id = persons_ids[person]
        items_ids = []
        for item in items:
            if item not in professions_ids:
                continue
            items_ids.append(professions_ids[item])
        translations[person_id] = items_ids

    with open(dest_translation_fpath, "w") as f_out:
        for person_id, items_ids in translations.items():
            for item_id in items_ids:
                f_out.write("{}\t{}\n".format(person_id, item_id))
def generate_features(self, kb_file, output_file):
    """Generates features related to term statistics."""
    fout = open(output_file, "w")

    # write tsv header
    header = ["person", "profession"]
    for k in self.K_VALUES:
        header.append("sumProfTerms_" + str(k))
        header.append("simCos_" + str(k))
    fout.write("\t".join(header) + "\n")

    with FileUtils.open_file_by_type(kb_file) as kb_f:
        for line in kb_f:
            person_id, prof_id = line.strip().split("\t")
            values = [person_id, prof_id]
            person_tf, num_sent = self.get_person_tf(person_id)

            for k in self.K_VALUES:  # we take top-K profession terms
                # Compute sumProfTerms = \sum_{t \in T_k(pr)} \sum_{s \in S(pe)} tf(t,s) * w(t,pr),
                # where T_k(pr) are the top-K profession terms, S(pe) are the person's sentences,
                # and w(t,pr) = TFIDF(t,pr), whose term-frequency component is \sum_{s \in S(pr)} tf(t,s)
                sum_prof_terms = 0
                for term, tf in person_tf.items():
                    pt = self.__stats.get(prof_id, {}).get(term, {})
                    if pt.get("rank", 100000) > k:  # skip term if not in top-K
                        continue
                    sum_prof_terms += tf * pt.get("tfidf", 0)
                values.append(str(sum_prof_terms))

                # compute simCos_K, where K is the number of top terms for the profession
                vec_pr = []  # construct prof vector
                vec_pe = []  # construct person vector
                if prof_id in self.__stats:
                    for term, s in self.__stats[prof_id].items():
                        if s["rank"] <= k:
                            vec_pr.append(s["tfidf"])
                            # we back-generate IDF from the profession's TF-IDF
                            idf = s["tfidf"] / s["tf"]
                            vec_pe.append(person_tf.get(term, 0) * idf)
                    cos = cos_sim(vec_pr, vec_pe)
                else:
                    # in some exceptional cases the profession does not have any sentences
                    cos = 0
                values.append(str(cos))

            fout.write("\t".join(values) + "\n")
            print(values)

    fout.close()
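# cos_sim() is imported from elsewhere in the package; assuming it is plain cosine similarity
# over two equal-length vectors, an equivalent self-contained stand-in would be:
import math

def _cos_sim_sketch(v1, v2):
    """Hypothetical stand-in for cos_sim(), for illustration only."""
    dot = sum(a * b for a, b in zip(v1, v2))
    norm = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
    return dot / norm if norm > 0 else 0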
def main(args):
    config = FileUtils.load_config(args.config)
    type2entity_file = os.path.expanduser(config.get("type2entity_file", ""))
    entity_abstracts_file = os.path.expanduser(config.get("entity_abstracts_file", ""))
    if (not os.path.isfile(type2entity_file)) or (not os.path.isfile(entity_abstracts_file)):
        exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
    PLOGGER.info("Index build: <{}>".format(indexer.name))
def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
    """Adds contents from an NTriples file to MongoDB.

    :param filename: NTriples file.
    :param reverse_triple: if set True, the subject and object values are swapped.
    :param predicate_prefix: prefix to be added to predicates.
    """
    print("Processing " + filename + "...")

    t = Triple()
    p = NTriplesParser(t)
    self.__m_id = None  # document id for MongoDB -- subj
    self.__m_contents = None  # document contents for MongoDB -- pred, obj
    i = 0
    with FileUtils.open_file_by_type(filename) as f:
        for line in f:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # subject prefixing
            subj = self.__prefix.get_prefixed(t.subject())

            # predicate prefixing
            pred = self.__prefix.get_prefixed(t.predicate())
            if predicate_prefix is not None:
                pred = predicate_prefix + pred

            # object prefixing
            if type(t.object()) is URIRef:
                obj = self.__prefix.get_prefixed(t.object())
            else:
                obj = t.object()
                if len(obj) == 0:
                    continue  # skip empty objects

            # write or append
            if reverse_triple:  # reverse subj and obj
                self._next_triple(obj, pred, subj)
            else:  # normal mode
                self._next_triple(subj, pred, obj)

            i += 1
            if i % 100000 == 0:
                print(str(i // 1000) + "K lines processed from " + filename)

    # process last triple
    self._write_to_mongo()
def make_relation_item_ids(rel_items_fpath):
    """Our relation ID is the original relation where any dash or blank is replaced with an underscore.

    :param rel_items_fpath: items file path.
    """
    basename = os.path.basename(rel_items_fpath)  # professions or nationalities
    with open(os.sep.join([DATA_DIR, "{}_ids.tsv".format(basename)]), "w") as f_out:
        for item in FileUtils.read_file_as_list(rel_items_fpath):
            id = item.lower().replace(" ", "_").replace("-", "_")
            f_out.write("{}\t{}\n".format(item, id))
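# Worked example of the normalization above (a hypothetical item, shown only to illustrate
# the replacements):
#
#   >>> "Computer Scientist".lower().replace(" ", "_").replace("-", "_")
#   'computer_scientist'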
def main(args):
    conf = FileUtils.load_config(args.config)
    el = EL(conf, Entity(), ElasticCache(DBPEDIA_INDEX), FeatureCache())

    if conf.get("gen_model", False):
        LTR.train(conf)
    elif args.query:
        res = el.link(args.query)
        pprint(res)
    else:
        el.batch_linking()
def build(self):
    """Builds the word2vec collection from the GoogleNews 300-dim pre-trained corpus."""
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()

    infile = FileUtils.open_file_by_type(self.__w2v_fname)
    i = 0
    for line in infile:
        term, vector = self.__parse_line(line)
        self.__mongo.add(term, {'vector': vector})
        i += 1
        if i % 1000 == 0:
            # integer division, consistent with the other loaders' progress output
            PLOGGER.info(str(i // 1000) + "K lines are loaded.")
def dump_differences(self, output_file):
    """Outputs query-level differences between two methods into a tab-separated file.

    The first method is considered the baseline; the differences are computed with respect to it.
    Output format: queryID res1 res2 diff(res2-res1)
    """
    te_method1 = TrecEval()
    te_method1.evaluate(self.__qrels, self.__run1_file)
    te_method2 = TrecEval()
    te_method2.evaluate(self.__qrels, self.__run2_file)

    data = []
    for query_id in te_method1.get_query_ids():
        res1 = te_method1.get_score(query_id, self.__metric)
        res2 = te_method2.get_score(query_id, self.__metric)
        data.append([query_id, res1, res2, round(res2 - res1, 4)])

    # sorts based on the differences, descending
    sorted_data = sorted(data, key=lambda l: l[3], reverse=True)
    FileUtils.dump_tsv(output_file, sorted_data, header=["queryID", "method1", "method2", "diff"])
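# The dumped TSV is assumed to look roughly like this (made-up values; "diff" is res2 - res1
# rounded to 4 decimals, rows sorted by diff in descending order):
#
#   queryID    method1    method2    diff
#   q42        0.1200     0.3400     0.22
#   q17        0.5000     0.4000     -0.1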
def generate_features(self, kb_file, output_file): """Core function for generating into output_file the features, with person-item data from kb_file. :param kb_file: path to the file with person items (a '.kb'-extension file). :param output_file: :return: """ feat_w2v_approx = FeaturesW2VSimApprox() with open(output_file, "w") as f_out: # write tsv header header = ["person_id", "prof_id"] for k in self.K_VALUES: header.append("simCos_w2v_" + str(k)) f_out.write("\t".join(header) + "\n") for line in FileUtils.read_file_as_list(kb_file): person_id, prof_id = line.split( "\t") # strip() done in read_file_as_list() values = [person_id, prof_id] person_tf, num_sent = self.get_person_tf(person_id) for k in self.K_VALUES: # we take top-K profession terms # compute simCosK # where K is the top-K terms for the profession term_weights_pr = { } # dict from top-K profession terms to their tfidf weights term_weights_pe = { } # dict from top-K person terms to their tfidf weights if prof_id in self.__stats: for term, s in self.__stats[prof_id].items(): if s["rank"] <= k: term_weights_pr[term] = float(s["tfidf"]) idf = s["tfidf"] / s[ "tf"] # we back-generate IDF from profession's TF-IDF term_weights_pe[term] = person_tf.get(term, 0) * idf vec_pr = feat_w2v_approx.get_vector(term_weights_pr) vec_pe = feat_w2v_approx.get_vector(term_weights_pe) cos = cos_sim(vec_pr, vec_pe) else: cos = 0 # in some exceptional cases the profession does not have any sentences values.append(str(cos)) f_out.write("\t".join(values) + "\n")
def main(args): config = FileUtils.load_config(args.config) if "_uri" not in config["index_name"]: print("index name might not be correct, please check again!") exit(0) indexer = IndexerDBpediaURI(config) fields_file = config.get("fields_file", "output/field_counts.json") if "fields_file" not in config: field_counts2json(fields_file) indexer.build() print("Index build: " + config["index_name"])
def main(args):
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")

    # Check DBpedia files
    PLOGGER.info("Checking needed DBpedia files under {}".format(dbpedia_path))
    for fname in [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES:
        if os.path.isfile(os.sep.join([dbpedia_path, fname])):
            PLOGGER.info(" - {}: OK".format(fname))
        else:
            PLOGGER.error(" - {}: Missing".format(fname))
            exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
def __load_entity_abstracts(self):
    num_lines = 0
    filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
    PLOGGER.info("Loading entity abstracts from {}".format(filename))
    for line in FileUtils.read_file_as_list(filename):
        entity, abstract = self.__parse_line(line)
        if abstract and len(abstract) > 0:  # skip empty objects
            self.__entity_abstracts[entity] = abstract

        num_lines += 1
        if num_lines % 10000 == 0:
            PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

    PLOGGER.info("  Done.")
def main(args): config = FileUtils.load_config(args.config) if "_uri" not in config["index_name"]: PLOGGER.error("index name might not be correct, please check again!") exit(0) if "fields_file" not in config: fields_count = compute_field_counts() else: fields_count = json.load(config["fields_file"]) indexer = IndexerDBpediaURI(config, fields_count) indexer.build() PLOGGER.info("Index build: " + config["index_name"])
def main(args):
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    queries = json.load(open(config["query_file"], "r"))
    mappings = json.load(open(config["mapping_file"], "r"))
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()

    # gets the results
    out_file = open(config["output_file"], "w")
    qid_int = 0
    for qid, query in sorted(queries.items()):
        print("Scoring ", qid, "...")
        results, libsvm_str = {}, ""
        query_len = len(elastic_term.analyze_query(query).split())
        scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid], query_len, lambdas)
        for doc_id, p_T_d in sorted(run[qid].items()):
            query_mappings = get_mapping_query(annots[qid], mappings)
            p_E_d = scorer.score_doc(doc_id, query_mappings)
            properties = {'doc_id': doc_id, 'query': query, 'qid': qid, 'qid_int': qid_int}
            features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
            ins = Instance(qid + "_" + doc_id, features=features, properties=properties)
            instances.add_instance(ins)
            # libsvm_str += ins.to_libsvm(qid_prop="qod_int")
            results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)
        qid_int += 1

        # Write trec format
        out_str = trec_format(results, qid, "elr")
        out_file.write(out_str)

    out_file.close()
    print("Output file:", config["output_file"])

    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
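# The final document score above is a linear interpolation of the term-based score p_T_d and
# the entity-based score p_E_d; with the default lambdas [0.9, 0.1] and made-up scores
# p_T_d = 0.8, p_E_d = 0.5:
#
#   score(d) = 0.9 * 0.8 + 0.1 * 0.5 = 0.77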
def main(args):
    config = FileUtils.load_config(args.config)

    type2entity_file = os.path.expanduser(config.get("type2entity_file", ""))
    if not os.path.isfile(type2entity_file):
        print("invalid path to type-to-entity source file: ", type2entity_file)
        exit(1)

    entity_abstracts_file = os.path.expanduser(config.get("entity_abstracts_file", ""))
    if not os.path.isfile(entity_abstracts_file):
        print("invalid path to entity abstracts source file: ", entity_abstracts_file)
        exit(1)

    indexer = IndexerDBpediaTypes(config, type2entity_file, entity_abstracts_file)
    indexer.build_index(force=True)
    print("Index build: <{}>".format(indexer.name))
def main(args):
    example_config = {"index_name": "toy_index",
                      # "query_file": "data/queries/test_queries.json",
                      "first_pass": {
                          "num_docs": 1000,
                          "field": "content",
                          # "model": "LMJelinekMercer",
                          # "model_params": {"lambda": 0.1}
                      },
                      "second_pass": {
                          "field": "content",
                          "model": "lm",
                          "smoothing_method": "jm",
                          "smoothing_param": 0.1
                      },
                      "output_file": "output/test_retrieval.txt"
                      }
    config = FileUtils.load_config(args.config) if args.config != "" else example_config
    r = Retrieval(config)
    r.batch_retrieval()
def read_fb2dbp_file(self, is_39=False):
    """Reads the file and generates an initial mapping of Freebase to DBpedia IDs.

    Only proper DBpedia entities are considered; i.e. redirect and disambiguation pages are ignored.
    """
    fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
    print("Processing " + fb2dbp_file + "...")

    t = Triple()
    p = NTriplesParser(t)
    i = 0
    fb2dbp_mapping = defaultdict(set)
    with FileUtils.open_file_by_type(fb2dbp_file) as f:
        for line in f:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # prefixing
            dbp_id = self.__prefix.get_prefixed(t.subject())
            fb_id = self.__prefix.get_prefixed(t.object())

            # if reading 3.9 file, converts ID to 2015-10 version
            if is_39:
                dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                fb2dbp_mapping[fb_id].add(dbp_id)
            # if reading 2015-10 file, keeps only the proper DBpedia entities
            else:
                entity_utils = EntityUtils(self.__mongo_dbpedia.find_by_id(dbp_id))
                if entity_utils.is_entity():
                    fb2dbp_mapping[fb_id].add(dbp_id)

            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K lines are processed!")

    return fb2dbp_mapping