import logging
import time

# Project-local helpers (MongoBackedDict, TitleNormalizer, NULL_TITLE, fields,
# get_normalized_wikititle_kbentry, add_to_dict, getngrams) are assumed to be
# importable from elsewhere in this repo; their module paths are not shown here.


def load_redirects_mongo(name, path, overwrite=False):
    mongo_r2t = MongoBackedDict(dbname=name)
    if mongo_r2t.size() == 0 or overwrite:
        logging.info("collection %s empty or overwrite requested. creating from %s ...",
                     name, path)
        r2t = {}
        err = 0
        with open(path) as f:
            for idx, l in enumerate(f):
                parts = l.strip().split("\t")
                if len(parts) != 2:
                    logging.info("error on line %d %s", idx, parts)
                    err += 1
                    continue
                redirect, title = parts
                if redirect in r2t:
                    logging.info("duplicate key %s! was this on purpose?", redirect)
                r2t[redirect] = title
        logging.info("map of size %d loaded, %d err lines", len(r2t), err)
        mongo_r2t.bulk_insert(regular_map=r2t, insert_freq=len(r2t))
    logging.info("r2t of size %d", mongo_r2t.size())
    return mongo_r2t
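
# Usage sketch (hypothetical path and db name, for illustration only): the input
# TSV is expected to hold one "redirect<TAB>canonical_title" pair per line.
def _demo_redirects():
    r2t = load_redirects_mongo(name="enwiki_redirects",
                               path="/path/to/redirects.tsv")
    # MongoBackedDict supports dict-style lookup of the canonical title.
    print(r2t["Obama"])  # e.g. "Barack_Obama"
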
def load_map_mongo(path, overwrite=False):
    m = MongoBackedDict(dbname=path)
    rev_m = None
    if m.size() == 0 or overwrite:
        logging.info("(re)creating collection %s ...", path)
        m.drop_collection()
        tmp = {}
        logging.info("loading map from %s", path)
        # abs_path is assumed to be a module-level path prefix that replaces
        # the first 5 characters of `path` (e.g. a "data/" prefix); rebinding
        # the name abs_path itself here would raise UnboundLocalError.
        file_path = abs_path + path[5:]
        err = 0
        with open(file_path) as f:
            for idx, l in enumerate(f):
                parts = l.strip().split("\t")
                if len(parts) != 2:
                    logging.info("error on line %d %s", idx, parts)
                    err += 1
                    continue
                k, v = parts
                if k in tmp:
                    logging.info("duplicate key %s was this on purpose?", k)
                tmp[k] = v
        rev_m = {v: k for k, v in tmp.items()}
        logging.info("inserting map of size %d to mongo (%d err lines)",
                     len(tmp), err)
        m.bulk_insert(regular_map=tmp, insert_freq=len(tmp))
    return m, rev_m
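
# Usage sketch (hypothetical relative path; a leading "data/"-style prefix is
# assumed to be swapped for the module-level abs_path, per the slicing above):
def _demo_load_map():
    m, rev_m = load_map_mongo("data/some_map.tsv", overwrite=True)
    # rev_m is an in-memory reverse (value -> key) dict; it stays None when the
    # collection already existed and was not rebuilt.
    if rev_m is not None:
        print(len(rev_m))
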
def load_id2title_mongo(name, path, overwrite=False):
    mongo_id2t = MongoBackedDict(dbname=name + ".id2t")
    # TODO Maybe you can use the same db and its reverse?
    mongo_t2id = MongoBackedDict(dbname=name + ".t2id")
    # TODO fix below
    redirect_set = None
    if mongo_id2t.size() == 0 or mongo_t2id.size() == 0 or overwrite:
        logging.info("db not found for %s. creating from %s ...", name, path)
        id2t, t2id = {}, {}
        redirect_set = set()
        with open(path) as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) != 3:
                    logging.info("bad line %s", line)
                    continue
                page_id, page_title, is_redirect = parts
                id2t[page_id] = page_title
                t2id[page_title] = page_id
                if is_redirect == "1":
                    redirect_set.add(page_title)
        mongo_id2t.bulk_insert(regular_map=id2t, insert_freq=len(id2t))
        mongo_t2id.bulk_insert(regular_map=t2id, insert_freq=len(t2id))
    logging.info("id2t of size %d", mongo_id2t.size())
    logging.info("t2id of size %d", mongo_t2id.size())
    return mongo_id2t, mongo_t2id, redirect_set
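
# Usage sketch (hypothetical path and ids): the input TSV carries
# "page_id<TAB>page_title<TAB>is_redirect" per line.
def _demo_id2title():
    id2t, t2id, redirects = load_id2title_mongo(
        name="enwiki", path="/path/to/id2title.tsv")
    title = id2t["534366"]          # page id -> title
    assert t2id[title] == "534366"  # title -> page id (reverse map)
    # redirects is only populated when the collections were (re)built;
    # it is None when both collections already existed.
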
def load_prob_map_mongo(out_prefix, kind, dbname=None, force_rewrite=False, hostname="localhost"): path = out_prefix + "." + kind if dbname is None: dbname = path logging.info("dbname is %s", dbname) probmap = MongoBackedDict(dbname=dbname, hostname=hostname) logging.info("reading collection %s", path) if probmap.size() > 0 and not force_rewrite: logging.info( "collection already exists in db (size=%d). returning ...", probmap.size()) return probmap else: if force_rewrite: logging.info("dropping existing collection in db.") probmap.drop_collection() # mmap = defaultdict(lambda: defaultdict(float)) mmap = {} for idx, line in enumerate(open(path)): parts = line.split("\t") if idx > 0 and idx % 1000000 == 0: logging.info("read line %d", idx) if len(parts) != 4: logging.info("error on line %d: %s", idx, line) continue y, x, prob, _ = parts if y not in mmap: mmap[y] = {} mmap[y][x] = float(prob) for y in list(mmap.keys()): # TODO will below ever be false? # if y not in probmap: # Nested dict keys cannot have '.' and '$' in mongodb # tmpdict = {x: mmap[y][x] for x in mmap[y]} if len(mmap[y]) > 5000: logging.info( "string %s can link to %d items (>10k)... skipping", y, len(mmap[y])) # mmap[y] = [] # continue del mmap[y] else: # tmpdict = [(x, mmap[y][x]) for x in mmap[y]] mmap[y] = list(mmap[y].items()) # try: # probmap[y] = tmpdict # except DocumentTooLarge as e: # print(y, len(tmpdict)) # print(e) probmap.bulk_insert(regular_map=mmap, insert_freq=len(mmap)) return probmap
class Inlinks:
    """
    Reads the outlinks file and computes the inlinks dictionary from it,
    storing it in a MongoDB-backed dict for fast access.
    """

    def __init__(self, normalizer=None, overwrite=False, links_file=None,
                 pkl_path="/shared/bronte/upadhya3/tac2018/inlinks.pkl"):
        # pkl_path is unused (kept for backward compatibility); the map now
        # lives in Mongo.
        if links_file is None:
            links_file = "/shared/preprocessed/cddunca2/wikipedia/outlinks.t2t"
        self.normalizer = normalizer
        self.inlinks = MongoBackedDict(dbname="enwiki_inlinks")
        if self.inlinks.size() == 0 or overwrite:
            self.inlinks.drop_collection()
            start = time.time()
            logging.info("loading from file %s", links_file)
            self.load_link_info(links_file=links_file)
            logging.info("created in %d secs", time.time() - start)

    def load_link_info(self, links_file):
        logging.info("loading links %s ...", links_file)
        bad = 0
        mmap = {}
        with open(links_file) as f:
            for idx, line in enumerate(f):
                if idx > 0 and idx % 1000000 == 0:
                    logging.info("read %d", idx)
                parts = line.strip().split('\t')
                if len(parts) != 2:
                    bad += 1
                    if bad % 10000 == 0:
                        logging.info("bad %d total %d", bad, idx)
                    continue
                src = parts[0]
                trgs = parts[1].split(' ')
                # Invert the src -> [trgs] outlink map into trg -> [srcs].
                for trg in trgs:
                    if trg not in mmap:
                        mmap[trg] = []
                    mmap[trg].append(src)
        logging.info("inserting regular map into mongo")
        # Bulk insert in one shot; assigning keys one by one through
        # self.inlinks[trg] = ... is much slower.
        self.inlinks.bulk_insert(regular_map=mmap, insert_freq=len(mmap))
        logging.info("mongo map made")
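
# Usage sketch: build (or reuse) the inlinks collection, then look up which
# pages link to a given title (title string is illustrative).
def _demo_inlinks():
    inlinks = Inlinks()
    # inlinks.inlinks maps a target title to the list of source titles
    # linking to it.
    print(inlinks.inlinks["Barack_Obama"][:10])
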
class Wiki2Lorelei:
    def __init__(self, ilcode, overwrite=False):
        cll_name = "wiki2eid_il" + ilcode
        self.wiki2eids = MongoBackedDict(dbname=cll_name)
        self.normalizer = TitleNormalizer()
        # Rebuild when the collection is empty or an overwrite is requested.
        if self.wiki2eids.size() == 0 or overwrite:
            self.wiki2eids.drop_collection()
            logging.info("computing wiki2eids map ...")
            self.compute_map(ilcode)
        logging.info("wiki2eids map loaded (size=%d)", self.wiki2eids.size())

    # @profile
    def compute_map(self, ilcode):
        basepath = "/shared/corpora/corporaWeb/lorelei/evaluation-2019/"
        kbfile = basepath + "il{}/source/kb/IL{}_kb/data/entities.tab".format(
            ilcode, ilcode)
        tmp_map = {}
        # `fields` is the module-level tuple of KB column names.
        with open(kbfile) as f:
            for idx, line in enumerate(f):
                if idx > 0 and idx % 100000 == 0:
                    logging.info("read %d lines", idx)
                parts = line.rstrip('\n').split('\t')
                if len(parts) < len(fields):
                    logging.info("bad line %d nfields:%d expected:%d", idx,
                                 len(parts), len(fields))
                    continue
                kbentry = {}
                for field, v in zip(fields, parts):
                    if len(v) != 0:
                        kbentry[field] = v
                eid = kbentry["entityid"]
                title = get_normalized_wikititle_kbentry(
                    title_normalizer=self.normalizer, kbentry=kbentry)
                if title == NULL_TITLE:
                    continue
                if title not in tmp_map:
                    tmp_map[title] = []
                tmp_map[title].append(eid)
        self.wiki2eids.bulk_insert(regular_map=tmp_map,
                                   insert_freq=len(tmp_map))
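
# Usage sketch (ilcode and title are illustrative): map a normalized wiki
# title to its LORELEI KB entity ids.
def _demo_wiki2lorelei():
    w2l = Wiki2Lorelei(ilcode="9")
    print(w2l.wiki2eids["Oromo_people"])  # list of entity ids for this title
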
def load_nicknames(path="/shared/experiments/upadhya3/ppoudyaltest/wiki_list", overwrite=False): nicknames = MongoBackedDict(dbname="nicknames") if nicknames.size() == 0 or overwrite: nn_map = {} # populate nn_map for idx,line in enumerate(open(path)): parts = line.strip().split('\t') if idx > 0 and idx % 10000==0: logging.info("read %d lines", idx) # if len(parts)!=3: # logging.info("bad line %s",line) # continue title, tid = parts[:2] fr_strs = parts[2:] # print(title,tid,fr_strs) for fr_str in fr_strs: if fr_str not in nn_map: nn_map[fr_str] = title nicknames.bulk_insert(regular_map=nn_map, insert_freq=len(nn_map)) return nicknames
class AbstractIndex:
    def __init__(self, index_name, kbfile, overwrite=False, ngramorders=[]):
        self.name2ent = MongoBackedDict(dbname=index_name + ".phrase")
        self.word2ent = MongoBackedDict(dbname=index_name + ".word")
        self.ngram2ent = {}
        self.kbfile = kbfile
        self.ngramorders = ngramorders
        for i in self.ngramorders:
            self.ngram2ent[i] = MongoBackedDict(
                dbname=index_name + ".ngram-{}".format(i))
        index_type = None
        indices = [self.name2ent, self.word2ent] + list(self.ngram2ent.values())
        all_empty = all(i.size() == 0 for i in indices)
        if overwrite or all_empty:
            self.name2ent.drop_collection()
            self.word2ent.drop_collection()
            for i in self.ngramorders:
                self.ngram2ent[i].drop_collection()
            index_type = "all"
        else:
            # TODO The logic here is messed up
            if self.name2ent.size() == 0:
                self.name2ent.drop_collection()
                index_type = "name2ent"
            if self.word2ent.size() == 0:
                self.word2ent.drop_collection()
                index_type = "word2ent"
            for i in self.ngramorders:
                if self.ngram2ent[i].size() == 0:
                    self.ngram2ent[i].drop_collection()
                    index_type = "ngram2ent"
        if index_type is not None:
            start = time.time()
            logging.info("loading %s from file %s", index_name, self.kbfile)
            self.load_kb(index_type=index_type)
            logging.info("created in %d secs", time.time() - start)
        logging.info("%s loaded", index_name)

    def process_kb(self):
        # Subclasses yield (names, eid) pairs read from self.kbfile.
        raise NotImplementedError

    def load_kb(self, index_type):
        name_map = {}
        word_map = {}
        ngram_map = {}
        logging.info("index type:%s", index_type)
        for i in self.ngramorders:
            ngram_map[i] = {}
        try:
            for names, eid in self.process_kb():
                names = set(names)
                if index_type in ["all", "name2ent"]:
                    add_to_dict(names, eid, name_map)
                if index_type in ["all", "word2ent"]:
                    toks = set(tok for n in names for tok in n.split(" "))
                    add_to_dict(toks, eid, word_map)
                if index_type in ["all", "ngram2ent"]:
                    for i in self.ngramorders:
                        ngramset = set(gram for n in names
                                       for gram in getngrams(n, ngram=i))
                        add_to_dict(ngramset, eid, ngram_map[i])
            self.put_in_mongo(index_type, name_map, word_map, ngram_map)
        except KeyboardInterrupt:
            # Flush whatever was indexed so far instead of losing it.
            logging.info("ending prematurely.")
            self.put_in_mongo(index_type, name_map, word_map, ngram_map)

    def put_in_mongo(self, index_type, name_map, word_map, ngram_map):
        if index_type in ["all", "name2ent"]:
            self.name2ent.bulk_insert(name_map, insert_freq=len(name_map),
                                      value_func=list)
        if index_type in ["all", "word2ent"]:
            self.word2ent.bulk_insert(word_map, insert_freq=len(word_map),
                                      value_func=list)
        if index_type in ["all", "ngram2ent"]:
            for i in self.ngramorders:
                ngram_map[i] = self.prune_map(ngram_map[i])
                self.ngram2ent[i].bulk_insert(ngram_map[i],
                                              insert_freq=len(ngram_map[i]),
                                              value_func=list)

    def prune_map(self, nmap):
        # The dict changes size during iteration, so snapshot the keys first.
        for k in list(nmap.keys()):
            if len(nmap[k]) > 10000:
                logging.info("pruning entry for %s len=%d", k, len(nmap[k]))
                del nmap[k]
        return nmap
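
# Minimal sketch of a concrete subclass (file format assumed: one
# "eid<TAB>name1||name2..." entry per line), showing the (names, eid)
# contract that process_kb() is expected to fulfil.
class TsvKbIndex(AbstractIndex):
    def process_kb(self):
        with open(self.kbfile) as f:
            for line in f:
                parts = line.rstrip('\n').split('\t')
                if len(parts) != 2:
                    continue
                eid, names = parts[0], parts[1].split("||")
                yield names, eid

# index = TsvKbIndex(index_name="demo_kb", kbfile="/path/to/kb.tsv",
#                    ngramorders=[3])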