Example #1
File: misc_utils.py Project: zeyofu/EDL
def load_redirects_mongo(name, path, overwrite=False):
    # pkl_path = path + ".pkl"
    # if os.path.exists(pkl_path):
    # logging.info("pkl found! loading map %s", pkl_path)
    # r2t = load(pkl_path)
    # else:
    mongo_r2t = MongoBackedDict(dbname=name)
    if mongo_r2t.size() == 0 or overwrite:
        logging.info("db not found at %s. creating ...", path)
        f = open(path)
        r2t = {}
        err = 0
        logging.info("pkl not found ...")
        logging.info("loading map from %s", path)
        for idx, l in enumerate(f):
            parts = l.strip().split("\t")
            if len(parts) != 2:
                logging.info("error on line %d %s", idx, parts)
                err += 1
                continue
            redirect, title = parts
            if redirect in r2t:
                logging.info("duplicate keys! was this on purpose?")
            r2t[redirect] = title
        logging.info("map of size %d loaded %d err lines", len(r2t), err)
        mongo_r2t.bulk_insert(regular_map=r2t, insert_freq=len(r2t))
    logging.info("r2t of size %d", mongo_r2t.size())
    return mongo_r2t
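A minimal usage sketch, assuming the redirects file is a two-column TSV of redirect and target title (the db name and path below are placeholders):

# hypothetical call; reuses the existing Mongo collection unless overwrite=True
r2t = load_redirects_mongo(name="enwiki_redirects", path="/path/to/redirects.tsv")
logging.info("redirect map holds %d entries", r2t.size())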
Example #2
File: misc_utils.py Project: zeyofu/EDL
def load_map_mongo(path, overwrite=False):
    m = MongoBackedDict(dbname=path)
    rev_m = None
    if m.size() == 0 or overwrite:
        logging.info("dropping existing collection ...")
        m.drop_collection()
        tmp = {}
        # logging.info("pkl not found ...")
        logging.info("loading map from %s", path)
        # abs_path is assumed to be a module-level path prefix; keep the joined
        # path in a separate local so the prefix is not shadowed
        full_path = abs_path + path[5:]
        f = open(full_path)
        err = 0
        for idx, l in enumerate(f):
            parts = l.strip().split("\t")
            if len(parts) != 2:
                logging.info("error on line %d %s", idx, parts)
                err += 1
                continue
            k, v = parts
            if k in tmp:
                logging.info("duplicate key %s was this on purpose?", k)
            tmp[k] = v
        rev_m = {v: k for k, v in tmp.items()}
        logging.info("inserting map of size %d to mongo (%d err lines)",
                     len(tmp), err)
        m.bulk_insert(regular_map=tmp, insert_freq=len(tmp))
    return m, rev_m
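A usage sketch under the same assumptions; rev_m is only built when the collection is (re)created, and the path below is a placeholder:

# hypothetical call; rev_m is None when an existing non-empty collection is reused
m, rev_m = load_map_mongo(path="/path/to/some_map.tsv")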
Example #3
File: misc_utils.py Project: zeyofu/EDL
def load_id2title_mongo(name, path, overwrite=False):
    mongo_id2t = MongoBackedDict(dbname=name + ".id2t")
    # TODO Maybe you can use the same db and its reverse?
    mongo_t2id = MongoBackedDict(dbname=name + ".t2id")
    # TODO fix below
    redirect_set = None
    if mongo_id2t.size() == 0 or mongo_t2id.size() == 0 or overwrite:
        logging.info("db not found at %s. creating ...", path)
        id2t, t2id = {}, {}
        redirect_set = set([])
        for line in open(path):
            parts = line.strip().split("\t")
            if len(parts) != 3:
                logging.info("bad line %s", line)
                continue
            # page_id, title = parts
            page_id, page_title, is_redirect = parts
            id2t[page_id] = page_title
            t2id[page_title] = page_id
            if is_redirect == "1":
                redirect_set.add(page_title)
        mongo_id2t.bulk_insert(regular_map=id2t, insert_freq=len(id2t))
        mongo_t2id.bulk_insert(regular_map=t2id, insert_freq=len(t2id))
        # obj = id2t, t2id, redirect_set
        # save(pkl_path, obj)
        # logging.info("saving id2t pkl to %s", pkl_path)
    logging.info("id2t of size %d", mongo_id2t.size())
    logging.info("t2id of size %d", mongo_t2id.size())
    return mongo_id2t, mongo_t2id, redirect_set
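A usage sketch, assuming the input file has page_id, page_title, and is_redirect columns (the name and path below are placeholders):

# hypothetical call; redirect_set is None when the existing collections are reused
id2t, t2id, redirect_set = load_id2title_mongo(name="enwiki", path="/path/to/id2title.tsv")
logging.info("loaded %d titles", id2t.size())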
Example #4
def load_prob_map_mongo(out_prefix,
                        kind,
                        dbname=None,
                        force_rewrite=False,
                        hostname="localhost"):
    path = out_prefix + "." + kind
    if dbname is None:
        dbname = path
    logging.info("dbname is %s", dbname)
    probmap = MongoBackedDict(dbname=dbname, hostname=hostname)
    logging.info("reading collection %s", path)
    if probmap.size() > 0 and not force_rewrite:
        logging.info(
            "collection already exists in db (size=%d). returning ...",
            probmap.size())
        return probmap
    else:
        if force_rewrite:
            logging.info("dropping existing collection in db.")
            probmap.drop_collection()
        # mmap = defaultdict(lambda: defaultdict(float))
        mmap = {}
        for idx, line in enumerate(open(path)):
            parts = line.split("\t")
            if idx > 0 and idx % 1000000 == 0:
                logging.info("read line %d", idx)
            if len(parts) != 4:
                logging.info("error on line %d: %s", idx, line)
                continue
            y, x, prob, _ = parts
            if y not in mmap:
                mmap[y] = {}
            mmap[y][x] = float(prob)
        for y in list(mmap.keys()):
            # TODO will below ever be false?
            # if y not in probmap:
            # Nested dict keys cannot have '.' and '$' in mongodb
            # tmpdict = {x: mmap[y][x] for x in mmap[y]}
            if len(mmap[y]) > 5000:
                logging.info(
                    "string %s can link to %d items (>5000)... skipping", y,
                    len(mmap[y]))
                # mmap[y] = []
                # continue
                del mmap[y]
            else:
                # tmpdict = [(x, mmap[y][x]) for x in mmap[y]]
                mmap[y] = list(mmap[y].items())
                # try:
                #     probmap[y] = tmpdict
                # except DocumentTooLarge as e:
                #     print(y, len(tmpdict))
                #     print(e)
        probmap.bulk_insert(regular_map=mmap, insert_freq=len(mmap))
    return probmap
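A usage sketch, assuming a four-column TSV named <out_prefix>.<kind> whose columns are the key, the linked string, a probability, and an ignored fourth field (the prefix and kind below are placeholders):

# hypothetical call; each key maps to a list of (x, prob) pairs in Mongo
p2t = load_prob_map_mongo(out_prefix="/path/to/probs", kind="p2t")
logging.info("prob map holds %d keys", p2t.size())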
Example #5
class Inlinks:
    """
    reads the outlinks file and computes the inlinks dictionary from it.
    saves it in a pickled dict for fast access.
    """
    def __init__(self,
                 normalizer=None,
                 overwrite=False,
                 links_file=None,
                 pkl_path="/shared/bronte/upadhya3/tac2018/inlinks.pkl"):
        # normalizer = TitleNormalizer() if normalizer is None else normalizer
        if links_file is None:
            links_file = "/shared/preprocessed/cddunca2/wikipedia/outlinks.t2t"
        self.normalizer = normalizer
        self.inlinks = MongoBackedDict(dbname="enwiki_inlinks")
        if self.inlinks.size() == 0 or overwrite:
            self.inlinks.drop_collection()
            start = time.time()
            logging.info("loading from file %s", links_file)
            self.load_link_info(links_file=links_file)
            logging.info("created in %d secs", time.time() - start)

    def load_link_info(self, links_file):
        logging.info("loading links %s ...", links_file)
        bad = 0
        mmap = {}
        for idx, line in enumerate(open(links_file)):
            if idx > 0 and idx % 1000000 == 0:
                logging.info("read %d", idx)
            line = line.strip().split('\t')
            if len(line) != 2:
                # logging.info("skipping bad line %s", line)
                bad += 1
                if bad % 10000 == 0:
                    logging.info("bad %d total %d", bad, idx)
                continue
            src = line[0]
            trgs = line[1].split(' ')
            for trg in trgs:
                if trg not in mmap:
                    mmap[trg] = []
                mmap[trg].append(src)
        logging.info("inserting regular map into mongo")
        self.inlinks.bulk_insert(regular_map=mmap, insert_freq=len(mmap))
        # DONT DO THIS! this inserts one by one, which is slow
        # for trg in mmap:
        #     self.inlinks[trg] = mmap[trg]
        logging.info("mongo map made")
Example #6
class Wiki2Lorelei:
    def __init__(self, ilcode, overwrite=False):
        cll_name = "wiki2eid_il" + ilcode
        self.wiki2eids = MongoBackedDict(dbname=cll_name)
        self.normalizer = TitleNormalizer()
        if overwrite:
            self.wiki2eids.drop_collection()
            logging.info("computing wiki2eids map ...")
            self.compute_map(ilcode)
        logging.info("wiki2eids map loaded (size=%d)", self.wiki2eids.size())

    # @profile
    def compute_map(self, ilcode):
        basepath = "/shared/corpora/corporaWeb/lorelei/evaluation-2019/"
        kbfile = basepath + "il{}/source/kb/IL{}_kb/data/entities.tab".format(
            ilcode, ilcode)
        tmp_map = {}
        for idx, line in enumerate(open(kbfile)):

            if idx > 0 and idx % 100000 == 0:
                logging.info("read %d lines", idx)

            parts = line.rstrip('\n').split('\t')
            if len(parts) < len(fields):
                logging.info("bad line %d nfields:%d expected:%d", idx,
                             len(parts), len(fields))
                continue

            kbentry = {}
            for field, v in zip(fields, parts):
                if len(v) != 0:
                    kbentry[field] = v

            eid = kbentry["entityid"]
            title = get_normalized_wikititle_kbentry(
                title_normalizer=self.normalizer, kbentry=kbentry)

            if title == NULL_TITLE:
                continue

            if title not in tmp_map:
                tmp_map[title] = []
            tmp_map[title].append(eid)
        self.wiki2eids.bulk_insert(regular_map=tmp_map,
                                   insert_freq=len(tmp_map))
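A usage sketch; the incident-language code below is a placeholder, and the map is only recomputed from the LORELEI KB when overwrite=True (compute_map also relies on the project's module-level fields list and NULL_TITLE constant, which are not shown here):

# hypothetical call
w2e = Wiki2Lorelei(ilcode="9", overwrite=True)
logging.info("wiki2eids size: %d", w2e.wiki2eids.size())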
Example #7
def load_nicknames(path="/shared/experiments/upadhya3/ppoudyaltest/wiki_list", overwrite=False):
    nicknames = MongoBackedDict(dbname="nicknames")
    if nicknames.size() == 0 or overwrite:
        nn_map = {}
        # populate nn_map
        for idx,line in enumerate(open(path)):
            parts = line.strip().split('\t')
            if idx > 0 and idx % 10000==0:
                logging.info("read %d lines", idx)
            # if len(parts)!=3:
            #     logging.info("bad line %s",line)
            #     continue
            title, tid = parts[:2]
            fr_strs = parts[2:]
            # print(title,tid,fr_strs)
            for fr_str in fr_strs:
                if fr_str not in nn_map:
                    nn_map[fr_str] = title
        nicknames.bulk_insert(regular_map=nn_map, insert_freq=len(nn_map))
    return nicknames
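A usage sketch, assuming each line of the wiki_list file carries a title, an id, and one or more nickname strings separated by tabs:

# hypothetical call; reuses the nicknames collection unless overwrite=True
nicknames = load_nicknames()
logging.info("nicknames size: %d", nicknames.size())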
class AbstractIndex:
    def __init__(self, index_name, kbfile, overwrite=False, ngramorders=[]):
        self.name2ent = MongoBackedDict(dbname=index_name + ".phrase")
        self.word2ent = MongoBackedDict(dbname=index_name + ".word")
        self.ngram2ent = {}
        self.kbfile = kbfile
        self.ngramorders = ngramorders

        for i in self.ngramorders:
            self.ngram2ent[i] = MongoBackedDict(dbname=index_name +
                                                ".ngram-{}".format(i))
        index_type = None
        # check all backing collections (phrase, word, and each ngram order)
        indices = [self.name2ent, self.word2ent] + \
            [self.ngram2ent[i] for i in self.ngramorders]
        all_empty = all(i.size() == 0 for i in indices)
        if overwrite or all_empty:
            self.name2ent.drop_collection()
            self.word2ent.drop_collection()
            for i in self.ngramorders:
                self.ngram2ent[i].drop_collection()
            index_type = "all"
        else:
            # TODO The logic here is messed up
            if self.name2ent.size() == 0:
                self.name2ent.drop_collection()
                index_type = "name2ent"

            if self.word2ent.size() == 0:
                self.word2ent.drop_collection()
                index_type = "word2ent"

            for i in self.ngramorders:
                if self.ngram2ent[i].size() == 0:
                    self.ngram2ent[i].drop_collection()
                    index_type = "ngram2ent"

        if index_type is not None:
            start = time.time()
            logging.info("loading from file %s", index_name)
            self.load_kb(index_type=index_type)
            logging.info("created in %d secs", time.time() - start)
        logging.info("%s loaded", index_name)

    def process_kb(self):
        raise NotImplementedError

    def load_kb(self, index_type):
        name_map = {}
        word_map = {}
        ngram_map = {}
        logging.info("index type:%s", index_type)
        for i in self.ngramorders:
            ngram_map[i] = {}
        try:
            for names, eid in self.process_kb():

                names = set(names)
                if index_type in ["all", "name2ent"]:
                    add_to_dict(names, eid, name_map)

                if index_type in ["all", "word2ent"]:
                    toks = set([tok for n in names for tok in n.split(" ")])
                    add_to_dict(toks, eid, word_map)

                if index_type in ["all", "ngram2ent"]:
                    for i in self.ngramorders:
                        ngramset = set([
                            gram for n in names
                            for gram in getngrams(n, ngram=i)
                        ])
                        add_to_dict(ngramset, eid, ngram_map[i])

            self.put_in_mongo(index_type, name_map, word_map, ngram_map)
        except KeyboardInterrupt:
            logging.info("ending prematurely.")
            self.put_in_mongo(index_type, name_map, word_map, ngram_map)

    def put_in_mongo(self, index_type, name_map, word_map, ngram_map):
        if index_type in ["all", "name2ent"]:
            self.name2ent.bulk_insert(name_map,
                                      insert_freq=len(name_map),
                                      value_func=lambda x: list(x))
        if index_type in ["all", "word2ent"]:
            self.word2ent.bulk_insert(word_map,
                                      insert_freq=len(word_map),
                                      value_func=lambda x: list(x))
        if index_type in ["all", "ngram2ent"]:
            for i in self.ngramorders:
                ngram_map[i] = self.prune_map(ngram_map[i])
                self.ngram2ent[i].bulk_insert(ngram_map[i],
                                              insert_freq=len(ngram_map[i]),
                                              value_func=lambda x: list(x))

    def prune_map(self, nmap):
        # dict changes during iteration, so take care
        for k in list(nmap.keys()):
            if len(nmap[k]) > 10000:
                logging.info("pruning entry for %s len=%d", k, len(nmap[k]))
                del nmap[k]
        return nmap
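AbstractIndex.process_kb is abstract; load_kb expects it to yield (names, eid) pairs and also relies on the project's add_to_dict and getngrams helpers, which are not shown here. A minimal subclass sketch, assuming a KB file with an entity id and pipe-separated names per line (this format is an assumption, not the project's actual KB layout):

class TsvKbIndex(AbstractIndex):
    def process_kb(self):
        # assumed format: eid<TAB>name1|name2|...; the real KB layout may differ
        for line in open(self.kbfile):
            parts = line.rstrip("\n").split("\t")
            if len(parts) != 2:
                continue
            eid, names = parts
            yield names.split("|"), eid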