Example #1
    def __init__(self):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.stemmer = CacheStemmer()
        self.stopword = load_stopwords()
        self.df = self.load_galgo_df_stat()
Example #2
def segment_per_doc_index(task_id):
    token_reader = get_token_reader()
    stemmer = CacheStemmer()
    stopword = load_stopwords()

    p = os.path.join(cpath.data_path, "adhoc", "robust_seg_info.pickle")
    seg_info = pickle.load(open(p, "rb"))

    def get_doc_posting_list(doc_id):
        doc_posting = defaultdict(list)
        # Retrieve and stem the document once, then reuse it for every segment.
        tokens = token_reader.retrieve(doc_id)
        st_tokens = [stemmer.stem(t) for t in tokens]
        for interval in seg_info[doc_id]:
            (loc, loc_ed), (_, _) = interval
            ct = Counter(st_tokens[loc:loc_ed])
            for term, cnt in ct.items():
                if term in stopword:
                    continue
                doc_posting[term].append((loc, cnt))

        return doc_posting

    doc_id_list = get_doc_task(task_id)
    ticker = TimeEstimator(len(doc_id_list))
    doc_posting_d = {}
    for doc_id in doc_id_list:
        doc_posting_d[doc_id] = get_doc_posting_list(doc_id)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "per_doc_posting_{}.pickle".format(task_id))
    pickle.dump(doc_posting_d, open(save_path, "wb"))
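
For reference, the pickle written above maps each doc_id to a dictionary from stemmed term to a list of (segment_start, count) pairs. A minimal sketch of how it might be read back; the task_id value of 0 is an illustrative assumption, and cpath is the project module used throughout these examples:

import os
import pickle

import cpath  # project module providing data_path, as in the examples

p = os.path.join(cpath.data_path, "adhoc", "per_doc_posting_0.pickle")
doc_posting_d = pickle.load(open(p, "rb"))

doc_id = next(iter(doc_posting_d))
for term, postings in list(doc_posting_d[doc_id].items())[:5]:
    # postings is a list of (segment_start, count) pairs for this document.
    print(term, postings)
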
Example #3
def get_idf_keyword_score(problems: List[QueryDoc],
                          get_idf) -> Iterable[Counter]:
    stemmer = CacheStemmer()
    ticker = TimeEstimator(len(problems))
    for p in problems:
        tokens = p.doc
        tf = Counter()
        reverse_map = {}  # Stemmed -> raw
        tokens = [t for t in tokens if t not in [".", ",", "!"]]
        for raw_t in tokens:
            stem_t = stemmer.stem(raw_t)
            reverse_map[stem_t] = raw_t
            tf[stem_t] += 1

        score_d = Counter()
        for term, cnt in tf.items():
            # TF-IDF style weight: log-scaled term frequency times IDF.
            score = math.log(1 + cnt) * get_idf(term)
            assert isinstance(score, float)
            score_d[term] = score

        score_d_surface_form: Counter = Counter(
            dict_key_map(lambda x: reverse_map[x], score_d))
        ticker.tick()
        yield score_d_surface_form
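
A minimal usage sketch for the function above. It assumes, based on this and later examples, that QueryDoc carries tokenized query and doc fields; the namedtuple stand-in and the constant-valued IDF below are illustrative only:

from collections import namedtuple

# Hypothetical stand-in for the project's QueryDoc (only .doc is used here).
QueryDoc = namedtuple("QueryDoc", ["query", "doc"])

problems = [QueryDoc(query=["neural", "ranking"],
                     doc="neural ranking models rank documents .".split())]

def toy_get_idf(term):
    return 1.0  # a real IDF would come from collection statistics

for score_d in get_idf_keyword_score(problems, toy_get_idf):
    print(score_d.most_common(3))
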
Example #4
    def __init__(self, window_size):
        self.stemmer = CacheStemmer()
        self.window_size = window_size
        self.doc_posting = None
        self.stopword = load_stopwords()

        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=True)


        def load_pickle(name):
            p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
            return pickle.load(open(p, "rb"))

        self.doc_len_dict = load_pickle("doc_len")
        self.qdf = load_pickle("robust_qdf_ex")
        self.meta = load_pickle("robust_meta")
        self.head_tokens = load_pickle("robust_title_tokens")
        self.seg_info = load_pickle("robust_seg_info")
        self.not_found = set()

        self.total_doc_n = len(self.doc_len_dict)
        # Average document length over the collection.
        self.avdl = sum(self.doc_len_dict.values()) / len(self.doc_len_dict)
        tprint("Init PassageRanker")
Example #5
    def __init__(self):
        self.stopword = load_stopwords()
        self.stemmer = CacheStemmer()
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        tprint("Loading inv_index for robust")
        self.collection = RobustCollection()
        tprint("Done")
        self.num_candidate = 10
Example #6
def build_co_occur_from_pc_feature(data: Dict[str, List[List[str]]]) \
        -> List[Tuple[str, Counter]]:
    window_size = 10
    stemmer = CacheStemmer()
    r = []
    ticker = TimeEstimator(len(data))
    for cid, tokens_list in data.items():
        ticker.tick()
        counter = build_co_occurrence(tokens_list, window_size, stemmer)
        r.append((cid, counter))
    return r
Example #7
def build_co_occur_from_pc_feature(
        data: Dict[str, List[ScoreParagraph]]) -> List[Tuple[str, Counter]]:
    window_size = 10
    stemmer = CacheStemmer()
    r = []

    ticker = TimeEstimator(len(data))
    for cid, para_list in data.items():
        ticker.tick()
        tokens_list: List[List[str]] = [e.paragraph.tokens for e in para_list]
        counter = build_co_occurrence(tokens_list, window_size, stemmer)
        r.append((cid, counter))
    return r
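
build_co_occurrence itself is not shown in this listing. The sketch below is only a guess at its contract, inferred from the call sites in Examples #6, #7, and #9: a list of token lists, a window size, and a stemmer go in, and a Counter over co-occurring stemmed term pairs within the window comes out. The real implementation may differ in pair ordering, weighting, or stopword handling.

from collections import Counter
from typing import List

def build_co_occurrence_sketch(tokens_list: List[List[str]],
                               window_size: int,
                               stemmer) -> Counter:
    # Count (term_i, term_j) pairs whose positions are less than
    # window_size apart, after stemming. Illustrative only.
    counter = Counter()
    for tokens in tokens_list:
        stemmed = [stemmer.stem(t) for t in tokens]
        for i, t1 in enumerate(stemmed):
            for t2 in stemmed[i + 1:i + window_size]:
                counter[(t1, t2)] += 1
    return counter
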
Example #8
def explain_by_lime_idf(data: List[str],
                        get_idf) -> List[List[Tuple[str, float]]]:
    stemmer = CacheStemmer()

    def split(t):
        return t.split()

    explainer = lime_text.LimeTextExplainer(split_expression=split, bow=True)

    def evaluate_score(problems: List[str]):
        scores = []
        for problem in problems:
            score = solve(problem)
            scores.append([0, score])
        return np.array(scores)

    def solve(problem: str):
        tokens = split(problem)
        if "[SEP]" not in tokens:
            return 0
        e: QueryDoc = parse_problem(tokens)
        q_terms = lmap(stemmer.stem, e.query)
        doc_terms = lmap(stemmer.stem, e.doc)
        tf = Counter(doc_terms)
        q_terms_set = set(q_terms)
        score = 0
        for term, cnt in tf.items():
            if term in q_terms_set:
                idf = get_idf(term)
                score += log(1 + cnt) * idf
            # TODO add idf multiplication
        return score

    explains = []
    tick = TimeEstimator(len(data))
    for entry in data:
        assert isinstance(entry, str)
        exp = explainer.explain_instance(entry,
                                         evaluate_score,
                                         num_features=512)
        # l = list(exp.local_exp[1])
        # l.sort(key=get_first)
        # indices, scores = zip(*l)
        l2 = exp.as_list()
        l2.sort(key=get_second, reverse=True)
        explains.append(l2)
        tick.tick()
    return explains
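
Each input string for explain_by_lime_idf is expected to be whitespace-tokenizable and to contain a [SEP] marker that parse_problem splits into query and document; a score of 0 is returned when [SEP] is missing. A hedged usage sketch, assuming the query precedes [SEP] and using a toy IDF:

entry = "climate change [SEP] warming of the climate system is unequivocal"

def toy_get_idf(term):
    return 1.0  # toy IDF for illustration

explanations = explain_by_lime_idf([entry], toy_get_idf)
for token, weight in explanations[0][:5]:
    print(token, weight)
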
Example #9
def work(q_res_path, save_name):
    ranked_list_d = load_galago_ranked_list(q_res_path)
    window_size = 10
    stemmer = CacheStemmer()
    print(q_res_path)

    ticker = TimeEstimator(len(ranked_list_d))
    r = []
    for claim_id, ranked_list in ranked_list_d.items():
        ticker.tick()
        doc_ids = [e.doc_id for e in ranked_list]
        counter = build_co_occurrence(get_tokens_form_doc_ids(doc_ids),
                                      window_size, stemmer)
        r.append((claim_id, counter))

    save_to_pickle(r, save_name)
Example #10
    def __init__(self):
        tprint("Pipeline Init")
        self.stemmer = CacheStemmer()
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.iteration_dir = "/mnt/scratch/youngwookim/data/tlm_iter1"
        if not os.path.exists("/mnt/scratch/youngwookim/"):
            self.iteration_dir = "/mnt/nfs/work3/youngwookim/data/tlm_iter1"
        self.seg_max_seq = 256
        self.model_max_seq = 512
        self.rng = random.Random(0)
        self.masked_lm_prob = 0.15
        self.short_seq_prob = 0.1
        self.inst_per_job = 1000
        self.stopword = load_stopwords()
        self.pr = FeatureExtractor(self.seg_max_seq - 3)
        self.tf_record_maker = None
        self.code_tick = CodeTiming()
        tprint("Pipeline Init Done")
Example #11
def count_it(
        data: Dict[str, List[ScoreParagraph]]) -> List[Tuple[str, Counter]]:
    stemmer = CacheStemmer()
    r = []
    stopword = load_stopwords()

    def remove_stopwords(tokens: List[str]) -> List[str]:
        return list([t for t in tokens if t not in stopword])

    ticker = TimeEstimator(len(data))
    for cid, para_list in data.items():
        ticker.tick()
        tokens_list: List[List[str]] = [e.paragraph.tokens for e in para_list]
        list_tokens: List[List[str]] = lmap(stemmer.stem_list, tokens_list)
        list_tokens: List[List[str]] = lmap(remove_stopwords, list_tokens)

        all_cnt = Counter()
        for tokens in list_tokens:
            all_cnt.update(Counter(tokens))

        r.append((cid, all_cnt))
    return r
Example #12
def save_qdf_ex():
    ii_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
    inv_index = pickle.load(open(ii_path, "rb"))
    meta_path = os.path.join(cpath.data_path, "adhoc", "robust_meta.pickle")
    meta = pickle.load(open(meta_path, "rb"))
    stopwords = load_stopwords()
    stemmer = CacheStemmer()

    simple_posting = {}

    qdf_d = Counter()
    for term in inv_index:
        simple_posting[term] = set()
        for doc_id, _ in inv_index[term]:
            simple_posting[term].add(doc_id)

    for doc in meta:
        date, headline = meta[doc]
        tokens = nltk.tokenize.wordpunct_tokenize(headline)
        terms = set()
        for t in tokens:
            if t in stopwords:
                continue
            terms.add(stemmer.stem(t))

        for t in terms:
            # A stemmed headline term may be missing from inv_index,
            # so create its posting set on first use.
            simple_posting.setdefault(t, set()).add(doc)

    for term in inv_index:
        qdf = len(simple_posting[term])
        qdf_d[term] = qdf

    save_path = os.path.join(cpath.data_path, "adhoc", "robust_qdf_ex.pickle")
    pickle.dump(qdf_d, open(save_path, "wb"))
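
The resulting qdf_d maps each term to the number of documents that contain it. One plausible way such a table could back the get_idf callables used in Examples #3 and #8 is a smoothed inverse document frequency; this is a sketch of that convention, not necessarily how the project computes it:

import math

def make_get_idf(qdf_d, total_doc_n):
    def get_idf(term):
        df = qdf_d[term]  # Counter returns 0 for unseen terms
        return math.log((total_doc_n + 1) / (df + 1))
    return get_idf
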