Example #1
def get_docs_from_ranked_list(ranked_list: List[SimpleRankedListEntry]) -> List[List[str]]:
    doc_ids = lmap(lambda x: x.doc_id, ranked_list)
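    # Batch-preload the docs from the DB before loading them one by one below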
    preload_man.preload(TokenizedCluewebDoc, doc_ids)

    def get_tokens(doc_id) -> List[str]:
        return load(TokenizedCluewebDoc, doc_id)

    # tokens_d: Dict[str, List[str]] = load_multiple(TokenizedCluewebDoc, doc_ids, True)

    l: List[List[str]] = []
    cnt_not_found = 0
    for doc_id in doc_ids:
        try:
            l.append(get_tokens(doc_id))
            print(".", end="")
        except KeyError:
            cnt_not_found += 1
    # l = list(tokens_d.values())
    print("done")
    if cnt_not_found:
        print()
        print("not found : ", cnt_not_found)
    return l
Example #2
    def select_paragraph_from_datapoint(x: TPDataPoint) -> ParagraphFeature:
        try:
            ranked_docs: List[SimpleRankedListEntry] = ci.fetch_from_q_res_id(dp_id_to_q_res_id_fn(x.id))
            ranked_docs = ranked_docs[:100]
        except KeyError:
            ranked_docs = []

        paragraph_scorer_local: Callable[[Paragraph], ScoreParagraph] = paragraph_scorer_factory(x)
        #  prefetch tokens and bert tokens
        doc_ids = lmap(lambda d: d.doc_id, ranked_docs)
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

        def get_best_paragraph_from_doc(doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
            paragraph_list = paragraph_iterator(doc)
            score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
            score_paragraph.sort(key=lambda p: p.score, reverse=True)
            return score_paragraph[:1]

        def get_all_paragraph_from_doc(doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
            paragraph_list = paragraph_iterator(doc)
            score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
            return score_paragraph

        if option.para_per_doc == ONE_PARA_PER_DOC:
            get_paragraphs = get_best_paragraph_from_doc
        else:
            get_paragraphs = get_all_paragraph_from_doc

        candidate_paragraph: List[ScoreParagraph] = list(flatten(lmap(get_paragraphs, ranked_docs)))
        candidate_paragraph.sort(key=lambda p: p.score, reverse=True)
        candidate_paragraph = remove_duplicate(candidate_paragraph)

        return ParagraphFeature(datapoint=x,
                                feature=candidate_paragraph[:n_passages])
Example #3
def qk_candidate_gen(q_res_path: str, doc_score_path, split,
                     config) -> List[Tuple[QCKQuery, List[KDP]]]:
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
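        # entries are presumably (doc_id, score) pairs; keep the top_n doc ids by score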
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)

        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
Example #4
def preload_docs(ranked_list, claims, top_n):
    def get_doc_ids(claim: Dict):
        # Look up the ranked list (q_res) for this claim and take its top_n doc ids
        q_res: List[SimpleRankedListEntry] = ranked_list[str(claim['cId'])]
        return [entry.doc_id for entry in q_res[:top_n]]

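    # Collect and deduplicate the top_n doc ids across all claims before preloading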
    all_doc_ids: Set[str] = set(flatten(lmap(get_doc_ids, claims)))
    print(f"total of {len(all_doc_ids)} docs")
    print("Accessing DB")
    # Fetch the docs from the DB
    preload_man.preload(TokenizedCluewebDoc, all_doc_ids)
Example #5
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

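        # Score a passage by the average topic-vs-background log-odds of its
        # non-stopword (stemmed) tokens; stopwords contribute zero.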
        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with:")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Example #6
def write_tfrecord(ranked_list_d: RankedListDict, queries: List[Query],
                   q_rels: Dict[str, List[str]], save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0

    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)

        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc,
                                                    doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List,
                                  List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
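                    # Binary relevance label: the doc is relevant if it appears in the qrels for this query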
                    label = doc_entry.doc_id in q_rels[query.qid]

                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer,
                                                      max_seq_length,
                                                      input_tokens,
                                                      segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)

                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError:
                print("doc {} not found".format(doc_entry.doc_id))

    return data_info
Example #7
def preload_docs(ranked_list: Dict[str, List[SimpleRankedListEntry]], top_n):
    all_doc_ids = set()
    for entries in ranked_list.values():
        for entry in entries[:top_n]:
            all_doc_ids.add(entry.doc_id)

    tprint(f"total of {len(all_doc_ids)} docs")
    tprint("Accessing DB")
    # Fetch the docs from the DB in fixed-size blocks rather than all at once
    doc_ids_list = list(all_doc_ids)
    block_size = 1000
    for st in range(0, len(doc_ids_list), block_size):
        print(st)
        preload_man.preload(TokenizedCluewebDoc, doc_ids_list[st:st + block_size])

    #preload_man.preload(TokenizedCluewebDoc, all_doc_ids)
    tprint("Done")
Example #8
    def select_paragraph_from_datapoint(
            x: PerspectiveCandidate) -> ParagraphClaimPersFeature:
        ranked_docs: List[SimpleRankedListEntry] = ci.fetch(x.cid, x.pid)
        ranked_docs = ranked_docs[:100]
        cp_tokens = nltk.word_tokenize(x.claim_text) + nltk.word_tokenize(
            x.p_text)
        cp_tokens = lmap(lambda t: t.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)

        #  prefetch tokens and bert tokens
        doc_ids = lmap(lambda d: d.doc_id, ranked_docs)
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

        def paragraph_scorer_local(p):
            return paragraph_scorer(p, cp_tokens)

        def get_best_paragraph_from_doc(
                doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
            paragraph_list = paragraph_iterator(doc)
            score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
            score_paragraph.sort(key=lambda p: p.score, reverse=True)
            return score_paragraph[:1]

        def get_all_paragraph_from_doc(
                doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
            paragraph_list = paragraph_iterator(doc)
            score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
            return score_paragraph

        if option:
            get_paragraphs = get_best_paragraph_from_doc
        else:
            get_paragraphs = get_all_paragraph_from_doc

        candidate_paragraph: Iterable[ScoreParagraph] = flatten(
            lmap(get_paragraphs, ranked_docs))
        return ParagraphClaimPersFeature(claim_pers=x,
                                         feature=list(candidate_paragraph))
Example #9
    def get_instances(self, cid, data_id_manager, entries):
        doc_ids = lmap(lambda x: x.doc_id, entries)
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        n_doc_not_found = 0
        for entry in entries[:self.top_n]:
            try:
                tokens: List[List[str]] = load(BertTokenizedCluewebDoc,
                                               entry.doc_id)
                # Yield one Instance per (sentence, perspective id) pair
                for sent_idx, sent in enumerate(tokens[:self.num_sent]):
                    for pid in self.pid_dict[int(cid)]:
                        info = {
                            'cid': cid,
                            'pid': pid,
                            'doc_id': entry.doc_id,
                            'sent_idx': sent_idx
                        }
                        yield Instance(pid, sent, data_id_manager.assign(info))
            except KeyError:
                n_doc_not_found += 1
        if n_doc_not_found:
            print("{} of {} docs not found".format(n_doc_not_found,
                                                   len(doc_ids)))
Example #10
    def generate(claim_lm: ClaimLM, ranked_list: List[SimpleRankedListEntry]):
        claim_text = claim_lm.claim
        claim_tokens = bert_tokenizer.tokenize(claim_text)
        claim_token_len = len(claim_tokens)

        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        doc_ids = lmap(lambda x: x.doc_id, ranked_list[:top_n])
        print("loading docs")
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

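        # Leave room in each window for the claim tokens plus, presumably, the
        # three BERT special tokens ([CLS] and two [SEP]).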
        window_size = max_seq_length - claim_token_len - 3
        step_size = max_seq_length - 112
        enum_paragraph = enum_paragraph_functor(step_size, window_size)

        def get_record(tokens):
            scores, masks = get_target_labels(tokens, log_odd, stopwords,
                                              fail_logger)
            return Record(claim_tokens, tokens, scores, masks)

        tokens_list: List[List[str]] = []
        not_found = 0
        for doc_id in doc_ids:
            try:
                tokens: List[str] = list(
                    flatten(load(BertTokenizedCluewebDoc, doc_id)))
                tokens_list.append(tokens)
            except KeyError:
                not_found += 1

        print("{} of {} not found".format(not_found, len(doc_ids)))
        paragraph_list: Iterable[List[str]] = enum_paragraph(tokens_list)
        records: List[Record] = lmap(get_record, paragraph_list)

        return records
Example #11
def preload_docs(doc_ids):
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
Example #12
def preload_tf(doc_ids):
    preload_man.preload(CluewebDocTF, doc_ids)
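
All of the examples above follow the same pattern: preload a whole batch of document ids in one call, then load each document individually and skip the ones that are missing. The sketch below is a minimal, hypothetical helper (the name load_available_docs is not from the codebase); it reuses preload_man, load, and TokenizedCluewebDoc exactly as the examples do and assumes, as they do, that load raises KeyError for ids that cannot be found.

from typing import List

def load_available_docs(doc_ids: List[str]) -> List[List[str]]:
    # Warm the store for the whole batch first, so the per-doc load() calls below are cheap.
    preload_man.preload(TokenizedCluewebDoc, doc_ids)

    docs: List[List[str]] = []
    not_found = 0
    for doc_id in doc_ids:
        try:
            # As in the examples, load() is expected to raise KeyError for missing documents.
            docs.append(load(TokenizedCluewebDoc, doc_id))
        except KeyError:
            not_found += 1
    if not_found:
        print("{} of {} docs not found".format(not_found, len(doc_ids)))
    return docs

Missing documents are skipped rather than failing the whole batch, which matches how every example above handles KeyError.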