Example #1
    def generate_instances(self, job_id, data_id_man):
        q_id = self.job_id_to_q_id[job_id]
        query_text = self.query_d[int(q_id)]
        query_tokens = self.tokenizer.tokenize(query_text)
        # restrict to the top 1000 entries of the ranked list
        ranked_list = self.ranked_list[q_id][:1000]
        doc_ids = [e.doc_id for e in ranked_list]
        tprint("Loading documents start")
        docs_d: Dict[str, List[List[str]]] = load_multiple(BertTokenizedCluewebDoc, doc_ids, True)
        tprint("Loading documents done")
        avail_seq_length = self.max_seq_length - len(query_tokens) - 3  # reserve 3 slots, presumably [CLS] and two [SEP] tokens

        label_dummy = 0
        not_found = 0
        for doc_id in doc_ids:
            try:
                doc: List[List[str]] = docs_d[doc_id]
                passages: Iterable[List[str]] = enum_passages(doc, avail_seq_length)

                for passage_idx, p in enumerate(passages):
                    if passage_idx > 9:  # keep at most 10 passages per document
                        break
                    data_id = data_id_man.assign({
                        'query_id': q_id,
                        'doc_id': doc_id,
                        'passage_idx': passage_idx
                    })
                    yield Instance(query_tokens, p, label_dummy, data_id)
            except KeyError:
                not_found += 1
        print("{} of {} docs not found".format(not_found, len(doc_ids)))
Example #2
def get_feature_binary_model(claim_id,
                             perspective_id,
                             claim_text,
                             perspective_text,
                             ci: DynRankedListInterface,
                             is_mention_fn: Callable[[Counter[str], str, str], bool],
                             ) -> Tuple[Counter, int]:

    def is_mention(doc: Counter) -> bool:
        return is_mention_fn(doc, claim_text, perspective_text)

    print(claim_id, perspective_id)
    ranked_docs: List[SimpleRankedListEntry] = ci.query(claim_id, perspective_id, claim_text, perspective_text)
    ranked_docs = ranked_docs[:100]
    print("{} docs in ranked list".format(len(ranked_docs)))

    doc_id_list: List[str] = lmap(get_doc_id, ranked_docs)

    tf_d = load_multiple(CluewebDocTF, doc_id_list, True)
    not_found = [idx for idx, doc_id in enumerate(doc_id_list) if doc_id not in tf_d]
    print("{} of {} docs not found".format(len(not_found), len(doc_id_list)))

    ranked_docs_tf = tf_d.values()
    mentioned_docs: List[Counter] = lfilter(is_mention, ranked_docs_tf)
    print("Found doc", len(tf_d), "mentioned doc", len(mentioned_docs))

    docs_rel_freq: List[Counter] = lmap(div_by_doc_len, mentioned_docs)
    num_doc: int = len(docs_rel_freq)
    p_w_m: Counter = average_tf_over_docs(docs_rel_freq, num_doc)

    return p_w_m, num_doc
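
The last three lines turn each mentioned document's term frequencies into relative frequencies and average them into p_w_m, an estimate of P(w | mention). div_by_doc_len and average_tf_over_docs are not shown on this page; the following is a minimal self-contained sketch of the computation they presumably perform, not the project's actual code.

from collections import Counter
from typing import List

def div_by_doc_len(tf: Counter) -> Counter:
    # Normalize raw term counts by document length (total token count).
    doc_len = sum(tf.values()) or 1
    return Counter({term: cnt / doc_len for term, cnt in tf.items()})

def average_tf_over_docs(docs_rel_freq: List[Counter], num_doc: int) -> Counter:
    # Sum the per-document relative frequencies term by term, then divide
    # by the number of documents to get an average distribution.
    acc: Counter = Counter()
    for rel_freq in docs_rel_freq:
        acc.update(rel_freq)
    return Counter({term: val / num_doc for term, val in acc.items()})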
Example #3
def generate(pos_doc_ids, all_doc_list, max_seq_length) -> List[Instance]:
    # load list of documents
    # make list of negative documents.
    # remove duplicates.
    seq_length = max_seq_length - 2
    pos_doc_id_set = set(pos_doc_ids)
    neg_docs_ids = [d for d in all_doc_list if d not in pos_doc_id_set]
    pos_docs: List[List[List[str]]] = load_multiple(BertTokenizedCluewebDoc,
                                                    pos_doc_ids, True)
    hashes = lmap(doc_hash, pos_docs)
    duplicate_indice = get_duplicate_list(hashes)
    pos_docs = [doc for i, doc in enumerate(pos_docs) if i not in duplicate_indice]
    neg_docs: List[List[List[str]]] = load_multiple_divided(
        BertTokenizedCluewebDoc, neg_docs_ids, True)

    data_id_man = DataIDManager()

    def enum_instances(doc_list: List[List[List[str]]],
                       label: int) -> Iterator[Instance]:
        for d in doc_list:
            for passage in enum_passages(d, seq_length):
                yield Instance(passage, data_id_man.assign([]), label)

    pos_insts = list(enum_instances(pos_docs, 1))
    neg_insts = list(enum_instances(neg_docs, 0))
    all_insts = pos_insts + neg_insts
    print("{} instances".format(len(all_insts)))
    random.shuffle(all_insts)
    return all_insts
Example #4
def remove_duplicate(doc_id_list: List[str]) -> List[str]:
    docs_d: Dict[str, List[str]] = load_multiple(TokenizedCluewebDoc,
                                                 doc_id_list, True)
    hashes = lmap(doc_hash, [docs_d.get(doc_id) for doc_id in doc_id_list])
    duplicate_indice = get_duplicate_list(hashes)
    non_duplicate = [
        doc_id for i, doc_id in enumerate(doc_id_list)
        if i not in duplicate_indice
    ]
    return non_duplicate
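
Examples #3 and #4 deduplicate by hashing document contents and asking get_duplicate_list for the indices to drop. Neither doc_hash nor get_duplicate_list is shown on this page; below is a minimal sketch of the index computation, under the assumption that the first occurrence of each hash value is kept and every later occurrence is reported as a duplicate. The None check is also an assumption, meant to keep documents that failed to load out of the duplicate grouping.

from typing import Hashable, List, Optional

def get_duplicate_list(hashes: List[Optional[Hashable]]) -> List[int]:
    # Report the index of every element whose hash was already seen earlier.
    seen = set()
    duplicates: List[int] = []
    for idx, h in enumerate(hashes):
        if h is None:
            continue
        if h in seen:
            duplicates.append(idx)
        else:
            seen.add(h)
    return duplicates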
Example #5
def main():
    ranked_list_path = sys.argv[1]
    save_path = sys.argv[2]
    rl: List[TrecRankedListEntry] = load_ranked_list(ranked_list_path)
    doc_ids = [e.doc_id for e in rl]
    docs_d: Dict[str, List[str]] = {}
    idx = 0
    target_len = 10000
    step = 100

    while idx < target_len:
        print(idx)
        doc_ids_window = doc_ids[idx:idx+step]
        docs_d.update(load_multiple(RawCluewebDoc, doc_ids_window, True))
        idx += step
    print("{} docs_loaded".format(len(docs_d)))
    with open(save_path, "w") as f:
        json.dump(docs_d, f)
Example #6
def main():
    doc_ids = list(set(load_doc_ids()))
    print("num docs", len(doc_ids))
    save_dir = os.path.join(output_path, "pc_docs_html")

    exist_or_mkdir(save_dir)
    k = 0
    step = 1000
    while k < len(doc_ids):
        print(k, k + step)
        cur_doc_ids = doc_ids[k:k + step]
        docs = load_multiple(RawCluewebDoc, cur_doc_ids, True)
        for doc_id in cur_doc_ids:
            try:
                doc_html = docs[doc_id]
                save_path = os.path.join(save_dir, doc_id + ".html")
                open(save_path, "w").write(doc_html)
            except KeyError:
                pass
        k += step
Example #7
def load_tf_multiple(doc_ids):
    return load_multiple(CluewebDocTF, doc_ids)
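
Taken together, the calls above suggest that load_multiple takes a storage class, a list of document ids, and an optional flag, and returns a dict from id to the loaded payload, with unresolved ids simply absent (Examples #1, #2, and #4 handle the missing keys explicitly). Examples #5 and #6 additionally load in fixed-size chunks; a hypothetical, self-contained wrapper capturing that batching pattern is sketched below. load_in_chunks and its chunk_size parameter are illustrative names, not part of the codebase.

from typing import Callable, Dict, List, TypeVar

T = TypeVar("T")

def load_in_chunks(load_fn: Callable[[List[str]], Dict[str, T]],
                   doc_ids: List[str],
                   chunk_size: int = 1000) -> Dict[str, T]:
    # Load documents chunk by chunk and merge the partial dicts;
    # ids that could not be loaded are simply missing from the result.
    out: Dict[str, T] = {}
    for start in range(0, len(doc_ids), chunk_size):
        out.update(load_fn(doc_ids[start:start + chunk_size]))
    return out

# Usage with the function above: load_in_chunks(load_tf_multiple, doc_ids)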