예제 #1
0
def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates per query for prediction.

    For every query id returned by ``load_robust04_title_query()``, the first
    100 entries of that query's ranked list (from ``load_bm25_best()``) are
    taken. Each of those documents is split into passages with
    ``enum_passage(tokens, max_seq_length)``. The first passage of a document
    is always kept; every later passage is kept with probability 0.1.

    The earlier version also built a qrels path, loaded a tokenizer and
    tokenized every query, but never used the results; that dead work has
    been removed.

    Args:
        max_seq_length: Forwarded to ``enum_passage`` as its second argument.

    Returns:
        A dict mapping each query id to the list of
        ``QCKCandidateWToken(doc_id, "", passage)`` objects that were kept.

    Note:
        Lookups use plain subscripting, so a query id missing from the ranked
        lists or a doc id missing from the token table propagates whatever
        error the mapping raises (``KeyError`` for a dict). Sampling uses the
        module-level ``random`` state, so results vary unless it is seeded.
    """
    galago_rank = load_bm25_best()
    # NOTE(review): the meaning of the argument 4 is not visible here.
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in queries:
        doc_ids = [e.doc_id for e in galago_rank[query_id][:100]]

        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens,
                                                       max_seq_length)):
                # Short-circuit keeps RNG consumption identical to before:
                # no random draw is made for the first passage.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))

        out_d[query_id] = candidate
    return out_d
예제 #2
0
 def __init__(self, query_type="desc", neg_k=1000,
              qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
     """Load judgements, queries, a tokenizer and ranked lists.

     Args:
         query_type: Passed to ``load_robust_04_query()``.
         neg_k: Stored as ``self.neg_k``; not otherwise used here.
         qrel_path: Path handed to ``load_qrels_structured()``. Defaults to
             the location that used to be hard-coded, so existing callers
             are unaffected; pass another path to run on a different machine.
     """
     self.judgement = load_qrels_structured(qrel_path)
     self.queries = load_robust_04_query(query_type)
     self.tokenizer = get_tokenizer()
     self.galago_rank = load_bm25_best()
     self.neg_k = neg_k
예제 #3
0
 def __init__(self, encoder, max_seq_length, top_k=100, query_type="title"):
     """Set up token data, queries, ranked lists and a tokenizer.

     Args:
         encoder: Stored as ``self.encoder``.
         max_seq_length: Stored as ``self.max_seq_length``.
         top_k: Stored as ``self.top_k``.
         query_type: Passed to ``load_robust_04_query()``.
     """
     # The method-based loader runs first, exactly as before, so it sees
     # the instance in the same (still unconfigured) state.
     self.data = self.load_tokens_from_pickles()

     # Plain configuration values.
     self.encoder = encoder
     self.max_seq_length = max_seq_length
     self.top_k = top_k

     # Remaining loaders, in their original relative order.
     self.queries = load_robust_04_query(query_type)
     self.galago_rank = load_bm25_best()
     self.tokenizer = get_tokenizer()
예제 #4
0
 def __init__(self, encoder, max_seq_length, query_type="title",
              qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
     """Load token data, judgements, queries, a tokenizer and ranked lists.

     Args:
         encoder: Stored as ``self.encoder``.
         max_seq_length: Stored as ``self.max_seq_length``.
         query_type: Passed to ``load_robust_04_query()``.
         qrel_path: Path handed to ``load_qrels_structured()``. Defaults to
             the location that used to be hard-coded, so existing callers
             are unaffected; pass another path to run on a different machine.
     """
     self.data = self.load_tokens()
     self.judgement = load_qrels_structured(qrel_path)
     self.max_seq_length = max_seq_length
     self.queries = load_robust_04_query(query_type)
     self.encoder = encoder
     self.tokenizer = get_tokenizer()
     self.galago_rank = load_bm25_best()
예제 #5
0
 def __init__(self, doc_max_length, query_type="title", neg_k=1000,
              pos_only=True,
              qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
     """Load token data, judgements, queries, a tokenizer and ranked lists.

     Args:
         doc_max_length: Stored as ``self.doc_max_length``.
         query_type: Passed to ``load_robust_04_query()``.
         neg_k: Stored as ``self.neg_k``; not otherwise used here.
         pos_only: Stored as ``self.pos_only``; not otherwise used here.
         qrel_path: Path handed to ``load_qrels_structured()``. Defaults to
             the location that used to be hard-coded, so existing callers
             are unaffected; pass another path to run on a different machine.
     """
     self.data = self.load_tokens()
     self.judgement = load_qrels_structured(qrel_path)
     self.doc_max_length = doc_max_length
     self.queries = load_robust_04_query(query_type)
     self.tokenizer = get_tokenizer()
     self.galago_rank = load_bm25_best()
     self.neg_k = neg_k
     self.pos_only = pos_only
예제 #6
0
    def __init__(self, encoder, max_seq_length, query_type,
                 target_selection_fn: Callable[[str, str, List], List[int]],
                 qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
        """Load token data, judgements, queries, a tokenizer and ranked lists.

        Args:
            encoder: Stored as ``self.encoder``.
            max_seq_length: Stored as ``self.max_seq_length``.
            query_type: Passed to ``load_robust_04_query()``.
            target_selection_fn: Callable stored as
                ``self.target_selection_fn``; it is not invoked here.
            qrel_path: Path handed to ``load_qrels_structured()``. Defaults
                to the location that used to be hard-coded, so existing
                callers are unaffected; pass another path to run on a
                different machine.
        """
        self.data = self.load_tokens()
        self.judgement = load_qrels_structured(qrel_path)
        self.max_seq_length = max_seq_length
        self.queries = load_robust_04_query(query_type)
        self.encoder = encoder
        self.tokenizer = get_tokenizer()
        self.galago_rank = load_bm25_best()

        self.target_selection_fn: Callable[[str, str, List],
                                           List[int]] = target_selection_fn
예제 #7
0
def load_candidate_all_passage(
        max_seq_length,
        max_passage_per_doc=10) -> Dict[str, List[QCKCandidateWToken]]:
    """Collect passage candidates for every ranked document of every query.

    The ranked lists from ``load_bm25_best()`` are reduced to lists of doc
    ids, the prediction token table is loaded, and both are handed to
    ``load_candidate_all_passage_inner`` together with the two limits.

    Args:
        max_seq_length: Forwarded unchanged to the inner loader.
        max_passage_per_doc: Forwarded unchanged to the inner loader.

    Returns:
        Whatever ``load_candidate_all_passage_inner`` returns.
    """
    ranked_per_query: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def doc_ids_of(entries: List[SimpleRankedListEntry]):
        # Keep only the document id of each ranked entry, preserving order.
        return [entry.doc_id for entry in entries]

    doc_ids_per_query: Dict[str, List[str]] = dict_value_map(
        doc_ids_of, ranked_per_query)
    tokens_per_doc: Dict[str, List[str]] = load_robust_tokens_for_predict()
    return load_candidate_all_passage_inner(
        doc_ids_per_query,
        tokens_per_doc,
        max_seq_length,
        max_passage_per_doc,
    )
예제 #8
0
 def __init__(self,
              encoder,
              max_seq_length,
              score_d,
              query_type="title",
              neg_k=1000,
              qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
     """Load token data, judgements, queries, a tokenizer and ranked lists.

     Args:
         encoder: Stored as ``self.encoder``.
         max_seq_length: Stored as ``self.max_seq_length``.
         score_d: Stored as ``self.score_d`` (annotated as a dict from str
             to a list of floats).
         query_type: Passed to ``load_robust_04_query()``.
         neg_k: Stored as ``self.neg_k``; not otherwise used here.
         qrel_path: Path handed to ``load_qrels_structured()``. Defaults to
             the location that used to be hard-coded, so existing callers
             are unaffected; pass another path to run on a different machine.
     """
     self.data = self.load_tokens()
     self.score_d: Dict[str, List[float]] = score_d
     self.judgement = load_qrels_structured(qrel_path)
     self.max_seq_length = max_seq_length
     self.queries = load_robust_04_query(query_type)
     self.encoder = encoder
     self.tokenizer = get_tokenizer()
     self.galago_rank = load_bm25_best()
     self.neg_k = neg_k
     # Fixed number of segments per document; not configurable by callers.
     self.n_seg_per_doc = 4
예제 #9
0
def all_doc_ids_of_interest() -> List[str]:
    """Return the sorted union of judged and top-ranked document ids.

    Two sources are merged: every doc id that appears in the qrels returned
    by ``load_robust_qrel()``, and index 0 of the first 1000 entries of each
    ranked list from ``load_bm25_best()`` after sorting that list by index 1.

    Returns:
        A sorted list of the distinct document ids.
    """
    doc_ids = set()

    # Every judged document, regardless of its label.
    for judged in load_robust_qrel().values():
        doc_ids.update(judged.keys())

    cutoff = 1000
    for entries in load_bm25_best().values():
        # In-place sort on index 1 of each entry (presumably the rank; confirm
        # against SimpleRankedListEntry), then take the leading entries.
        entries.sort(key=lambda entry: entry[1])
        doc_ids.update(entry[0] for entry in entries[:cutoff])

    return sorted(doc_ids)
예제 #10
0
파일: var_length.py 프로젝트: clover3/Chair
    def __init__(self,
                 encoder,
                 max_seq_length_per_inst,
                 num_doc_per_inst,
                 num_seg_per_inst,
                 query_type="title",
                 neg_k=1000,
                 qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
        """Load token data, judgements, queries, a tokenizer and ranked lists.

        Args:
            encoder: Stored as ``self.all_segment_encoder``.
            max_seq_length_per_inst: Stored as ``self.max_seq_length``.
            num_doc_per_inst: Stored as ``self.num_doc_per_inst``.
            num_seg_per_inst: Stored as ``self.num_seg_per_inst``.
            query_type: Passed to ``load_robust_04_query()``.
            neg_k: Stored as ``self.neg_k``; not otherwise used here.
            qrel_path: Path handed to ``load_qrels_structured()``. Defaults
                to the location that used to be hard-coded, so existing
                callers are unaffected; pass another path to run on a
                different machine.
        """
        self.data = self.load_tokens()
        self.judgement = load_qrels_structured(qrel_path)
        self.max_seq_length = max_seq_length_per_inst
        self.queries = load_robust_04_query(query_type)
        self.num_doc_per_inst = num_doc_per_inst
        self.num_seg_per_inst = num_seg_per_inst

        self.all_segment_encoder = encoder
        self.tokenizer = get_tokenizer()
        self.galago_rank = load_bm25_best()
        self.neg_k = neg_k
예제 #11
0
def load_candidate_head_as_doc(
        doc_len=400) -> Dict[str, List[QCKCandidateWToken]]:
    """Build one truncated-document candidate per top-ranked document.

    For each query's ranked list from ``load_bm25_best()``, the first 100
    entries are turned into ``QCKCandidateWToken(doc_id, "", tokens)`` where
    ``tokens`` is the first ``doc_len`` tokens of that document.

    Progress information is printed to stdout while loading.

    Args:
        doc_len: Number of leading tokens kept from each document.

    Returns:
        The result of ``dict_value_map`` applied to the ranked lists, i.e.
        the per-query candidate lists.
    """
    ranked_per_query: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()
    print("Num queries : ", len(ranked_per_query))

    print("Loading robust collection tokens...", end="")
    tokens_per_doc: Dict[str, List[str]] = load_robust_tokens_for_predict()
    print("Done")
    print("Total of {} docs".format(len(tokens_per_doc)))

    head_count = 100

    def to_candidates(
            entries: List[SimpleRankedListEntry]
    ) -> List[QCKCandidateWToken]:
        candidates = []
        for entry in entries[:head_count]:
            doc_id = entry.doc_id
            head_tokens = tokens_per_doc[doc_id][:doc_len]
            candidates.append(QCKCandidateWToken(doc_id, "", head_tokens))
        return candidates

    return dict_value_map(to_candidates, ranked_per_query)
예제 #12
0
def main():
    """Pickle the raw documents referenced by the top of each ranked list.

    Index 0 of the first 1000 entries of every ranked list from
    ``load_bm25_best()`` (after sorting each list by index 1) is collected.
    The matching documents are read from ``load_robust(...)`` and saved under
    the pickle name ``robust04_docs_predict``. Ids absent from the loaded
    collection are reported on stdout and skipped.
    """
    cutoff = 1000
    wanted_ids = set()
    for entries in load_bm25_best().values():
        # In-place sort on index 1 of each entry before taking the head.
        entries.sort(key=lambda entry: entry[1])
        wanted_ids.update(entry[0] for entry in entries[:cutoff])

    corpus = load_robust("/mnt/nfs/work3/youngwookim/data/robust04")

    selected = {}
    for doc_id in wanted_ids:
        try:
            selected[doc_id] = corpus[doc_id]
        except KeyError:
            # Best effort: report and continue with the remaining documents.
            print(doc_id, 'not found')

    save_to_pickle(selected, "robust04_docs_predict")
예제 #13
0
def load_candidate_d():
    """Pair each query's first 100 ranked doc ids with their stored content.

    Ranked lists come from ``load_bm25_best()`` and document content from the
    pickle named ``robust04_docs_predict``.

    Returns:
        A dict mapping query id to a list of ``(doc_id, content)`` tuples, in
        ranked-list order.
    """
    ranked_per_query: Dict[str, List[SimpleRankedListEntry]] = load_bm25_best()

    def doc_ids_of(entries: List[SimpleRankedListEntry]):
        return [entry.doc_id for entry in entries]

    doc_ids_per_query: Dict[str, List[str]] = dict_value_map(
        doc_ids_of, ranked_per_query)
    stored_docs = load_from_pickle("robust04_docs_predict")

    limit = 100
    return {
        query_id: [(doc_id, stored_docs[doc_id]) for doc_id in doc_ids[:limit]]
        for query_id, doc_ids in doc_ids_per_query.items()
    }
예제 #14
0
 def __init__(self,
              encoder,
              max_seq_length,
              scores,
              query_type="title",
              target_selection="best",
              qrel_path="/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"):
     """Load resources and pick the target-index selection strategy.

     Args:
         encoder: Stored as ``self.encoder``.
         max_seq_length: Stored as ``self.max_seq_length``.
         scores: Stored as ``self.scores`` (annotated as a dict keyed by a
             ``(str, str, int)`` tuple with float values).
         query_type: Passed to ``load_robust_04_query()``.
         target_selection: Name of the strategy; one of ``'best'``,
             ``'all'``, ``'first_and_best'``, ``'best_or_over_09'`` or
             ``'random_over_09'``.
         qrel_path: Path handed to ``load_qrels_structured()``. Defaults to
             the location that used to be hard-coded, so existing callers
             are unaffected; pass another path to run on a different machine.

     Raises:
         KeyError: If ``target_selection`` is not one of the names above.
     """
     self.data = self.load_tokens()
     self.judgement = load_qrels_structured(qrel_path)
     self.max_seq_length = max_seq_length
     self.queries = load_robust_04_query(query_type)
     self.encoder = encoder
     self.tokenizer = get_tokenizer()
     self.galago_rank = load_bm25_best()
     self.scores: Dict[Tuple[str, str, int], float] = scores
     # Dispatch table from strategy name to implementation.
     # NOTE(review): the annotation says the callable takes no arguments;
     # confirm against the get_target_indices_* signatures.
     self.get_target_indices: Callable[[], List[int]] = {
         'best': get_target_indices_get_best,
         'all': get_target_indices_all,
         'first_and_best': get_target_indices_first_and_best,
         'best_or_over_09': get_target_indices_best_or_over_09,
         'random_over_09': get_target_indices_random_over_09
     }[target_selection]
예제 #15
0
def get_candidate_all_passage_w_samping(
        max_seq_length=256, neg_k=1000) -> Dict[str, List[QCKCandidateWToken]]:
    """Build sampled passage candidates for every judged query.

    For each query id in the qrels, the candidate documents are the union of
    the judged doc ids and the doc ids of the first ``neg_k`` entries of that
    query's ranked list (from ``load_bm25_best()``). Each document is split
    into passages with ``enum_passage(tokens, max_seq_length)``. The first
    passage of a document is always kept; every later passage is kept with
    probability 0.1.

    The earlier version had a guard ``if query_id not in judgement`` inside a
    loop over ``judgement.keys()``, which could never fire. It also loaded the
    title queries and a tokenizer and tokenized each query without using the
    result. Both have been removed.

    Args:
        max_seq_length: Forwarded to ``enum_passage`` as its second argument.
        neg_k: Number of leading ranked-list entries added to the judged
            documents for each query.

    Returns:
        A dict mapping each judged query id to the list of
        ``QCKCandidateWToken(doc_id, "", passage)`` objects that were kept.

    Note:
        Lookups use plain subscripting, so a query id missing from the ranked
        lists or a doc id missing from the token table propagates whatever
        error the mapping raises (``KeyError`` for a dict). Documents are
        visited in set iteration order and sampling uses the module-level
        ``random`` state, so output order and content vary between runs.
    """
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_train()
    # NOTE(review): the meaning of the argument 4 is not visible here.
    tokens_d.update(load_robust_tokens_for_predict(4))
    judgement: Dict[str, Dict] = load_qrels_structured(qrel_path)
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id, judge_entries in judgement.items():
        # Judged documents plus the head of the ranked list.
        doc_ids = set(judge_entries.keys())
        doc_ids.update(e.doc_id for e in galago_rank[query_id][:neg_k])

        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens,
                                                       max_seq_length)):
                # Short-circuit keeps RNG consumption identical to before:
                # no random draw is made for the first passage.
                if idx == 0 or random.random() < 0.1:
                    candidate.append(QCKCandidateWToken(doc_id, "", passage))

        out_d[query_id] = candidate
    return out_d
예제 #16
0
 def __init__(self, top_k=150):
     """Initialise the base class, then record the cutoff and ranked lists.

     Args:
         top_k: Stored as ``self.top_k``; not otherwise used here.
     """
     super(RobustPreprocessPredict, self).__init__()
     self.top_k = top_k
     self.galago_rank = load_bm25_best()