Example #1
    def __init__(self, max_seq_length,
                 out_dir):
        self.query_group: List[List[QueryID]] = load_query_group("train")
        self.seg_resource_loader = SegmentResourceLoader(job_man_dir, "train")
        self.max_seq_length = max_seq_length
        self.out_dir = out_dir
        self.info_dir = self.out_dir + "_info"
        exist_or_mkdir(self.info_dir)
Example #2
    def __init__(self, split):
        query_group: List[List[QueryID]] = load_query_group(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        self.queires = dict(load_queries(split))
        self.query_group = query_group
        self.tokenizer = get_tokenizer()
        self.qrel = qrel
Example #3
def run_tokenize_jobs_for_prediction_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def factory(out_dir):
        return TokenizeDocTitleBodyWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group), "MSMARCO_{}_title_body_tokens".format(split), factory)
    runner.start()
Example #4
def run_tokenize_jobs_for_train_split(split):
    query_group = load_query_group(split)
    candidate_docs = load_candidate_doc_list_10(split)

    def factory(out_dir):
        return SentLevelTokenizeWorker(split, query_group, candidate_docs,
                                       out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_sent_tokens".format(split), factory)
    runner.start()
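The runners in Examples #3 and #4 share one convention: load_query_group(split) returns a List[List[QueryID]] with one inner list per job, and JobRunnerS receives len(query_group) as the job count, so each job handles exactly one group. The sketch below only illustrates that convention; the class name, the work(job_id) method, and the dict-typed candidate_docs are assumptions for illustration, not the repository's actual TokenizeDocTitleBodyWorker or SentLevelTokenizeWorker.

from typing import Dict, List

QueryID = str  # assumption: query ids behave like plain strings


class ExampleGroupWorker:
    """Hypothetical worker: one job processes one query group."""

    def __init__(self, split: str, query_group: List[List[QueryID]],
                 candidate_docs: Dict[QueryID, List[str]], out_dir: str):
        self.split = split
        self.query_group = query_group
        self.candidate_docs = candidate_docs
        self.out_dir = out_dir

    def work(self, job_id: int):
        # job_id is assumed to index one inner list of query ids,
        # matching the len(query_group) job count passed to the runner.
        qids = self.query_group[job_id]
        for qid in qids:
            docs = self.candidate_docs.get(qid, [])
            print(qid, len(docs))  # placeholder for the real tokenize/encode step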
Example #5
    def __init__(self, split, load_candidate_doc_list_fn):
        query_group: List[List[QueryID]] = load_query_group(split)
        candidate_docs_d: Dict[QueryID, List[str]] = load_candidate_doc_list_fn(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        self.queires = dict(load_queries(split))
        self.query_group = query_group
        self.tokenizer = get_tokenizer()
        self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
        self.qrel = qrel
Example #6
def run_tokenize_jobs_for_pred_split(split):
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)
    max_sent_length = 64 * 5
    max_title_length = 64 * 5

    def factory(out_dir):
        return MultipleTokenizeWorker(split, query_group, candidate_docs, max_sent_length, max_title_length, out_dir)

    runner = JobRunnerS(job_man_dir, len(query_group),
                        "MSMARCO_{}_multiple_tokenize".format(split),
                        factory)
    runner.start()
Example #7
    def __init__(self, split):
        super(ProcessedResource10doc, self).__init__(split)
        query_group: List[List[QueryID]] = load_query_group(split)
        candidate_docs_d: Dict[QueryID, List[str]] = load_candidate_doc_list_10(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        self.queires = dict(load_queries(split))
        self.query_group = query_group
        self.tokenizer = get_tokenizer()
        self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
        self.qrel = qrel
Example #8
def main():
    query_groups = load_query_group("train")
    path_format = os.path.join(job_man_dir, "seg_resource_train", "{}")
    not_found_list = []
    for job_id in range(len(query_groups)):  # one job per query group
        qids = query_groups[job_id]
        for qid in qids:
            resource_path = path_format.format(qid)
            if not os.path.exists(resource_path):
                print(job_id, qid)
                not_found_list.append((job_id, qid))

    print("{} files not found".format(len(not_found_list)))
Example #9
    def __init__(self,
                 max_seq_length,
                 split,
                 skip_single_seg,
                 pick_for_pairwise,
                 out_dir):
        self.query_group: List[List[QueryID]] = load_query_group(split)
        self.seg_resource_loader = SegmentResourceLoader(job_man_dir, split)
        self.max_seq_length = max_seq_length
        self.out_dir = out_dir
        self.skip_single_seg = skip_single_seg
        self.pick_for_pairwise = pick_for_pairwise
        self.info_dir = self.out_dir + "_info"
        exist_or_mkdir(self.info_dir)
Example #10
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)

    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)

    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)

    # Return the first document judged relevant for qid; raise KeyError if there is none.
    def get_rel_doc_id(qid):
        if qid not in resource.get_doc_for_query_d():
            raise KeyError
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError

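    # Worked example for the translation below: with stemmed_body_tokens_list =
    # [["a", "b"], ["c", "d", "e"]], loc_in_body values 0-1 map to sentence 0,
    # 2-4 map to sentence 1, and any position past the end returns -1.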
    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenize = PCTokenizer()
    bert_tokenizer = get_tokenizer()

    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = [passage_id for passage_id, score in passage_qrels[qid].items() if score]
            success = False
            found_idx = -1
            passage_tokens = []  # stays defined for the "Not found" log even when rel_passages is empty
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenize.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
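                # Heuristic: accept the match only if the passage has more than 4 stemmed
                # tokens and the LCS length n covers more than 70% of them.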
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]

                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]

                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx
            if not success:
                print("Not found. doc_lines={} passage_len={}".format(len(stemmed_body_tokens_list), len(passage_tokens)))

        except KeyError:
            pass
Example #11
from data_generator.job_runner import JobRunner
from dataset_specific.msmarco.common import load_query_group, top100_doc_ids
from dataset_specific.msmarco.tokenize_jobs.run_tokenize import DummyWorker
from epath import job_man_dir

if __name__ == "__main__":
    split = "dev"
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def factory(out_dir):
        return DummyWorker(split, query_group, candidate_docs, out_dir)

    runner = JobRunner(job_man_dir, len(query_group)-1, "MSMARCO_{}_tokens_debug".format(split), factory)
    runner.start()