Пример #1
0
def main():
    """Launch robust training-data generation using 256-token passage sampling."""
    passage_len = 256
    seq_len = 512
    sampler = PassageSampling(passage_len)
    gen = RobustPointwiseTrainGenEx(sampler, seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_train_sampling_256",
                       partial(RobustWorker, gen))
    runner.start()
Пример #2
0
def generate_robust_all_seg_for_predict():
    """Generate prediction data over overlapping 128-token segments (stride 64)."""
    seq_len = 128
    stride = 64
    seg_encoder = OverlappingSegments(seq_len, stride)
    gen = RobustPredictGen(seg_encoder, seq_len, 100, "desc")
    runner = JobRunner(job_man_dir, 4, "robust_predict_desc_128_overlap",
                       partial(RobustWorker, gen))
    runner.start()
Пример #3
0
def generate_robust_sero_for_train():
    """Generate sero training data from 32 overlapping 128-token windows."""
    window = 128
    total_len = window * 32
    gen = RobustPointwiseTrainGenEx(
        MultiWindowOverlap(window, total_len), total_len, "desc")
    runner = JobRunner(job_man_dir, 4, "RobustSero_128_32_overlap",
                       partial(RobustWorker, gen))
    runner.start()
Пример #4
0
def main():
    """Generate QCK instances (robust_qck_6), labeling candidates from qrels."""
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement = load_qrels_structured(qrel_path)

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # Candidate ids end with a passage index ("<doc_id>_<part>"); drop it
        # to recover the document id used in the qrels.
        doc_id = "_".join(candidate.id.split("_")[:-1])
        # Missing query or missing document both count as label 0.
        return judgement.get(query.query_id, {}).get(doc_id, 0)

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate")
    candidate_dict: Dict[str, List[QCKCandidateI]] = \
        load_candidate_all_passage_from_qrel(256)
    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    runner = JobRunner(job_man_dir, 250, "robust_qck_6", worker_factory)
    runner.start()
Пример #5
0
def main():
    """Generate robust training data (with data ids) over all 512-token segments."""
    seq_len = 512
    gen = RobustTrainGenWDataID(AllSegmentAsDoc(seq_len), seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_w_data_id",
                       partial(RobustWorkerWDataID, gen))
    runner.start()
Пример #6
0
def generate_robust_sero_for_train():
    """Generate sero training data from four 512-token windows (sydney dir)."""
    window = 512
    total_len = window * 4
    gen = RobustPointwiseTrainGenEx(
        MultiWindow(window, total_len), total_len, "desc")
    runner = JobRunner(sydney_working_dir, 4, "RobustSero5",
                       partial(RobustWorker, gen))
    runner.start()
Пример #7
0
def main():
    """Generate QCK instances (robust_qck_10) with binary relevance labels."""
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement = load_qrels_structured(qrel_path)

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        # Binarize: any positive relevance judgement counts as 1.
        rel = judgement.get(query.query_id, {}).get(candidate.id, 0)
        return 1 if rel > 0 else 0

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")

    # Candidate passages are expensive to build; reuse the pickled cache if any.
    candidate_dict = load_cache("candidate_for_robust_qck_7")
    if candidate_dict is None:
        candidate_dict: Dict[str, List[QCKCandidateI]] = \
            get_candidate_all_passage_w_samping()
        save_to_pickle(candidate_dict, "candidate_for_robust_qck_7")

    generator = QCKInstanceGenerator(candidate_dict, is_correct)

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate, generator, out_dir)

    runner = JobRunner(job_man_dir, 250, "robust_qck_10", worker_factory)
    runner.start()
Пример #8
0
def start_generate_jobs_for_train_val(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface], writer,
        name_prefix):
    """Split claims 7:3 into train/val and run a CPPNC generation job per split."""
    claim_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(claim_ids)
    train, val = split_7_3(claims)
    entries, all_passages = load_from_pickle("pc_train_a_passages")
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = dict(
        (claim['cId'], passages) for claim, passages in entries)
    generator = generator_functor(cid_to_passages)

    def make_train_worker(out_dir):
        return CPPNCWorker(train, generator, writer, out_dir)

    print("Generate instances : train")
    JobRunner(job_man_dir, 378, name_prefix + "_train",
              make_train_worker).start()

    def make_val_worker(out_dir):
        return CPPNCWorker(val, generator, writer, out_dir)

    print("Generate instances : val")
    JobRunner(job_man_dir, 162, name_prefix + "_val", make_val_worker).start()
Пример #9
0
def main():
    """Run per-query robust training generation with the two-piece composer."""
    seq_len = 128
    gen = RobustTrainGenLight(TwoPieceSegmentComposer(seq_len, True), seq_len)
    # NOTE(review): -1 mirrors the other call sites that pass num_jobs - 1.
    runner = JobRunner(job_man_dir, 250 - 1, "robust_two_piece2",
                       partial(RobustPerQueryWorker, gen))
    runner.start()
Пример #10
0
def start_generate_jobs_for_train_val(generator: InstanceGenerator,
                                      name_prefix):
    """Split claims 7:3 into train/val and run a QCK generation job per split."""
    print("Loading data ....")
    claim_ids: List[int] = list(load_train_claim_ids())
    train, val = split_7_3(get_claims_from_ids(claim_ids))

    train_cids = {str(c['cId']) for c in train}
    val_cids = {str(c['cId']) for c in val}
    qk_candidate: List[QKUnit] = load_qk_candidate_train()
    print("Generate instances : train")
    qk_candidate_train: List[QKUnit] = [
        unit for unit in qk_candidate if unit[0].query_id in train_cids]
    qk_candidate_val = [
        unit for unit in qk_candidate if unit[0].query_id in val_cids]

    def make_train_worker(out_dir):
        return QCKWorker(qk_candidate_train, generator, out_dir)

    JobRunner(job_man_dir, 378, name_prefix + "_train",
              make_train_worker).start()

    print("Generate instances : val")

    def make_val_worker(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    JobRunner(job_man_dir, 162, name_prefix + "_val", make_val_worker).start()
Пример #11
0
def main():
    """Generate training data from only the leading 128-token segment."""
    seg_len = 128
    num_segment = 1
    gen = RobustPointwiseTrainGenEx(
        LeadingN(seg_len, num_segment), seg_len, "desc")
    runner = JobRunner(job_man_dir, 4, "first_128_desc",
                       partial(RobustWorker, gen))
    runner.start()
Пример #12
0
def main():
    """Generate training data with geometric passage sampling (g = 0.5)."""
    seg_len = 128
    sampler = GeoSampler(seg_len, 0.5)
    gen = RobustPointwiseTrainGenEx(sampler, seg_len, "desc")
    runner = JobRunner(job_man_dir, 4, "robust_geo05",
                       partial(RobustWorker, gen))
    runner.start()
Пример #13
0
def generate_robust_first_for_prediction():
    """Prediction data from the first 512-token segment of each document."""
    seq_len = 512
    gen = RobustPredictGenOld(FirstSegmentAsDoc(seq_len), seq_len)
    runner = JobRunner(sydney_working_dir, 4, "RobustFirstPred3",
                       partial(RobustWorker, gen))
    runner.start()
Пример #14
0
def generate_robust_all_seg_for_train():
    """Generate data over all 512-token segments.

    NOTE(review): despite the `_for_train` name, this wires up RobustPredictGen
    and a "predict" job name — confirm intent with the author.
    """
    seq_len = 512
    gen = RobustPredictGen(AllSegmentAsDoc(seq_len), seq_len, 100, "desc")
    runner = JobRunner(job_man_dir, 4, "robust_predict_desc_query",
                       partial(RobustWorker, gen))
    runner.start()
Пример #15
0
def generate_robust_all_seg_for_train():
    """Generate data over all 256-token segments.

    NOTE(review): the name says train, but this uses RobustPredictGen and a
    "..._predict_256" job name — confirm intent.
    """
    seq_len = 256
    gen = RobustPredictGen(AllSegmentAsDoc(seq_len), seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_all_passage_predict_256",
                       partial(RobustWorker, gen))
    runner.start()
Пример #16
0
def generate_robust_first_for_pred():
    """Prediction data from the first document segment, capped at 256 + 3 tokens."""
    doc_len = 256 + 3
    seq_len = 512
    gen = RobustPredictGenOld(FirstSegmentAsDoc(doc_len), seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_first_256_pred",
                       partial(RobustWorker, gen))
    runner.start()
Пример #17
0
def generate_robust_first_for_train():
    """Pairwise training data from the first document segment (256 + 3 tokens)."""
    doc_len = 256 + 3
    seq_len = 512
    gen = RobustPairwiseTrainGen(FirstSegmentAsDoc(doc_len), seq_len)
    runner = JobRunner(job_man_dir, 4, "RobustFirst256",
                       partial(RobustWorker, gen))
    runner.start()
Пример #18
0
def main():
    """Pointwise training data over all 512-token segments."""
    seq_len = 512
    gen = RobustPointwiseTrainGenEx(AllSegmentAsDoc(seq_len), seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_all_passage_pointwise_ex",
                       partial(RobustWorker, gen))
    runner.start()
Пример #19
0
def generate_robust_first_for_train():
    """Pairwise training data from the first 512-token segment (sydney dir)."""
    seq_len = 512
    gen = RobustPairwiseTrainGen(FirstSegmentAsDoc(seq_len), seq_len)
    runner = JobRunner(sydney_working_dir, 4, "RobustFirstClean",
                       partial(RobustWorker, gen))
    runner.start()
Пример #20
0
def main():
    """Run MMD triplet generation over the sharded msmarco triple files."""
    input_path_format = "/mnt/nfs/work3/youngwookim/data/msmarco/triple_pieces/x{0:04}"

    def factory(out_dir):
        return TripletWorker(input_path_format, out_dir)

    # NOTE(review): -1 mirrors the other call sites that pass num_jobs - 1.
    runner = JobRunner(job_man_dir, 360 - 1, "MMD_pair_triplet", factory)
    runner.start()
Пример #21
0
def main():
    """Training data restricted to segments chosen via cached robust scores."""
    seq_len = 512
    score_d = load_from_pickle("robust_score_d2")
    gen = RobustTrainGenSelected(AllSegmentAsDoc(seq_len), seq_len, score_d)
    runner = JobRunner(job_man_dir, 4, "robust_selected2",
                       partial(RobustWorkerWDataID, gen))
    runner.start()
Пример #22
0
def main():
    """Pointwise training data with the FirstEquiSero encoder (512 tokens)."""
    seq_len = 512
    encoder = FirstEquiSero(seq_len, 128, 4)
    gen = RobustPointwiseTrainGenEx(encoder, seq_len, "desc")
    runner = JobRunner(job_man_dir, 4, "first_512_equi_sero",
                       partial(RobustWorker, gen))
    runner.start()
Пример #23
0
def main():
    """Run best-segment prediction generation for the train split."""
    split = "train"

    def factory(out_dir):
        return BestSegmentPredictionGen(512, split, True, False, out_dir)

    job_name = "MMD_best_seg_prediction_{}".format(split)
    runner = JobRunner(job_man_dir, train_query_group_len - 1, job_name,
                       factory)
    runner.start()
Пример #24
0
def generate_robust_all_seg_for_train():
    """Pairwise training data over all segments, each limited to 256 tokens."""
    limited_length = 256
    seq_len = 512
    gen = RobustPairwiseTrainGen(AllSegmentAsDoc(limited_length), seq_len)
    runner = JobRunner(job_man_dir, 4, "robust_all_passage_256",
                       partial(RobustWorker, gen))
    runner.start()
Пример #25
0
def main():
    """Run selected-segment training generation once per target-selection policy."""
    seq_len = 512
    score_d = load_score_set1()
    encoder = AllSegmentAsDoc(seq_len)
    policies = ["random_over_09", "best", "first_and_best", "best_or_over_09"]
    for target_selection in policies:
        gen = RobustTrainGenSelected(encoder, seq_len, score_d, "desc",
                                     target_selection)
        job_name = "robust_selected2_{}".format(target_selection)
        runner = JobRunner(job_man_dir, 3, job_name,
                           partial(RobustWorkerWDataID, gen))
        runner.start()
Пример #26
0
def generate_robust_all_seg_for_predict():
    """Per-query separate encoding of query/document pairs for prediction."""
    doc_max_length = 512
    sep_encoder = RobustSeparateEncoder(doc_max_length, "desc", 1000, False)
    # NOTE(review): -1 mirrors the other call sites that pass num_jobs - 1.
    runner = JobRunner(job_man_dir, 250 - 1, "robust_query_doc",
                       partial(RobustPerQueryWorker, sep_encoder))
    runner.start()
Пример #27
0
def generate_robust_sero_for_prediction():
    """Sero prediction data over four windows of 512 - 2 tokens each.

    The -2 presumably reserves room for special tokens — confirm with encoder.
    """
    window = 512 - 2
    total_len = 512 * 4
    gen = RobustPredictGenOld(MultiWindow(window, total_len), total_len)
    runner = JobRunner(sydney_working_dir, 4, "RobustSeroPred4",
                       partial(RobustWorker, gen))
    runner.start()
Пример #28
0
def generate_robust_sero_for_train():
    """Sero data over four 128-token windows.

    NOTE(review): the name says train, but this wires up RobustPredictGen and
    a "..._pred" job name — confirm intent.
    """
    window = 128
    total_len = window * 4
    gen = RobustPredictGen(MultiWindow(window, total_len), total_len, 100,
                           "desc")
    runner = JobRunner(job_man_dir, 4, "RobustSero5_128_pred",
                       partial(RobustWorker, gen))
    runner.start()
Пример #29
0
def generate_robust_all_seg_for_predict():
    """Per-query prediction data with the many-two-piece segment composer."""
    seq_len = 128
    gen = RobustPredictGenLight(ManyTwoPieceSegmentComposer(seq_len), seq_len)
    # NOTE(review): -1 mirrors the other call sites that pass num_jobs - 1.
    runner = JobRunner(job_man_dir, 250 - 1, "robust_two_piece_pred_m",
                       partial(RobustPerQueryWorker, gen))
    runner.start()
Пример #30
0
def main():
    """Pointwise training data from the first plus random segments (4 x 128)."""
    seg_len = 128
    num_segment = 4
    encoder = FirstAndRandom(seg_len, num_segment)
    gen = RobustPointwiseTrainGenEx(encoder, seg_len)
    runner = JobRunner(job_man_dir, 4, "leading_segments",
                       partial(RobustWorker, gen))
    runner.start()