def select_window(score: List[float], window_size, n_seg_per_doc) -> List[Tuple[int, int]]:
    assert len(score) >= window_size * n_seg_per_doc
    line_len = 16
    n_line = ceil_divide(len(score), line_len)
    # Aggregate per-token scores into per-line scores (line_len tokens per line).
    line_scores: List[float] = list(
        [sum(score[i * line_len:(i + 1) * line_len]) for i in range(n_line)])
    idx_sorted = argsort(line_scores)[::-1]
    line_per_window = ceil_divide(window_size, line_len)
    best_idx = idx_sorted[0]
    selected_windows = []
    for _ in range(n_seg_per_doc):
        # Enumerate every window of line_per_window lines that contains best_idx.
        candidate_list = []
        for i in range(line_per_window):
            st = best_idx - i
            ed = best_idx - i + line_per_window
            score_sum = sum(line_scores[st:ed])
            e = (st, ed, score_sum)
            candidate_list.append(e)
        # Keep the highest-scoring window, then zero out its lines so the next
        # iteration does not count them again.
        candidate_list.sort(key=lambda x: x[2], reverse=True)
        st, ed, _ = candidate_list[0]
        for j in range(st, ed):
            line_scores[j] = 0
        selected_windows.append((st, ed))
    selected_windows.sort(key=get_first)
    return selected_windows
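# select_window above calls two helpers that are not shown in this snippet
# (argsort and get_first). The sketches below are assumptions, inferred only
# from how the helpers are used: argsort returns the indices that would sort a
# list in ascending order, and get_first picks the first element of a tuple.
def _argsort_sketch(values):
    # Indices of `values` ordered so that values[i] is ascending.
    return sorted(range(len(values)), key=lambda i: values[i])


def _get_first_sketch(pair):
    # First element of a tuple, used as the sort key for (st, ed) windows.
    return pair[0]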
def encode(self, query_id: str, doc_id: str) -> Iterable[TokensAndSegmentIds]:
    try:
        sp_list: List[ScoredPieceFromPair] = self.piece_score_parser.get_piece_score(
            query_id, doc_id)
        query = self.queries[str(query_id)]
        query_tokens: List[str] = self.tokenizer.tokenize(query)
        q_term_len = len(query_tokens)
        available_length = self.max_seq_length - q_term_len - 4
        maybe_doc_length = self.probe_config.n_factor * self.probe_config.step_size
        n_piece = ceil_divide(maybe_doc_length, self.probe_config.max_seq_length) * 2
        n1 = n_piece
        n2 = max(n_piece - 1, 1)
        two_piece_list: Iterable[PiecewiseSegment] = select_many_two_piece_segment(
            self.probe_config, available_length, sp_list, n1, n2)

        def format_as_tas(two_piece):
            return to_tokens_and_segment_ids(query_tokens, sp_list, two_piece,
                                             self.max_seq_length, self.use_many_seg_ids)

        return map(format_as_tas, two_piece_list)
    except KeyError:
        return []
def main():
    # counter = get_doc_length_counter()
    # save_to_pickle(counter, "robust_doc_length_counter")
    counter: Counter = get_doc_length_counter_from_pickle()
    seg_length = 500
    all_keys = list(counter.keys())
    all_keys.sort()
    num_seg_count = Counter()
    for l in all_keys:
        num_seg = ceil_divide(l, seg_length)
        cnt = counter[l]
        assert type(cnt) == int
        num_seg_count[num_seg] += cnt

    num_docs = sum(counter.values())
    acc_portion = 0
    for key in sorted(num_seg_count.keys()):
        cnt = num_seg_count[key]
        assert type(cnt) == int
        portion = cnt / num_docs
        acc_portion += portion
        # print("{0}\t{1}\t{2:.2f}\t{3:.2f}".format(key, cnt, portion, acc_portion))
        print("{0}\t{1}\t{2:.4f}\t{3:.4f}".format(key, cnt, portion, acc_portion))
def select_many_two_piece_segment(probe_config, available_length,
                                  sp_list: List[ScoredPieceFromPair], n1, n2)\
        -> Iterable[PiecewiseSegment]:
    seg1_num_piece = ceil_divide(ceil_divide(available_length, 2), probe_config.step_size)
    seg1_length = seg1_num_piece * probe_config.step_size
    # Select first segment
    first_piece_candidates: List[Tuple[ScoredInterval, float]] = make_first_piece_candidates(
        seg1_num_piece, sp_list)
    seg2_length = available_length - seg1_length
    seg2_num_seg = int(seg2_length / probe_config.step_size)
    for i1 in range(min(n1, len(first_piece_candidates))):
        first_piece, _ = first_piece_candidates[i1]
        second_piece_candidates: List[Tuple[ScoredInterval, float]] = \
            make_second_piece_candidates(seg2_num_seg, sp_list, first_piece)
        for i2 in range(min(n2, len(second_piece_candidates))):
            second_piece, _ = second_piece_candidates[i2]
            output_piece_list = combine_interval(first_piece, second_piece)
            yield output_piece_list
def select_a_two_piece_segment(probe_config, available_length,
                               sp_list: List[ScoredPieceFromPair]) -> PiecewiseSegment:
    seg1_num_piece = ceil_divide(ceil_divide(available_length, 2), probe_config.step_size)
    seg1_length = seg1_num_piece * probe_config.step_size
    # Select first segment
    first_piece_candidates: List[Tuple[ScoredInterval, float]] = make_first_piece_candidates(
        seg1_num_piece, sp_list)
    first_piece, _ = first_piece_candidates[0]
    seg2_length = available_length - seg1_length
    seg2_num_seg = int(seg2_length / probe_config.step_size)
    second_piece_candidates: List[Tuple[ScoredInterval, float]] = make_second_piece_candidates(
        seg2_num_seg, sp_list, first_piece)
    second_piece, _ = second_piece_candidates[0]
    return combine_interval(first_piece, second_piece)
def get_query_split():
    xml_path = "/mnt/nfs/work3/youngwookim/code/Chair/data/CLEFeHealth2017IRtask/queries/queries2016.xml"
    queries: List[Query] = load_xml_query(xml_path)
    n_query = len(queries)
    n_split = 5
    split_size = ceil_divide(n_query, n_split)
    cut = (n_split - 1) * split_size
    train_queries = queries[:cut]
    test_queries = queries[cut:]
    return train_queries, test_queries
def __init__(self, option, out_dir):
    self.out_dir = out_dir
    self.ci = RankedListInterface()
    print("load__data_point")
    self.all_data_points: List[TPDataPoint] = lmap(ukp_datapoint_to_tp_datapoint,
                                                   load_all_data_flat())
    self.data_step_size = 50
    total_jobs = ceil_divide(len(self.all_data_points), self.data_step_size)
    print("total_jobs :", total_jobs)
    print("Load term stat")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    self.clue12_13_df = clue12_13_df
    self.dp_id_to_q_res_id_fn = build_dp_id_to_q_res_id_fn()
    self.tokenizer = get_tokenizer()
    self.option = option
def serialize(features_list):
    num_worker = 4
    output = []
    with ProcessPoolExecutor(max_workers=num_worker) as executor:
        future_list = []
        job_per_worker = ceil_divide(len(features_list), num_worker)
        for idx in range(num_worker):
            st = idx * job_per_worker
            ed = (idx + 1) * job_per_worker
            future = executor.submit(enc, features_list[st:ed])
            future_list.append(future)
        for future in future_list:
            sub_outputs = future.result()
            output.extend(sub_outputs)
    return output
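# A small sanity check of how serialize above partitions the work, assuming
# ceil_divide rounds up: with 10 features and 4 workers, job_per_worker is 3,
# so the submitted slices are [0:3], [3:6], [6:9], [9:12]; the last slice is
# shorter than the rest, but every feature is still covered exactly once.
def _chunk_bounds_sketch(n_items, num_worker):
    job_per_worker = -(-n_items // num_worker)  # ceil division
    return [(idx * job_per_worker, (idx + 1) * job_per_worker)
            for idx in range(num_worker)]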
def main():
    is_correct_fn = get_is_correct_fn()
    for split in splits[:2]:
        qk_candidate = load_from_pickle("pc_evi_filtered_qk_{}".format(split))
        qk_candidate = sample_kdps(qk_candidate)
        tprint("Loading candidates..")
        candidate_dict = load_bal_candidate(split)
        tprint("{} dict keys".format(len(candidate_dict)))
        tprint("Initializing generator..")
        generator = QCKInstanceGenerator(candidate_dict, is_correct_fn)
        n_qk_per_job = 10
        num_jobs = ceil_divide(d_n_pc_per_split[split], n_qk_per_job)

        def worker_factory(out_dir):
            worker = QCKWorkerMultiple(qk_candidate, generator, n_qk_per_job, out_dir)
            return worker

        job_name = "pc_evi_qck3_{}".format(split)
        runner = JobRunnerS(job_man_dir, num_jobs, job_name, worker_factory)
        runner.start()
def qck_gen_w_ranked_list_multiple(job_name, qk_candidate_name, ranked_list_path,
                                   split, n_qk_per_job):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory())
    qk_candidate_train: List[QKUnit] = list(
        [qk for qk in qk_candidate if qk[0].query_id in cids])

    def worker_factory(out_dir):
        return QCKWorkerMultiple(qk_candidate_train, generator, n_qk_per_job, out_dir)

    num_qks = d_n_claims_per_split2[split]
    num_jobs = ceil_divide(num_qks, n_qk_per_job)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split, worker_factory)
    runner.start()
from data_generator.job_runner import JobRunner, sydney_working_dir
from galagos.process_jsonl_doc_lines import JsonlWorker
from misc_lib import ceil_divide

if __name__ == "__main__":
    num_lines = 231423
    block_size = 100
    num_jobs = ceil_divide(num_lines, block_size)
    jsonl_path = "/mnt/nfs/work3/youngwookim/data/counter_arg/q_res/ca_docs.jsonl"
    print("Start")
    runner = JobRunner(sydney_working_dir, num_jobs - 1, "ca_docs",
                       lambda out_dir: JsonlWorker(jsonl_path, out_dir))
    runner.start()
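# Every snippet above relies on misc_lib.ceil_divide (imported here). Its real
# implementation is not shown in these snippets; the sketch below is an
# assumption that matches how it is used everywhere: integer division rounded up.
def _ceil_divide_sketch(numerator, denominator):
    # e.g. _ceil_divide_sketch(231423, 100) == 2315, matching num_jobs above.
    return -(-numerator // denominator)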