def build_cartesian_domain_sent_corpora(self):
    """
        Aggregate descriptive sentences by their labels.
    """
    des_pattern_stripped = re.compile(self.DES_PATTERN_STRIPPED)
    sent_pattern = re.compile(self.SENT_PATTERN_TAGGED)  # reserve start and end

    dom_sents_dict = dict()
    src_fns = [fn for fn in listdir(self.dp_dataset_test_filtered)
               if isfile(join(self.dp_dataset_test_filtered, fn))]

    for src_fn in tqdm(src_fns):
        label = '_'.join(src_fn.split('_')[1:])
        src_fp = join(self.dp_dataset_test_filtered, src_fn)
        with io.open(src_fp, encoding='utf-8') as src_f:
            text_stripped = re.findall(des_pattern_stripped, src_f.read())[0]
        sents_stripped = re.findall(sent_pattern, text_stripped)

        if label in dom_sents_dict:
            dom_sents_dict[label].extend(sents_stripped)
        else:
            dom_sents_dict[label] = sents_stripped

    for dom, sents in dom_sents_dict.items():
        logger.info('{0} sents: {1}'.format(dom, len(sents)))
        out_fp = join(self.dp_dataset_dom_sent_corpora, dom)
        with io.open(out_fp, mode='a', encoding='utf-8') as out_f:
            out_f.write('\n'.join(sents))
def _build_components(cid, query):
    # mask_intra comes from the enclosing scope
    sim_items = tfidf_tools.build_sim_items_e2e(cid, query, mask_intra=mask_intra)

    rel_scores = sim_items['rel_scores']
    sim_mat = sim_items['doc_sim_mat']
    processed_sents = sim_items['processed_sents']

    rel_vec = rel_scores / np.sum(rel_scores)  # l1-normalize into a distribution
    np.fill_diagonal(sim_mat, 0.0)  # avoid self-transition
    logger.info('sim_mat: {}'.format(sim_mat))

    # map a doc-relative sentence id, e.g., "0_3", to its absolute index
    sid2abs = {}
    sid_abs = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, _ in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            sid2abs[sid] = sid_abs
            sid_abs += 1

    components = {
        'sim_mat': sim_mat,
        'rel_vec': rel_vec,
        'sid2abs': sid2abs,
    }
    return components
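# The components above typically feed a query-biased, LexRank-style ranking.
# A minimal sketch of that downstream step, illustrative only: the damping
# factor, iteration count, and row-normalization scheme are assumptions, not
# taken from this codebase.
def rank_with_components(components, d=0.85, n_steps=100):
    """Power iteration over a query-biased transition matrix (sketch)."""
    sim_mat = components['sim_mat']
    rel_vec = components['rel_vec']
    n = len(rel_vec)

    # Row-normalize similarities into transition probabilities; rows that
    # sum to zero fall back to the query-relevance distribution.
    row_sums = sim_mat.sum(axis=1, keepdims=True)
    trans = np.divide(sim_mat, row_sums,
                      out=np.tile(rel_vec, (n, 1)),
                      where=row_sums > 0)

    # Damped walk: follow similarity edges w.p. d, jump to query-relevant
    # sentences w.p. 1 - d.
    markov = d * trans + (1 - d) * rel_vec

    scores = np.full(n, 1.0 / n)
    for _ in range(n_steps):
        scores = scores @ markov
    return scores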
def compute_rouge_for_oracle():
    """
        The record dir for oracle saves text for comparison against the reference.
    """
    ir_rec_dp = join(path_parser.summary_rank, ir_config.IR_RECORDS_DIR_NAME_TF)
    if exists(ir_rec_dp):
        raise ValueError('ir_rec_dp exists: {}'.format(ir_rec_dp))
    os.mkdir(ir_rec_dp)

    cids = tools.get_test_cc_ids()
    for cid in tqdm(cids):
        retrieval_params = {
            'model_name': ir_config.IR_MODEL_NAME_TF,
            'cid': cid,
            'filter_var': ir_config.FILTER_VAR,
            'filter': ir_config.FILTER,
            'deduplicate': ir_config.DEDUPLICATE,
            'prune': True,
        }
        retrieved_items = ir_tools.retrieve(**retrieval_params)
        summary = '\n'.join([item[-1] for item in retrieved_items])

        with open(join(ir_rec_dp, cid), mode='a', encoding='utf-8') as out_f:
            out_f.write(summary)

    performance = rouge.compute_rouge_for_ablation_study(ir_rec_dp)
    logger.info(performance)
def init():
    # parse args
    parser = ArgumentParser()
    parser.add_argument('n_devices',
                        nargs='?',
                        default=4,
                        help='number of devices the model will run on')
    args = parser.parse_args()

    all_device_ids = [0, 1, 2, 3]
    device = all_device_ids[:int(args.n_devices)]
    config_meta['device'] = device

    if not torch.cuda.is_available():
        placement = 'cpu'
        logger.info('[MAIN INIT] path mode: {0}, placement: {1}'.format(
            config.path_type, placement))
    else:
        if len(device) == 1:
            placement = 'single'
            torch.cuda.set_device(device[0])
        elif config_meta['auto_parallel']:
            placement = 'auto'
        else:
            placement = 'manual'
        logger.info('[MAIN INIT] path mode: {0}, placement: {1}, n_devices: {2}'.format(
            config.path_type, placement, args.n_devices))

    config_meta['placement'] = placement
def _compute_rel_scores_tf_dot(processed_sents, query):
    """
        Compute the relevance of each sentence to the query as a TF dot product.

        :param processed_sents: 2d list of processed sentences, organized by doc.
        :param query: processed query string.
        :return: 1d array of relevance scores, one per sentence.
    """
    doc_sents = list(itertools.chain(*processed_sents))  # flatten to a 1d sent list
    doc_sents = copy.deepcopy(doc_sents)  # avoid mutating the original list
    doc_sents.append(query)

    tf_mat = get_tf_mat(sents=doc_sents).toarray()
    # the last row is the query; dot it against every sentence row
    rel_scores = np.matmul(tf_mat[:-1], tf_mat[-1])
    logger.info('rel_scores: {}'.format(rel_scores.shape))
    return rel_scores
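# A self-contained illustration of the same TF dot-product relevance, using
# sklearn's CountVectorizer as a stand-in for the project's get_tf_mat (the
# vectorizer choice and the toy data are assumptions).
from sklearn.feature_extraction.text import CountVectorizer

def _demo_tf_dot_relevance():
    sents = ['the cat sat on the mat', 'dogs chase cats', 'stock markets fell sharply']
    query = 'cat on a mat'

    # raw term-frequency vectors over sentences + query; the last row is the query
    tf = CountVectorizer().fit_transform(sents + [query]).toarray()
    rel_scores = tf[:-1] @ tf[-1]  # dot each sentence row against the query row
    print(rel_scores)  # highest for the first sentence ('cat', 'on', 'mat' overlap)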
def approx_rand(self):
    obs_diff = self._obs_diff()

    def _iter(iter_idx):
        logger.info('iter_idx: {0}'.format(iter_idx))
        # randomly keep or swap each paired observation
        reserve = np.random.randint(2, size=[self.n_obs, 1])
        reserve_mat = np.tile(reserve, [1, n_ways])  # n_ways: module-level constant
        flip_mat = 1 - reserve_mat

        x_stat = np.multiply(self.obs_1, reserve_mat) + np.multiply(self.obs_2, flip_mat)
        y_stat = np.multiply(self.obs_2, reserve_mat) + np.multiply(self.obs_1, flip_mat)

        diff = self._diff(x_stat, y_stat)
        return 1 if diff >= obs_diff else 0

    pool = ThreadPool(6)
    better = float(sum(pool.map(_iter, range(self.iter))))
    self.p = (better + 1) / (self.iter + 1)
    logger.info('p-value: {0}'.format(self.p))
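# approx_rand above is an approximate randomization (sign-flip) significance
# test. A minimal standalone sketch of the same idea on two 1-d arrays of
# paired per-example scores; the mean-difference statistic and the names are
# illustrative assumptions.
def approx_rand_test(scores_a, scores_b, n_iter=10000, seed=0):
    rng = np.random.default_rng(seed)
    obs_diff = abs(scores_a.mean() - scores_b.mean())

    better = 0
    for _ in range(n_iter):
        # for each pair, randomly swap the two systems' scores
        flip = rng.integers(0, 2, size=len(scores_a)).astype(bool)
        a = np.where(flip, scores_b, scores_a)
        b = np.where(flip, scores_a, scores_b)
        if abs(a.mean() - b.mean()) >= obs_diff:
            better += 1

    # same add-one smoothing as approx_rand above
    return (better + 1) / (n_iter + 1)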
def get_doc_by_head(self, head, use_head_id=False):
    """
        Get a doc as a string by its head (default) or head id (set use_head_id=True).
    """
    if lang != 'en' or use_head_id:
        head_id = head
    else:
        head_id = self.get_head_id(head)

    if not head_id:
        return None

    if lang == 'en':
        file_id = self.get_file_id(head_id)
        fp_wiki_file = self.fp_wiki_files[file_id]
    else:
        fp_wiki_file = self.fp_wiki_file

    with io.open(fp_wiki_file, encoding='utf-8') as f:
        data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        pattern = re.compile(self.DOC_PATTERN.format(head_id).encode('utf-8'))
        match = re.search(pattern, data)
        if match:
            logger.info('succeeded to extract doc: {0}'.format(head_id))
            return match.group().decode('utf-8')

    logger.info('failed to extract doc: {0}'.format(head_id))
    return None
def _proc_sent(self, sent, rm_dialog, rm_stop, stem, rm_short=None, min_nw_sent=3):
    sent = sent.lower()
    sent = re.sub(r'\s+', ' ', sent).strip()  # collapse extra whitespace
    if not sent:
        return None

    if rm_short and len(nltk.tokenize.word_tokenize(sent)) < min_nw_sent:
        return None

    if rm_dialog:
        dialog_tokens = ["''", "``"]
        for tk in dialog_tokens:
            if tk in sent:
                logger.info('Remove dialog')
                return None

        if config.test_year == '2005' and sent[0] == "'" and ('says' in sent or 'said' in sent):
            logger.info('Remove dialog')
            return None

    if rm_stop:
        sent = remove_stopwords(sent)

    if stem:
        sent = self.porter_stemmer.stem_sentence(sent)

    return sent
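# How the flags compose on one sentence, assuming gensim's remove_stopwords
# and PorterStemmer (which match the helper names used above; that mapping is
# an assumption).
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

def _demo_proc_sent():
    sent = 'The  markets   were falling sharply'.lower()
    sent = ' '.join(sent.split())                # collapse extra whitespace
    sent = remove_stopwords(sent)                # rm_stop=True
    sent = PorterStemmer().stem_sentence(sent)   # stem=True
    print(sent)  # 'market fall sharpli'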
def compute_rouge(model_name, n_iter=None, diversity_param_tuple=None, cos_threshold=None, extra=None):
    rouge_args = '-a -l 250 -n 2 -m -2 4 -u -c 95 -r 1000 -f A -p 0.5 -t 0 -d -e {} -x'.format(
        path_parser.rouge_dir)
    r = Rouge155(rouge_args=rouge_args)

    baselines_wo_config = ['lead', 'lead-2006', 'lead-2007', 'lead_2007']
    if model_name in baselines_wo_config or model_name.startswith('duc'):
        text_dp = join(path_parser.summary_text, model_name)
    else:
        text_dp = tools.get_text_dp(model_name,
                                    cos_threshold=cos_threshold,
                                    n_iter=n_iter,
                                    diversity_param_tuple=diversity_param_tuple,
                                    extra=extra)

    r.system_dir = text_dp
    r.model_dir = join(path_parser.data_summary_targets, config.test_year)
    r.system_filename_pattern = r'(\w*)'
    r.model_filename_pattern = r'#ID#_[\d]'

    output = r.convert_and_evaluate()
    output = proc_output(output)
    logger.info(output)
    return output
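# Note: with pyrouge's Rouge155, system_filename_pattern must capture the
# topic id as its first group, and '#ID#' in model_filename_pattern is
# substituted with that id when matching reference files. Usage sketch
# ('lead' is just an example model name):
#
#   output = compute_rouge('lead')  # scores summary_text/lead against the references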
def rank_e2e():
    """
        Rank sentences end-to-end for all test clusters with the TF IR model.
    """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    test_cid_query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=ir_config.CONCAT_TITLE_NARR,
        query_type=ir_config.QUERY_TYPE)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        params = {
            **cid_query_dict,
        }
        rank_records = _rank(**params)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, params['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
def check_cross_du_2(self):
    dom_ids_list = list()
    unique_ids = list()
    for dom in doms_final:
        id_fp = os.path.join(self.dp_doc_dedu, dom + '_ids.txt')
        with io.open(id_fp, encoding='utf-8') as id_f:
            dom_ids = [l.rstrip('\n') for l in id_f.readlines() if l]
        dom_ids_list.append(dom_ids)
        unique_ids.extend([dom_id for dom_id in dom_ids if dom_id not in unique_ids])

    # index k counts docs with exactly k labels; size len+1 so a doc that
    # appears in every domain does not overflow the list
    n_label_list = [0] * (len(doms_final) + 1)
    for unique_id in unique_ids:
        n_labels = sum(1 for dom_ids in dom_ids_list if unique_id in dom_ids)
        n_label_list[n_labels] += 1

    n_label_ratio = ['{0:.2f}'.format(float(n_label) / sum(n_label_list))
                     for n_label in n_label_list]
    logger.info('#labels: {}'.format(n_label_list))
    logger.info('ratio of #labels: {}'.format(n_label_ratio))
def get_head_id(self, head):
    if head in self.head_to_id:
        return self.head_to_id[head]
    logger.info('doc out of list: {0}'.format(head))
    return None
def __init__(self):
    self.fp_non_ge_ids = path_parser.data_non_ge_ids
    self.fp_clean_stats = path_parser.data_clean_stats

    # join: doc & doms (8 classes)
    self.dp_doc = path_parser.data_doc
    self.dom_dp = dict()
    for dom in doms:
        self.dom_dp[dom] = os.path.join(self.dp_doc, dom)

    # join: doc_dedu & doms_final (7 classes)
    self.dp_doc_dedu = path_parser.data_doc_dedu
    self.dom_dedu_dp = dict()
    for dom in doms_final:
        self.dom_dedu_dp[dom] = os.path.join(self.dp_doc_dedu, dom)

    self.dp_proj_root = path_parser.proj_root
    self.SUBCATEGORIES = 'subcategories'
    self.PAGES = 'pages'

    if lang == 'en':
        self.START_DOC, self.END_DOC = '#s-doc\t{0}', '#e-doc\t{0}'
        self.START_SEC, self.END_SEC = '#s-headline', '#e-headline'
        self.DOC_PATTERN = '#s-doc\t{0}[\\s\\S]*#e-doc\t{0}'
        self.SEC_PATTERN = '#s-headline[\\s\\S]*?(?=#s-headline)'
        self.SEC_PATTERN_LAST = '#s-headline\t{0}[\\s\\S]*?(?=#e-doc)'

        self.fp_wiki_corpus_head = path_parser.wiki_corpus_head
        self.fp_wiki_files = list()
        for i in range(24):
            self.fp_wiki_files.append(os.path.join(
                path_parser.wiki_corpus, 'wikipedia-tagged_{:02d}.txt'.format(i)))

        self.id_to_head, self.head_to_id = dict(), dict()
        with io.open(self.fp_wiki_corpus_head, encoding='utf-8') as f:
            for line in f:
                items = line.split('\t')
                if len(items) >= 2:
                    head_id, head = int(items[0]), items[1]
                    self.id_to_head[head_id] = head
                    self.head_to_id[head] = head_id
        logger.info('#articles: {0}'.format(len(self.id_to_head)))
    else:
        self.START_DOC, self.END_DOC = '<article name="{0}">', '</article>'
        self.START_PARAGRAPH, self.END_PARAGRAPH = '<p>', '</p>'
        self.START_HEADLINE, self.END_HEADLINE = '<h>', '</h>'
        self.DOC_PATTERN = self.START_DOC + '[\\s\\S]*?' + self.END_DOC
        self.fp_wiki_file = path_parser.wiki_corpus

    # multi-threading
    self.n_extraction_threads = 5
    self.n_file_threads = 5
def check_cross_du(self):
    for target_dom in doms_final:
        other_doms = [dom for dom in doms_final if dom != target_dom]

        other_dom_ids = list()
        for dom in other_doms:
            id_fp = os.path.join(self.dp_doc_dedu, dom + '_ids.txt')
            with io.open(id_fp, encoding='utf-8') as id_f:
                other_dom_ids.extend([l.rstrip('\n') for l in id_f.readlines() if l])

        target_id_fp = os.path.join(self.dp_doc_dedu, target_dom + '_ids.txt')
        with io.open(target_id_fp, encoding='utf-8') as target_id_f:
            target_ids = [l.rstrip('\n') for l in target_id_f.readlines() if l]

        du_ids = [target_id for target_id in target_ids if target_id in other_dom_ids]
        ratio = float(len(du_ids)) / len(target_ids)
        logger.info('{0}: {1}/{2}, {3:.2f}'.format(
            target_dom, len(du_ids), len(target_ids), ratio))
def __init__(self, dataset_type, transform=None, collect_doc_ids=False,
             doc_ids=None, in_para_only=False):
    super(DomDetDataset, self).__init__()
    self.doms = doms_final
    self.score_func = config_model['score_func']
    self.dataset_parser = DatasetParser()

    self.dataset_type = dataset_type
    assert dataset_type in ('train', 'dev', 'test', 'test_filtered')
    if mode == 'debug':
        dataset_type = 'debug'

    self.root = path_parser.dataset_type_dp[dataset_type]
    self.fns = [fn for fn in listdir(self.root) if isfile(join(self.root, fn))]

    if doc_ids:  # restrict to the given target docs
        logger.info('Load only {} target docs'.format(len(doc_ids)))
        self.fns = [fn for fn in self.fns if tools.get_doc_id(fn) in doc_ids]

    self.transform = transform
    self.collect_doc_ids = collect_doc_ids
    self.in_para_only = in_para_only
def compute_exam_p_and_r(y_true_3d, y_pred_3d, n_sents, n_words, silent=False):
    """
        Return lists of per-sentence precision and recall values.

        :param y_true_3d: d_batch * max_n_sents * max_n_words
        :param y_pred_3d: d_batch * max_n_sents * max_n_words
        :param n_sents: d_batch * 1
        :param n_words: d_batch * max_n_sents
    """
    p_list = list()
    r_list = list()
    d_batch = y_true_3d.shape[0]

    for sample_idx in range(d_batch):
        n_sent = n_sents[sample_idx, 0]
        for sent_idx in range(n_sent):
            n_word = n_words[sample_idx, sent_idx]
            y_true = y_true_3d[sample_idx, sent_idx, :n_word]
            y_pred = y_pred_3d[sample_idx, sent_idx, :n_word]

            if not silent and not y_pred.any():
                logger.info('No pred is made for: {0}.{1}. y_pred: {2}'.format(
                    sample_idx, sent_idx, y_pred))

            p_list.append(precision_score(y_true, y_pred))
            r_list.append(recall_score(y_true, y_pred))

    return p_list, r_list
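# For intuition: per-sentence precision/recall over binary word-level masks
# (toy values, not from the dataset).
def _demo_exam_p_and_r():
    y_true = np.array([1, 0, 1, 1, 0])  # gold descriptive-word mask
    y_pred = np.array([1, 1, 1, 0, 0])  # predicted mask
    print(precision_score(y_true, y_pred))  # 2/3: two of three predicted words are gold
    print(recall_score(y_true, y_pred))     # 2/3: two of three gold words are found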
def get_wiki_summ_text_dp(order_subdir, accurate_only, multi_label_only, variation=None):
    if not variation:
        variation = config_model['variation']

    text_root = join(path_parser.summary_text_dps['wiki'], variation)
    if not exists(text_root):
        logger.info('Text root does not exist. Making directory: {}'.format(text_root))
        mkdir(text_root)

    if accurate_only:
        subdir = '{}_accurate'.format(order_subdir)
    else:
        subdir = '{}_all'.format(order_subdir)

    if multi_label_only:
        subdir = '{}_multilabel'.format(subdir)

    summ_text_dp = join(text_root, subdir)
    if not exists(summ_text_dp):
        logger.info('Saving dir does not exist. Making directory: {}'.format(summ_text_dp))
        mkdir(summ_text_dp)

    return summ_text_dp
def build_rel_scores_tf_passage(cid, query, retrieved_dp=None, tdqfs_data=False):
    _, proc_passages, passage_ids = load_retrieved_passages(cid,
                                                            get_sents=False,
                                                            retrieved_dp=retrieved_dp,
                                                            passage_ids=None,
                                                            tdqfs_data=tdqfs_data)

    # nest proc_passages once more for compatibility with _compute_rel_scores_tf,
    # which expects sentences organized by doc; todo: double check the nest level
    rel_scores = _compute_rel_scores_tf([proc_passages], query)
    logger.info('rel_scores: {}'.format(rel_scores))

    pid2score = dict(zip(passage_ids, rel_scores))
    return pid2score
def _build_components(cid, query):
    sim_items = tfidf_tools.build_sim_items_e2e_tfidf_with_lexrank(cid, query, rm_dialog=RM_DIALOG)

    sim_mat = vec_tools.norm_sim_mat(sim_mat=sim_items['doc_sim_mat'], max_min_scale=False)
    rel_vec = vec_tools.norm_rel_scores(rel_scores=sim_items['rel_scores'], max_min_scale=False)
    logger.info('rel_vec: {}'.format(rel_vec))

    if len(rel_vec) != len(sim_mat):
        raise ValueError('Incompatible sim_mat shape: {} and rel_vec shape: {} for cid: {}'.format(
            sim_mat.shape, rel_vec.shape, cid))

    processed_sents = sim_items['processed_sents']
    # map a doc-relative sentence id, e.g., "0_3", to its absolute index
    sid2abs = {}
    sid_abs = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, _ in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            sid2abs[sid] = sid_abs
            sid_abs += 1

    components = {
        'sim_mat': sim_mat,
        'rel_vec': rel_vec,
        'sid2abs': sid2abs,
    }
    return components
def _passage_core(cid, query, narr, passage_size, stride):
    # 2d lists: docs => sents
    original_sents, processed_sents = dataset_parser.cid2sents(cid, max_ns_doc=None)
    logger.info('#doc: {}'.format(len(original_sents)))

    # build sent objs, organized by doc
    sent_objs = []
    sent_idx = 0
    for doc_idx in range(len(original_sents)):
        sent_objs_doc = []
        for original_s, proc_s in zip(original_sents[doc_idx], processed_sents[doc_idx]):
            sid = config.SEP.join([cid, str(sent_idx)])
            so = SentObj(sid=sid, original_sent=original_s, proc_sent=proc_s)
            sent_objs_doc.append(so)
            sent_idx += 1
        sent_objs.append(sent_objs_doc)

    # build passage objs with a sliding window over each doc;
    # a trailing window shorter than the stride is discarded
    passage_objs = []
    for sent_objs_doc in sent_objs:
        start = 0
        while start + stride < len(sent_objs_doc):
            pid = config.SEP.join([cid, str(len(passage_objs))])
            target_sent_objs = sent_objs_doc[start:start + passage_size]
            po = PassageObj(pid=pid, query=query, narr=narr, sent_objs=target_sent_objs)
            passage_objs.append(po)
            start += stride

    return passage_objs
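# The window arithmetic on a toy doc: with passage_size=3 and stride=2
# (example values), a 7-sentence doc yields passages starting at 0, 2, and 4.
def _demo_sliding_window():
    doc = list(range(7))  # stand-in for one doc's sentence objects
    passage_size, stride = 3, 2

    start, passages = 0, []
    while start + stride < len(doc):
        passages.append(doc[start:start + passage_size])
        start += stride
    print(passages)  # [[0, 1, 2], [2, 3, 4], [4, 5, 6]]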
def compute_rouge():
    tg2recall_all = {}
    tg2f1_all = {}
    for idx in range(N_REFS):
        build_eval_dirs(summary_index=idx + 1)
        tg2recall, tg2f1 = compute_rouge_for_human(system_dp=SYSTEM_DP_TEMP,
                                                   model_dp=MODEL_DP_TEMP)
        logger.info('tg2f1: {}'.format(tg2f1))

        for metric in ROUGE_METRICS:
            if metric in tg2recall_all:
                tg2recall_all[metric] += tg2recall[metric]
            else:
                tg2recall_all[metric] = tg2recall[metric]

            if metric in tg2f1_all:
                tg2f1_all[metric] += tg2f1[metric]
            else:
                tg2f1_all[metric] = tg2f1[metric]

    # average over references
    for metric in ROUGE_METRICS:
        tg2recall_all[metric] /= N_REFS
        tg2f1_all[metric] /= N_REFS

    recall_str = 'Recall:\t{}'.format('\t'.join(
        ['{0:.2f}'.format(tg2recall_all[metric]) for metric in ROUGE_METRICS]))
    f1_str = 'F1:\t{}'.format('\t'.join(
        ['{0:.2f}'.format(tg2f1_all[metric]) for metric in ROUGE_METRICS]))

    output = '\n' + '\n'.join((f1_str, recall_str))
    logger.info(output)
def build_tdqfs_oracle_test_cid_query_dicts(query_fp):
    def _get_ref(cid):
        REF_DP = path_parser.data_tdqfs_summary_targets
        fp = join(REF_DP, '{}_{}'.format(cid, 0))
        lines = io.open(fp, encoding='utf-8').readlines()
        return ''.join([line.rstrip('\n') for line in lines])

    assert config_meta['test_year'] == 'tdqfs'
    lines = io.open(query_fp, encoding='utf-8').readlines()
    cids = [line.rstrip('\n').split('\t')[0] for line in lines]

    test_cid_query_dicts = []
    for cid in cids:
        ref = _get_ref(cid)
        logger.info('cid {}: {}'.format(cid, ref))
        test_cid_query_dicts.append({
            'cid': cid,
            'query': ref,
        })
    return test_cid_query_dicts
def test_major_sent(synthese):
    dataset_type = 'synthese' if synthese else 'lead'
    data_loader = pipe.DomDetDataLoader(dataset_type=dataset_type)

    n_iter, total_loss = 0, 0.0
    n_samples, total_hamming = 0, 0.0
    cf_mats, precision_list, recall_list = list(), list(), list()

    for batch_idx, batch in enumerate(data_loader):
        n_iter += 1

        y_true = batch['labels'].cpu().numpy()
        d_batch = len(y_true)
        des_sent_info = batch['des_sent_info'].cpu().numpy()
        n_samples += np.sum(des_sent_info[:, -1])

        if synthese:
            # y_pred_vec and max_n_sents are expected at module level
            hyp_scores = np.tile(y_pred_vec, (d_batch, 1))
            fids = batch['fids'].cpu().numpy()
            eval_args = {
                'hyp_scores': hyp_scores,
                'fids': fids,
                'is_hiernet': True,
            }
            eval_res = metrics.metric_eval_for_syn_doc(**eval_args)
        else:
            hyp_scores = np.tile(y_pred_vec, (d_batch, max_n_sents, 1))
            eval_args = {
                'y_true': y_true,
                'hyp_scores': hyp_scores,
                'des_sent_info': des_sent_info,
            }
            eval_res = metrics.metric_eval(**eval_args)

        cf_mats.append(eval_res['cf_mat_list'])
        precision_list.extend(eval_res['precision_list'])
        recall_list.extend(eval_res['recall_list'])
        total_hamming += eval_res['hamming']

    cls_f1, avg_f1 = metrics.compute_f1_with_confusion_mats(cf_mats)
    example_based_f1 = metrics.compute_example_based_f1(precision_list=precision_list,
                                                        recall_list=recall_list)
    hamming = total_hamming / n_samples

    eval_log_info = {
        'example_based_f1': example_based_f1,
        'avg_f1': avg_f1,
        'cls_f1': cls_f1,
        'hamming': hamming,
    }
    res_str = 'example_based_f1: {example_based_f1:.6f}, ' \
              'avg_f1: {avg_f1:.6f}, cls_f1: {cls_f1}, hamming: {hamming:.6f}'
    logger.info(res_str.format(**eval_log_info))
def dump_retrieval(fp, retrieved_items):
    retrieve_records = ['\t'.join(items) for items in retrieved_items]
    n_sents = rank_sent.dump_rank_records(rank_records=retrieve_records,
                                          out_fp=fp,
                                          with_rank_idx=False)
    logger.info('successfully dumped {0} retrieved items to {1}'.format(n_sents, fp))
def validate(data_type, window):
    dump_fp = join(path_parser.squad_proc, 'window_{}'.format(window),
                   '{}.tsv'.format(data_type))
    with io.open(dump_fp, encoding='utf-8') as dump_f:
        line = dump_f.readline()
        if len(line.split('\t')) != 5:
            raise ValueError('Invalid line: {}'.format(line))
    logger.info('Validated!')
def sample_human_eval_fns(self, max_dom_doc=2):
    selected_fns = []
    dict_dom_n_files = {dom: 0 for dom in self.doms_final}

    root_path = path_parser.dataset_test
    target_path = path_parser.dataset_test_human_eval

    def _is_doc_fn(path, fn):
        return (not fn.startswith('.') and len(fn.split('_')) >= 2
                and isfile(join(path, fn)))

    fns = [fn for fn in listdir(root_path) if _is_doc_fn(root_path, fn)]
    logger.info('#files: {}'.format(len(fns)))

    # count labels already covered by previously sampled files
    existing_fns = [fn for fn in listdir(target_path) if _is_doc_fn(target_path, fn)]
    for fn in existing_fns:
        labels = fn.split('_')[1:]
        for label in labels:
            dict_dom_n_files[label] += 1

    for k, v in dict_dom_n_files.items():
        logger.info('Existing: {0} - {1}'.format(k, v))

    if min(dict_dom_n_files.values()) == max_dom_doc:
        logger.error('Already full!')
        raise FileExistsError

    random.shuffle(fns)
    full_status = False
    for fn in fns:
        labels = fn.split('_')[1:]

        # skip files whose labels would overflow any domain quota
        ahead_overflow = False
        for label in labels:
            if dict_dom_n_files[label] == max_dom_doc:
                ahead_overflow = True
                break

        if not ahead_overflow and self.check_existence_of_paras(fp=join(root_path, fn)):
            selected_fns.append(fn)
            for label in labels:
                dict_dom_n_files[label] += 1
            if min(dict_dom_n_files.values()) == max_dom_doc:
                full_status = True

        if full_status:
            break

    for fn in selected_fns:
        sp.call(['cp', join(root_path, fn), join(target_path, fn)])
def __init__(self, model_out_fp_1, model_out_fp_2, gold_fp, iter=10000):
    self.obs_1 = get_obs(fp=model_out_fp_1)
    self.obs_2 = get_obs(fp=model_out_fp_2)
    self.gold = get_obs(fp=gold_fp)
    assert self.obs_1.shape == self.obs_2.shape == self.gold.shape

    self.n_obs = self.obs_1.shape[0]
    logger.info('n_obs: {0}'.format(self.n_obs))
    self.iter = iter
def _prune_rank_items(rank_items, threshold=1e-10):
    if float(rank_items[-1][1]) > threshold:
        logger.info('Prune ratio: 0.00')
        return rank_items

    for i in range(len(rank_items)):
        if float(rank_items[i][1]) <= threshold:
            logger.info('Prune ratio: {0:.2f}'.format(float(i) / len(rank_items)))
            return rank_items[:i]
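# On a toy ranking (scores are illustrative), everything at or below the
# threshold is cut and the prune ratio is logged:
#
#   rank_items = [('s0', '0.8'), ('s1', '0.3'), ('s2', '0.0'), ('s3', '0.0')]
#   _prune_rank_items(rank_items)  # -> first two items; logs 'Prune ratio: 0.50'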
def get_bert_in_func():
    if config.meta_model_name == 'bert_qa':
        from frame.bert_qa import bert_input
        bert_in_func = bert_input.build_bert_x
    else:
        from data import bert_input_sep
        bert_in_func = bert_input_sep.build_bert_x_sep

    logger.info('Using bert_in_func for meta_model_name: {}'.format(config.meta_model_name))
    return bert_in_func
def _dump_passages(year, cid, passage_objs):
    cc_dp = join(path_parser.data_passages, year, cid)
    if not exists(cc_dp):
        os.mkdir(cc_dp)

    # serialize each passage object to its own file, named by passage id
    for po in passage_objs:
        with open(join(cc_dp, po.pid), 'wb') as f:
            dill.dump(po, f)

    logger.info('[_dump_passages] Dumped {} passage objects to {}'.format(
        len(passage_objs), cc_dp))
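# The matching read path deserializes each file with dill; a minimal sketch
# (the helper name is hypothetical, not from this codebase).
def _load_passages(cc_dp):
    """Load every dumped PassageObj under one cluster directory."""
    passage_objs = []
    for pid in listdir(cc_dp):
        with open(join(cc_dp, pid), 'rb') as f:
            passage_objs.append(dill.load(f))
    return passage_objs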