def sample_human_eval_fns(self, max_dom_doc=2):
    """
    Sample test files into the human-eval directory, balanced per domain.

    Scans path_parser.dataset_test for candidate files (file names carry
    '_'-separated domain labels after the first token), counts how many files
    per domain already sit in path_parser.dataset_test_human_eval, then
    randomly samples additional files until every domain has max_dom_doc
    documents, and copies the selected files into the target directory.

    :param max_dom_doc: target number of documents per domain.
    :raises FileExistsError: if every domain already holds max_dom_doc files.
    """
    import shutil  # local import: portable stdlib copy instead of shelling out to `cp`

    dict_dom_n_files = {dom: 0 for dom in self.doms_final}
    root_path = path_parser.dataset_test
    target_path = path_parser.dataset_test_human_eval

    def _eligible(dir_path, fn):
        # regular files only, no hidden files, and the name must carry >= 1 label
        return (not fn.startswith('.') and len(fn.split('_')) >= 2
                and isfile(join(dir_path, fn)))

    fns = [fn for fn in listdir(root_path) if _eligible(root_path, fn)]
    logger.info('#files: {}'.format(len(fns)))

    existing_fns = [fn for fn in listdir(target_path) if _eligible(target_path, fn)]
    for fn in existing_fns:
        for label in fn.split('_')[1:]:
            dict_dom_n_files[label] += 1
    for k, v in dict_dom_n_files.items():
        logger.info('Existing: {0} - {1}'.format(k, v))

    # the least-filled domain decides whether the sample is already complete
    if min(dict_dom_n_files.values()) == max_dom_doc:
        logger.error('Already full!')
        raise FileExistsError

    random.shuffle(fns)
    selected_fns = []
    for fn in fns:
        labels = fn.split('_')[1:]
        # skip a file if any of its domains is already at quota
        if any(dict_dom_n_files[label] == max_dom_doc for label in labels):
            continue
        if not self.check_existence_of_paras(fp=join(root_path, fn)):
            continue
        selected_fns.append(fn)
        for label in labels:
            dict_dom_n_files[label] += 1
        if min(dict_dom_n_files.values()) == max_dom_doc:
            break

    for fn in selected_fns:
        # shutil.copy is cross-platform; the original spawned `cp` via subprocess
        shutil.copy(join(root_path, fn), join(target_path, fn))
def transform_test_input_2d_for_mturk(y_true_3d, y_pred_2d, n_sents):
    """
    Expand doc-level predictions to sentence level for MTurk evaluation.

    :param y_true_3d: d_batch * max_n_sents * n_doms gold labels
    :param y_pred_2d: d_batch * n_doms doc-level predictions
    :param n_sents: d_batch * 1 sentence counts per doc
    :return: (y_true_sents, y_pred_sents), each (total_sents) * n_doms
    """
    true_chunks = list()
    pred_chunks = list()
    for idx in range(len(y_pred_2d)):
        count = n_sents[idx, 0]
        # take only the real sentences of this doc from the padded gold tensor
        truth = y_true_3d[idx, 0:count, :]
        true_chunks.append(truth)
        # repeat the doc-level prediction once per sentence
        pred = np.tile(y_pred_2d[idx, :], (count, 1))
        pred_chunks.append(pred)
        if truth.shape != pred.shape:
            logger.error('n_sents: {0}'.format(n_sents[idx]))
            logger.error('y_true_sents_items: {0}'.format(truth))
            logger.error('y_pred_sents_items: {0}'.format(pred))
            raise ValueError
    y_true_sents = np.concatenate(true_chunks)
    y_pred_sents = np.concatenate(pred_chunks)
    assert y_true_sents.shape == y_pred_sents.shape
    return y_true_sents, y_pred_sents
def transform_test_input_2d(y_true_2d, y_pred_2d, des_sent_info):
    """
    Expand doc-level labels and predictions to sentence level.

    :param y_true_2d: d_batch * n_doms gold labels
    :param y_pred_2d: d_batch * n_doms doc-level predictions
    :param des_sent_info: d_batch * [start_sent_idx, end_sent_idx, n_sents]
    :return: (y_true_sents, y_pred_sents), each (d_batch * n_sents) * n_doms
    """
    true_chunks = list()
    pred_chunks = list()
    for idx in range(len(y_pred_2d)):
        _, _, count = des_sent_info[idx]
        # repeat both the gold label and the prediction once per sentence
        truth = np.tile(y_true_2d[idx, :], (count, 1))
        true_chunks.append(truth)
        pred = np.tile(y_pred_2d[idx, :], (count, 1))
        pred_chunks.append(pred)
        if truth.shape != pred.shape:
            logger.error('des_sent_info: {0}'.format(des_sent_info[idx]))
            logger.error('y_true_sents_items: {0}'.format(truth))
            logger.error('y_pred_sents_items: {0}'.format(pred))
            raise ValueError
    y_true_sents = np.concatenate(true_chunks)
    y_pred_sents = np.concatenate(pred_chunks)
    assert y_true_sents.shape == y_pred_sents.shape
    return y_true_sents, y_pred_sents
def _get_para_info():
    """
    Return a list of (paragraph_text, (start, end)) pairs for every match of
    the enclosing scope's para_pattern in the enclosing scope's text.

    :raises AssertionError: if the pattern matches nothing in the document.
    """
    para_matches = list(re.finditer(re.compile(para_pattern), text))
    if not para_matches:
        logger.error('No para in {0}'.format(xml_fp))
        raise AssertionError
    # pair each matched paragraph string with its character span
    para_info = [(m.group(), m.span()) for m in para_matches]
    return para_info
def transform_test_input_for_syn_doc(y_pred, fids, label_fns, tile_y_pred, zero_as_no=False):
    """
    Align predictions with sentence-level gold labels for synthetic docs.

    :param y_pred: d_batch * n_sents * n_doms (tile_y_pred=False)
        or d_batch * n_doms (tile_y_pred=True)
    :param fids: batch of file ids; fid[0] is the id passed to get_y_true_by_fid
    :param label_fns: label file names forwarded to get_y_true_by_fid
    :param tile_y_pred: True when y_pred is doc-level (d_batch * n_doms) and
        must be repeated once per sentence
    :param zero_as_no: forwarded to get_y_true_by_fid
    :return: y_true_sents, y_pred_sents: (total_sents) * n_doms
    :raises AssertionError: if a doc's gold and predicted shapes disagree
    """
    y_pred_sents = list()
    y_true_sents = list()
    for s_idx, fid in enumerate(fids):
        y_true_sents_items = get_y_true_by_fid(label_fns, fid[0], zero_as_no)
        y_true_sents.append(y_true_sents_items)
        n_sents = len(y_true_sents_items)
        if tile_y_pred:
            # doc-level prediction: repeat it for every sentence of the doc
            y_pred_sents_items = np.tile(y_pred[s_idx, :], (n_sents, 1))
        else:
            # sentence-level prediction: drop the padded tail
            y_pred_sents_items = y_pred[s_idx, :n_sents, :]
        y_pred_sents.append(y_pred_sents_items)
        if y_true_sents_items.shape != y_pred_sents_items.shape:
            logger.error(
                'Shape: y_true_sents_items: {0}, y_pred_sents_items: {1}'.
                format(y_true_sents_items.shape, y_pred_sents_items.shape))
            logger.error('y_true_sents_items: {0}'.format(y_true_sents_items))
            logger.error('y_pred_sents_items: {0}'.format(y_pred_sents_items))
            # explicit raise: the original `assert False` is stripped under -O
            raise AssertionError
    y_true_sents = np.concatenate(y_true_sents)
    y_pred_sents = np.concatenate(y_pred_sents)
    assert y_true_sents.shape == y_pred_sents.shape
    return y_true_sents, y_pred_sents