示例#1
0
    def sample_human_eval_fns(self, max_dom_doc=2):
        """
            Randomly pick test files for human evaluation and copy them to the human-eval directory.

            A file name is split on '_'; every part after the first one is treated as a domain
            label and is counted against the keys taken from self.doms_final. Candidates are
            visited in random order until every domain has reached its quota or the candidates
            run out. Files already present in the target directory count towards the quota.

        :param max_dom_doc: per-domain quota of files (existing + newly selected).
        :raises FileExistsError: when every domain already holds at least max_dom_doc files.
        :raises KeyError: when a file name carries a label that is not in self.doms_final.
        """
        selected_fns = []
        dict_dom_n_files = {dom: 0 for dom in self.doms_final}

        root_path = path_parser.dataset_test
        target_path = path_parser.dataset_test_human_eval

        def _list_labelled_fns(dir_path):
            # visible regular files whose name has at least one '_'-separated label
            return [fn for fn in listdir(dir_path) if
                    not fn.startswith('.') and len(fn.split('_')) >= 2 and isfile(join(dir_path, fn))]

        def _all_doms_full():
            # '>=' rather than '==': a domain holding more files than the quota
            # (e.g. after the quota was lowered) must still count as full.
            return min(dict_dom_n_files.values()) >= max_dom_doc

        fns = _list_labelled_fns(root_path)
        logger.info('#files: {}'.format(len(fns)))

        # account for files that were copied by earlier runs
        for fn in _list_labelled_fns(target_path):
            for label in fn.split('_')[1:]:
                dict_dom_n_files[label] += 1

        for k, v in dict_dom_n_files.items():
            logger.info('Existing: {0} - {1}'.format(k, v))

        if _all_doms_full():
            logger.error('Already full!')
            raise FileExistsError

        random.shuffle(fns)

        for fn in fns:
            labels = fn.split('_')[1:]

            # skip a file if any of its domains has already reached the quota
            if any(dict_dom_n_files[label] >= max_dom_doc for label in labels):
                continue

            if not self.check_existence_of_paras(fp=join(root_path, fn)):
                continue

            selected_fns.append(fn)
            for label in labels:
                dict_dom_n_files[label] += 1

            # counts only change on selection, so this is the only place to re-check
            if _all_doms_full():
                break

        for fn in selected_fns:
            old_path = join(root_path, fn)
            new_path = join(target_path, fn)
            sp.call(['cp', old_path, new_path])
示例#2
0
def transform_test_input_2d_for_mturk(y_true_3d, y_pred_2d, n_sents):
    """
        Build sentence-level (y_true, y_pred) pairs for evaluating sentence-level domain
        detection: each document-level prediction is repeated once per sentence of its document.

    :param y_true_3d: d_batch * max_n_sents * n_doms
    :param y_pred_2d: d_batch * n_doms
    :param n_sents: d_batch * 1
    :return: y_true_sents, y_pred_sents: (d_batch * total_sents) * n_doms
    :raises ValueError: when the true and predicted rows of a document differ in shape.
    """
    true_chunks, pred_chunks = [], []

    for doc_idx in range(len(y_pred_2d)):
        doc_n_sents = n_sents[doc_idx, 0]

        # first doc_n_sents rows of the ground truth for this document
        true_rows = y_true_3d[doc_idx, 0:doc_n_sents, :]
        # the single document-level prediction, stacked doc_n_sents times
        pred_rows = np.tile(y_pred_2d[doc_idx, :], (doc_n_sents, 1))

        true_chunks.append(true_rows)
        pred_chunks.append(pred_rows)

        if true_rows.shape == pred_rows.shape:
            continue

        logger.error('n_sents: {0}'.format(n_sents[doc_idx]))
        logger.error('y_true_sents_items: {0}'.format(true_rows))
        logger.error('y_pred_sents_items: {0}'.format(pred_rows))
        raise ValueError

    y_true_sents, y_pred_sents = np.concatenate(true_chunks), np.concatenate(pred_chunks)

    assert y_true_sents.shape == y_pred_sents.shape

    return y_true_sents, y_pred_sents
示例#3
0
def transform_test_input_2d(y_true_2d, y_pred_2d, des_sent_info):
    """
        Build sentence-level (y_true, y_pred) pairs for evaluating sentence-level domain
        detection: both the label row and the prediction row of a sample are repeated
        once per sentence.

    :param y_true_2d: d_batch * n_doms
    :param y_pred_2d: d_batch * n_doms
    :param des_sent_info: d_batch * [start_sent_idx, end_sent_idx, n_sents]
    :return: y_true_sents, y_pred_sents: (d_batch * n_sents) * n_doms
    :raises ValueError: when the repeated label and prediction rows differ in shape.
    """
    true_chunks, pred_chunks = [], []

    for sample_idx in range(len(y_pred_2d)):
        # only the sentence count is needed; the start/end indices are not used here
        _, _, sent_count = des_sent_info[sample_idx]
        reps = (sent_count, 1)

        true_rows = np.tile(y_true_2d[sample_idx, :], reps)
        pred_rows = np.tile(y_pred_2d[sample_idx, :], reps)

        true_chunks.append(true_rows)
        pred_chunks.append(pred_rows)

        if true_rows.shape == pred_rows.shape:
            continue

        logger.error('des_sent_info: {0}'.format(des_sent_info[sample_idx]))
        logger.error('y_true_sents_items: {0}'.format(true_rows))
        logger.error('y_pred_sents_items: {0}'.format(pred_rows))
        raise ValueError

    y_true_sents, y_pred_sents = np.concatenate(true_chunks), np.concatenate(pred_chunks)

    assert y_true_sents.shape == y_pred_sents.shape

    return y_true_sents, y_pred_sents
示例#4
0
        def _get_para_info():
            """
                Collect every match of para_pattern in text.

                Reads para_pattern, text and xml_fp from the enclosing scope.

            :return: list of (matched string, (start, end) span) tuples, in match order.
            :raises AssertionError: when the pattern does not match anywhere in text.
            """
            matcher = re.compile(para_pattern)
            para_info = [(m.group(), m.span()) for m in matcher.finditer(text)]

            if para_info:
                return para_info

            logger.error('No para in {0}'.format(xml_fp))
            raise AssertionError
示例#5
0
def transform_test_input_for_syn_doc(y_pred,
                                     fids,
                                     label_fns,
                                     tile_y_pred,
                                     zero_as_no=False):
    """
        Pair predictions with the sentence-level ground truth looked up per file id.

        For every entry of fids the ground truth is fetched with get_y_true_by_fid; its
        length defines the number of sentences, and the matching prediction is either
        repeated that many times (tile_y_pred) or truncated to that many rows.

    :param y_pred: d_batch * n_sents * n_doms or d_batch * n_doms
    :param fids: one entry per sample; only entry[0] is passed on as the file id.
    :param label_fns: forwarded unchanged to get_y_true_by_fid.
    :param tile_y_pred: True for y_pred (d_batch * n_doms)
    :param zero_as_no: forwarded unchanged to get_y_true_by_fid.
    :return: y_true_sents, y_pred_sents: both concatenated over all samples, same shape.
    :raises AssertionError: when ground truth and predictions differ in shape.
    """
    y_pred_sents = list()
    y_true_sents = list()

    for s_idx, fid in enumerate(fids):
        y_true_sents_items = get_y_true_by_fid(label_fns, fid[0], zero_as_no)
        y_true_sents.append(y_true_sents_items)

        n_sents = len(y_true_sents_items)

        if tile_y_pred:
            # one document-level prediction, repeated for every sentence
            y_pred_sents_items = np.tile(y_pred[s_idx, :], (n_sents, 1))
        else:
            # sentence-level predictions: keep the first n_sents rows
            y_pred_sents_items = y_pred[s_idx, :n_sents, :]

        y_pred_sents.append(y_pred_sents_items)

        if y_true_sents_items.shape != y_pred_sents_items.shape:
            logger.error(
                'Shape: y_true_sents_items: {0}, y_pred_sents_items: {1}'.
                format(y_true_sents_items.shape, y_pred_sents_items.shape))
            logger.error('y_true_sents_items: {0}'.format(y_true_sents_items))
            logger.error('y_pred_sents_items: {0}'.format(y_pred_sents_items))
            # explicit raise instead of `assert False`: asserts are removed under
            # `python -O`, which would let mismatched data through silently
            raise AssertionError(
                'y_true / y_pred shape mismatch: {0} vs {1}'.format(
                    y_true_sents_items.shape, y_pred_sents_items.shape))

    y_true_sents = np.concatenate(y_true_sents)
    y_pred_sents = np.concatenate(y_pred_sents)

    if y_true_sents.shape != y_pred_sents.shape:
        raise AssertionError(
            'y_true / y_pred shape mismatch after concatenation: {0} vs {1}'.format(
                y_true_sents.shape, y_pred_sents.shape))

    return y_true_sents, y_pred_sents