Example No. 1
    def build_cartesian_domain_sent_corpora(self):
        """
            aggregate descriptive sentences by their labels.
        """

        des_pattern_stripped = re.compile(self.DES_PATTERN_STRIPPED)

        sent_pattern = re.compile(self.SENT_PATTERN_TAGGED)  # reserve start and end

        dom_sents_dict = dict()

        src_fns = [fn for fn in listdir(self.dp_dataset_test_filtered)
                   if isfile(join(self.dp_dataset_test_filtered, fn))]

        for src_fn in tqdm(src_fns):
            label = '_'.join(src_fn.split('_')[1:])
            src_fp = join(self.dp_dataset_test_filtered, src_fn)
            with io.open(src_fp, encoding='utf-8') as src_f:
                text_stripped = re.findall(des_pattern_stripped, src_f.read())[0]
            sents_stripped = re.findall(sent_pattern, text_stripped)

            if label in dom_sents_dict:
                dom_sents_dict[label].extend(sents_stripped)
            else:
                dom_sents_dict[label] = sents_stripped

        for dom, sents in dom_sents_dict.items():
            logger.info('{0} sents: {1}'.format(dom, str(len(sents))))
            out_fp = join(self.dp_dataset_dom_sent_corpora, dom)
            with io.open(out_fp, mode='a', encoding='utf-8') as out_f:
                out_f.write('\n'.join(sents))
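The label here is recovered from the filename, which is assumed to encode the domain labels after the first underscore-separated field; a tiny illustration with a hypothetical filename:

src_fn = '0421_sports_politics'  # hypothetical: <doc_id>_<dom_1>_<dom_2>
label = '_'.join(src_fn.split('_')[1:])
print(label)  # 'sports_politics'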
Example No. 2
def _build_components(cid, query):
    sim_items = tfidf_tools.build_sim_items_e2e(cid,
                                                query,
                                                mask_intra=mask_intra)
    rel_scores = sim_items['rel_scores']
    sim_mat = sim_items['doc_sim_mat']
    processed_sents = sim_items['processed_sents']

    rel_vec = rel_scores / np.sum(rel_scores)  # l1 norm to make a distribution

    np.fill_diagonal(sim_mat, 0.0)  # avoid self-transition
    logger.info('sim_mat: {}'.format(sim_mat))

    sid2abs = {}
    sid_abs = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, sent in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            sid2abs[sid] = sid_abs
            sid_abs += 1

    components = {
        'sim_mat': sim_mat,
        'rel_vec': rel_vec,
        'sid2abs': sid2abs,
    }

    return components
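These components are presumably consumed by a query-biased LexRank-style ranker (the variant in Example No. 19 builds them via build_sim_items_e2e_tfidf_with_lexrank). A minimal sketch of such a consumer, assuming a damped random walk that teleports according to rel_vec; the damping factor and iteration count here are illustrative, not taken from the source:

import numpy as np

def rank_with_components(components, damping=0.85, n_iters=100):
    """Sketch: power iteration over the sentence graph, biased by relevance."""
    sim_mat, rel_vec = components['sim_mat'], components['rel_vec']

    # row-normalize the similarity matrix into a transition matrix
    row_sums = sim_mat.sum(axis=1, keepdims=True)
    trans = np.divide(sim_mat, row_sums,
                      out=np.zeros_like(sim_mat), where=row_sums > 0)

    scores = np.full(len(rel_vec), 1.0 / len(rel_vec))
    for _ in range(n_iters):
        scores = damping * trans.T @ scores + (1.0 - damping) * rel_vec

    return scores  # map back to sentence ids via components['sid2abs']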
Example No. 3
def compute_rouge_for_oracle():
    """
        The rec dp for oracle saves text for comparing against refecence.

    :return:
    """
    ir_rec_dp = join(path_parser.summary_rank,
                     ir_config.IR_RECORDS_DIR_NAME_TF)

    if exists(ir_rec_dp):
        raise ValueError('ir_rec_dp exists: {}'.format(ir_rec_dp))
    os.mkdir(ir_rec_dp)

    cids = tools.get_test_cc_ids()
    for cid in tqdm(cids):
        retrieval_params = {
            'model_name': ir_config.IR_MODEL_NAME_TF,
            'cid': cid,
            'filter_var': ir_config.FILTER_VAR,
            'filter': ir_config.FILTER,
            'deduplicate': ir_config.DEDUPLICATE,
            'prune': True,
        }

        retrieved_items = ir_tools.retrieve(**retrieval_params)
        summary = '\n'.join([item[-1] for item in retrieved_items])
        with open(join(ir_rec_dp, cid), mode='a', encoding='utf-8') as out_f:
            out_f.write(summary)

    performance = rouge.compute_rouge_for_ablation_study(ir_rec_dp)
    logger.info(performance)
Example No. 4
def init():
    # parse args
    parser = ArgumentParser()
    parser.add_argument(
        'n_devices',
        nargs='?',
        default=4,
        help='number of devices on which the model will run')

    args = parser.parse_args()
    all_device_ids = [0, 1, 2, 3]
    device = all_device_ids[:int(args.n_devices)]
    config_meta['device'] = device

    if not torch.cuda.is_available():
        placement = 'cpu'
        logger.info('[MAIN INIT] path mode: {0}, placement: {1}'.format(
            config.path_type, placement))
    else:
        if len(device) == 1:
            placement = 'single'
            torch.cuda.set_device(device[0])
        elif config_meta['auto_parallel']:
            placement = 'auto'
        else:
            placement = 'manual'

        logger.info(
            '[MAIN INIT] path mode: {0}, placement: {1}, n_devices: {2}'.
            format(config.path_type, placement, args.n_devices))
    config_meta['placement'] = placement
Example No. 5
def _compute_rel_scores_tf_dot(processed_sents, query):
    """

    :param processed_sents:
    :param query_sents:
    :param mask_intra:
    :return:
    """
    doc_sents = list(itertools.chain(*processed_sents))  # flatten to a 1d sent list
    doc_sents = copy.deepcopy(
        doc_sents)  # avoid affecting the original doc_sents list
    doc_sents.append(query)

    tf_mat = get_tf_mat(sents=doc_sents).toarray()
    rel_scores = np.matmul(tf_mat[:-1], tf_mat[-1])  # dot each sentence row with the query row

    logger.info('rel_scores: {}'.format(rel_scores.shape))

    return rel_scores
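In isolation, the relevance computation is just a dot product between raw term-frequency vectors. A self-contained sketch with scikit-learn's CountVectorizer standing in for get_tf_mat (an assumption; the real helper may tokenize differently):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

doc_sents = ['the cat sat', 'dogs bark loudly']
query = 'cat sat quietly'

tf_mat = CountVectorizer().fit_transform(doc_sents + [query]).toarray()
rel_scores = np.matmul(tf_mat[:-1], tf_mat[-1])  # dot each sentence with the query
print(rel_scores)  # [2 0]: two overlapping terms with the first sentence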
Example No. 6
    def approx_rand(self):
        obs_diff = self._obs_diff()

        def _iter(iter_idx):
            logger.info('iter_idx: {0}'.format(iter_idx))
            reserve = np.random.randint(2, size=[self.n_obs, 1])
            n_ways = self.obs_1.shape[1]  # one column per evaluation measure
            reserve_mat = np.tile(reserve, [1, n_ways])
            flip_mat = 1 - reserve_mat

            x_stat = np.multiply(self.obs_1, reserve_mat) + np.multiply(
                self.obs_2, flip_mat)
            y_stat = np.multiply(self.obs_2, reserve_mat) + np.multiply(
                self.obs_1, flip_mat)

            diff = self._diff(x_stat, y_stat)
            # logger.info('diff: {0}, x_stat: {1}, y_stat: {2}'.format(diff, x_stat, y_stat))

            if diff >= obs_diff:
                return 1
            else:
                return 0

        pool = ThreadPool(6)
        better = float(sum(pool.map(_iter, range(self.iter))))

        self.p = (better + 1) / (self.iter + 1)

        logger.info('p-value: {0}'.format(self.p))
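approx_rand implements a paired approximate randomization test: each observation's scores are swapped between the two systems with probability 0.5, and the p-value is the smoothed fraction of resampled differences at least as large as the observed one. A self-contained restatement of the same idea, assuming 2-D score matrices (one row per observation) and an absolute-difference-of-means statistic for _diff:

import numpy as np

def approx_rand_p_value(obs_1, obs_2, n_iter=10000, seed=0):
    rng = np.random.default_rng(seed)
    obs_diff = abs(obs_1.mean() - obs_2.mean())
    better = 0
    for _ in range(n_iter):
        # swap each paired observation between systems with probability 0.5
        flip = rng.integers(2, size=obs_1.shape[0]).astype(bool)
        x = np.where(flip[:, None], obs_2, obs_1)
        y = np.where(flip[:, None], obs_1, obs_2)
        if abs(x.mean() - y.mean()) >= obs_diff:
            better += 1
    return (better + 1) / (n_iter + 1)  # add-one smoothing, matching self.p above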
Example No. 7
    def get_doc_by_head(self, head, use_head_id=False):
        """
            get a doc as a string by its head (default) or head id (set use_head_id = True).
        """
        if lang != 'en' or use_head_id:
            head_id = head
        else:
            head_id = self.get_head_id(head)
            if not head_id:
                return None

        if lang == 'en':
            file_id = self.get_file_id(head_id)
            fp_wiki_file = self.fp_wiki_files[file_id]
        else:
            fp_wiki_file = self.fp_wiki_file

        with io.open(fp_wiki_file, encoding='utf-8') as f:
            data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            pattern = re.compile(
                self.DOC_PATTERN.format(head_id).encode('utf-8'))
            match = re.search(pattern, data)
            if match:
                logger.info('succeeded in extracting doc: {0}'.format(head_id))
                return match.group().decode('utf-8')
            else:
                logger.info('failed to extract doc: {0}'.format(head_id))
                return None
Example No. 8
    def _proc_sent(self,
                   sent,
                   rm_dialog,
                   rm_stop,
                   stem,
                   rm_short=None,
                   min_nw_sent=3):
        sent = sent.lower()
        sent = re.sub(r'\s+', ' ', sent).strip()  # remove extra spaces

        if not sent:
            return None

        if rm_short and len(nltk.tokenize.word_tokenize(sent)) < min_nw_sent:
            return None

        if rm_dialog:
            dialog_tokens = ["''", "``"]
            for tk in dialog_tokens:
                if tk in sent:
                    logger.info('Remove dialog')
                    return None

            if config.test_year == '2005' and sent[0] == "'" and (
                    'says' in sent or 'said' in sent):
                logger.info('Remove dialog')
                return None

        if rm_stop:
            sent = remove_stopwords(sent)

        if stem:
            sent = self.porter_stemmer.stem_sentence(sent)

        return sent
Example No. 9
def compute_rouge(model_name,
                  n_iter=None,
                  diversity_param_tuple=None,
                  cos_threshold=None,
                  extra=None):
    rouge_args = '-a -l 250 -n 2 -m -2 4 -u -c 95 -r 1000 -f A -p 0.5 -t 0 -d -e {} -x'.format(
        path_parser.rouge_dir)

    r = Rouge155(rouge_args=rouge_args)

    baselines_wo_config = ['lead', 'lead-2006', 'lead-2007', 'lead_2007']
    if model_name in baselines_wo_config or model_name.startswith('duc'):
        text_dp = join(path_parser.summary_text, model_name)
    else:
        text_dp = tools.get_text_dp(
            model_name,
            cos_threshold=cos_threshold,
            n_iter=n_iter,
            diversity_param_tuple=diversity_param_tuple,
            extra=extra)

    r.system_dir = text_dp
    r.model_dir = join(path_parser.data_summary_targets, config.test_year)
    gen_sys_file_pat = r'(\w*)'
    gen_model_file_pat = r'#ID#_[\d]'

    r.system_filename_pattern = gen_sys_file_pat
    r.model_filename_pattern = gen_model_file_pat

    output = r.convert_and_evaluate()
    output = proc_output(output)
    logger.info(output)
    return output
Example No. 10
def rank_e2e():
    """

    :param pool_func: avg, max, or None (for integrated query).
    :return:
    """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)
    test_cid_query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=ir_config.CONCAT_TITLE_NARR,
        query_type=ir_config.QUERY_TYPE)

    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        params = {
            **cid_query_dict,
        }
        rank_records = _rank(**params)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, params['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
Example No. 11
    def check_cross_du_2(self):
        dom_ids_list = list()
        unique_ids = list()
        for dom in doms_final:
            id_fp = os.path.join(self.dp_doc_dedu, dom + '_ids.txt')
            with io.open(id_fp, encoding='utf-8') as id_f:
                dom_ids = [l.rstrip('\n') for l in id_f.readlines() if l]
                dom_ids_list.append(dom_ids)
                unique_ids.extend(
                    [dom_id for dom_id in dom_ids if dom_id not in unique_ids])

        n_label_list = [0] * (len(doms_final) + 1)  # allow for ids labeled with every domain
        for unique_id in unique_ids:
            n_labels = 0
            for dom_ids in dom_ids_list:
                if unique_id in dom_ids:
                    n_labels += 1
            n_label_list[n_labels] += 1

        n_label_ratio = [
            '{0:.2f}'.format(float(n_label) / sum(n_label_list))
            for n_label in n_label_list
        ]
        logger.info('#labels: {}'.format(n_label_list))
        logger.info('ratio of #labels: {}'.format(n_label_ratio))
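A toy run of the counting logic above, with three hypothetical domains (values fabricated for illustration):

dom_ids_list = [['a', 'b'], ['b', 'c'], ['b']]  # dedu ids per domain
unique_ids = ['a', 'b', 'c']

n_label_list = [0] * (len(dom_ids_list) + 1)  # index k = number of labels
for unique_id in unique_ids:
    k = sum(unique_id in dom_ids for dom_ids in dom_ids_list)
    n_label_list[k] += 1

print(n_label_list)  # [0, 2, 0, 1]: 'a' and 'c' carry 1 label, 'b' carries 3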
Example No. 12
    def get_head_id(self, head):
        if head in self.head_to_id:
            return self.head_to_id[head]
        else:
            logger.info('doc out of list: {0}'.format(head))
            return None
Example No. 13
    def __init__(self):
        self.fp_non_ge_ids = path_parser.data_non_ge_ids
        self.fp_clean_stats = path_parser.data_clean_stats

        # join: doc & doms (8 classes)
        self.dp_doc = path_parser.data_doc
        self.dom_dp = dict()
        for dom in doms:
            self.dom_dp[dom] = os.path.join(self.dp_doc, dom)

        # join: doc_dedu & doms_final (7 classes)
        self.dp_doc_dedu = path_parser.data_doc_dedu
        self.dom_dedu_dp = dict()
        for dom in doms_final:
            self.dom_dedu_dp[dom] = os.path.join(self.dp_doc_dedu, dom)

        self.dp_proj_root = path_parser.proj_root

        self.SUBCATEGORIES = 'subcategories'
        self.PAGES = 'pages'

        if lang == 'en':
            self.START_DOC, self.END_DOC = '#s-doc\t{0}', '#e-doc\t{0}'
            self.START_SEC, self.END_SEC = 's-headline', '#e-headline'
            self.DOC_PATTERN = '#s-doc\t{0}[\s\S]*#e-doc\t{0}'
            self.SEC_PATTERN = '#s-headline[\s\S]*?(?=#s-headline)'
            self.SEC_PATTERN_LAST = '#s-headline\t{0}[\s\S]*?(?=#e-doc)'

            self.fp_wiki_corpus_head = path_parser.wiki_corpus_head
            self.fp_wiki_files = list()

            for i in range(24):
                fn = 'wikipedia-tagged_{0:02d}.txt'.format(i)
                self.fp_wiki_files.append(
                    os.path.join(path_parser.wiki_corpus, fn))

            # self.heads = list()
            self.id_to_head, self.head_to_id = dict(), dict()
            with io.open(self.fp_wiki_corpus_head, encoding='utf-8') as f:
                for line in f:
                    items = line.split('\t')
                    if len(items) >= 2:
                        head_id, head = int(items[0]), items[1]
                        self.id_to_head[head_id] = head
                        self.head_to_id[head] = head_id
                        # self.heads.append(head)

            logger.info('#articles: {0}'.format(len(self.id_to_head)))
        else:
            self.START_DOC, self.END_DOC = '<article name="{0}">', '</article>'
            self.START_PARAGRAPH, self.END_PARAGRAPH = '<p>', '</p>'
            self.START_HEADLINE, self.END_HEADLINE = '<h>', '</h>'
            self.DOC_PATTERN = self.START_DOC + '[\s\S]*?' + self.END_DOC
            self.fp_wiki_file = path_parser.wiki_corpus

        # multi-threads
        self.n_extraction_threads = 5  # 10
        self.n_file_threads = 5  # 6
Example No. 14
    def check_cross_du(self):
        for target_dom in doms_final:
            doms_final_copy = list(doms_final)
            doms_final_copy.remove(target_dom)
            other_dom_ids = list()

            for dom in doms_final_copy:
                id_fp = os.path.join(self.dp_doc_dedu, dom + '_ids.txt')
                with io.open(id_fp, encoding='utf-8') as id_f:
                    ids = [l.rstrip('\n') for l in id_f.readlines() if l]
                    other_dom_ids.extend(ids)

            target_id_fp = os.path.join(self.dp_doc_dedu,
                                        target_dom + '_ids.txt')

            with io.open(target_id_fp, encoding='utf-8') as target_id_f:
                target_ids = [
                    l.rstrip('\n') for l in target_id_f.readlines() if l
                ]
                du_ids = [
                    target_id for target_id in target_ids
                    if target_id in other_dom_ids
                ]
                ratio = float(len(du_ids)) / len(target_ids)
                logger.info('{0}: {1}/{2}, {3:.2f}'.format(
                    target_dom, len(du_ids), len(target_ids), ratio))
Example No. 15
    def __init__(self,
                 dataset_type,
                 transform=None,
                 collect_doc_ids=False,
                 doc_ids=None,
                 in_para_only=False):
        super(DomDetDataset, self).__init__()
        self.doms = doms_final

        self.score_func = config_model['score_func']

        self.dataset_parser = DatasetParser()
        self.dataset_type = dataset_type

        assert dataset_type in ('train', 'dev', 'test', 'test_filtered')
        if mode == 'debug':
            dataset_type = 'debug'

        self.root = path_parser.dataset_type_dp[dataset_type]
        self.fns = [
            fn for fn in listdir(self.root) if isfile(join(self.root, fn))
        ]
        if doc_ids:  # restrict the dataset to the given target docs
            logger.info('Load only {} target docs'.format(len(doc_ids)))
            self.fns = [
                fn for fn in self.fns if tools.get_doc_id(fn) in doc_ids
            ]

        self.transform = transform
        self.collect_doc_ids = collect_doc_ids
        self.in_para_only = in_para_only
Example No. 16
def compute_exam_p_and_r(y_true_3d, y_pred_3d, n_sents, n_words, silent=False):
    """
        return a list of precision values.

    :param y_true_4d: d_batch * max_n_sents * max_n_words * 1
    :param y_pred_4d: d_batch * max_n_sents * max_n_words * 1
    :param n_sents: d_batch * 1
    :param n_words: d_batch * max_n_sents * 1
    """
    p_list = list()
    r_list = list()
    d_batch = y_true_3d.shape[0]

    for sample_idx in range(d_batch):
        n_sent = n_sents[sample_idx, 0]
        for sent_idx in range(n_sent):
            n_word = n_words[sample_idx, sent_idx]
            y_true = y_true_3d[sample_idx, sent_idx, :n_word]
            y_pred = y_pred_3d[sample_idx, sent_idx, :n_word]
            if not silent and not y_pred.any():
                logger.info('No pred is made for: {0}.{1}. y_pred: {2}'.format(
                    sample_idx, sent_idx, y_pred))

            p_list.append(precision_score(y_true, y_pred))
            r_list.append(recall_score(y_true, y_pred))

    return p_list, r_list
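A toy invocation clarifying the expected shapes (all values fabricated for illustration):

import numpy as np

# 1 sample, up to 2 sentences, up to 4 words per sentence
y_true_3d = np.array([[[1, 0, 1, 0], [0, 1, 0, 0]]])
y_pred_3d = np.array([[[1, 0, 0, 0], [0, 1, 0, 0]]])
n_sents = np.array([[2]])     # d_batch * 1
n_words = np.array([[4, 2]])  # d_batch * max_n_sents

p_list, r_list = compute_exam_p_and_r(y_true_3d, y_pred_3d, n_sents, n_words)
print(p_list, r_list)  # [1.0, 1.0] [0.5, 1.0]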
Example No. 17
def get_wiki_summ_text_dp(order_subdir,
                          accurate_only,
                          multi_label_only,
                          variation=None):
    if not variation:
        variation = config_model['variation']
    text_root = join(path_parser.summary_text_dps['wiki'], variation)

    if not exists(text_root):
        logger.info("Saving root does not exist. Making directory {}".format(
            text_root))
        mkdir(text_root)

    if accurate_only:
        subdir = '{}_accurate'.format(order_subdir)
    else:
        subdir = '{}_all'.format(order_subdir)

    if multi_label_only:
        subdir = '{}_multilabel'.format(subdir)

    summ_text_dp = join(text_root, subdir)
    if not exists(summ_text_dp):
        logger.info("Saving root does not exist. Making directory {}".format(
            summ_text_dp))
        mkdir(summ_text_dp)

    return summ_text_dp
Example No. 18
def build_rel_scores_tf_passage(cid,
                                query,
                                retrieved_dp=None,
                                tdqfs_data=False):
    _, proc_passages, passage_ids = load_retrieved_passages(
        cid,
        get_sents=False,
        retrieved_dp=retrieved_dp,
        passage_ids=None,
        tdqfs_data=tdqfs_data)
    rel_scores = _compute_rel_scores_tf(
        [proc_passages], query
    )  # nest proc_passages again for compatibility; todo: double check the nest level
    logger.info('rel_scores: {}'.format(rel_scores))

    pid2score = {}
    for pid, score in zip(passage_ids, rel_scores):
        pid2score[pid] = score

    return pid2score
Example No. 19
def _build_components(cid, query):
    sim_items = tfidf_tools.build_sim_items_e2e_tfidf_with_lexrank(cid, query, rm_dialog=RM_DIALOG)

    sim_mat = vec_tools.norm_sim_mat(sim_mat=sim_items['doc_sim_mat'], max_min_scale=False)
    rel_vec = vec_tools.norm_rel_scores(rel_scores=sim_items['rel_scores'], max_min_scale=False)
    logger.info('rel_vec: {}'.format(rel_vec))

    if len(rel_vec) != len(sim_mat):
        raise ValueError('Incompatible sim_mat size: {} and rel_vec size: {} for cid: {}'.format(
            sim_mat.shape, rel_vec.shape, cid))

    processed_sents = sim_items['processed_sents']
    sid2abs = {}
    sid_abs = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, sent in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            sid2abs[sid] = sid_abs
            sid_abs += 1

    components = {
        'sim_mat': sim_mat,
        'rel_vec': rel_vec,
        'sid2abs': sid2abs,
    }

    return components
Example No. 20
def _passage_core(cid, query, narr, passage_size, stride):
    original_sents, processed_sents = dataset_parser.cid2sents(cid, max_ns_doc=None)  # 2d lists, docs => sents
    logger.info('#doc: {}'.format(len(original_sents)))

    # build sent_objs
    sent_objs = []  # organized by doc
    sent_idx = 0
    for doc_idx in range(len(original_sents)):
        sent_objs_doc = []
        for original_s, proc_s in zip(original_sents[doc_idx], processed_sents[doc_idx]):
            sid = config.SEP.join([cid, str(sent_idx)])
            so = SentObj(sid=sid, original_sent=original_s, proc_sent=proc_s)
            sent_objs_doc.append(so)
            sent_idx += 1

        sent_objs.append(sent_objs_doc)

    # build passage objs
    passage_objs = []
    for sent_objs_doc in sent_objs:
        start = 0
        # stop early so that a trailing chunk of at most `stride` sentences is discarded
        while start + stride < len(sent_objs_doc):
            pid = config.SEP.join([cid, str(len(passage_objs))])

            target_sent_objs = sent_objs_doc[start:start+passage_size]
            po = PassageObj(pid=pid, query=query, narr=narr, sent_objs=target_sent_objs)
            passage_objs.append(po)

            start += stride

    return passage_objs
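The loop above slides a window of passage_size sentences forward by stride sentences at a time; the window arithmetic in isolation (illustrative values):

def window_starts(n_sents, stride):
    """Start offsets produced by _passage_core; each passage covers
    sentences [start, start + passage_size)."""
    starts, start = [], 0
    while start + stride < n_sents:
        starts.append(start)
        start += stride
    return starts

print(window_starts(n_sents=10, stride=2))  # [0, 2, 4, 6]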
Example No. 21
def compute_rouge():
    tg2recall_all = {}
    tg2f1_all = {}
    for idx in range(N_REFS):
        build_eval_dirs(summary_index=idx + 1)
        tg2recall, tg2f1 = compute_rouge_for_human(system_dp=SYSTEM_DP_TEMP,
                                                   model_dp=MODEL_DP_TEMP)
        logger.info('tg2f1: {}'.format(tg2f1))
        for metric in ROUGE_METRICS:
            if metric in tg2recall_all:
                tg2recall_all[metric] += tg2recall[metric]
            else:
                tg2recall_all[metric] = tg2recall[metric]

            if metric in tg2f1_all:
                tg2f1_all[metric] += tg2f1[metric]
            else:
                tg2f1_all[metric] = tg2f1[metric]

    for metric in ROUGE_METRICS:
        tg2recall_all[metric] /= N_REFS
        tg2f1_all[metric] /= N_REFS

    recall_str = 'Recall:\t{}'.format('\t'.join(
        ['{0:.2f}'.format(tg2recall_all[metric]) for metric in ROUGE_METRICS]))
    f1_str = 'F1:\t{}'.format('\t'.join(
        ['{0:.2f}'.format(tg2f1_all[metric]) for metric in ROUGE_METRICS]))

    output = '\n' + '\n'.join((f1_str, recall_str))
    logger.info(output)
Example No. 22
def build_tdqfs_oracle_test_cid_query_dicts(query_fp):
    def _get_ref(cid):
        REF_DP = path_parser.data_tdqfs_summary_targets
        fp = join(REF_DP, '{}_{}'.format(cid, 0))
        ref = ''
        with io.open(fp, encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            ref += line.rstrip('\n')

        return ref

    assert config_meta['test_year'] == 'tdqfs'
    lines = io.open(query_fp, encoding='utf-8').readlines()
    cids = [line.rstrip('\n').split('\t')[0] for line in lines]
    test_cid_query_dicts = []
    for cid in cids:
        ref = _get_ref(cid)
        logger.info('cid {}: {}'.format(cid, ref))

        test_cid_query_dicts.append({
            'cid': cid,
            'query': ref,
        })
    return test_cid_query_dicts
Example No. 23
def test_major_sent(synthese):
    # logger.info('START: model testing...')
    dataset_type = 'synthese' if synthese else 'lead'
    data_loader = pipe.DomDetDataLoader(dataset_type=dataset_type)

    n_iter, total_loss = 0, 0.0
    n_samples, total_hamming = 0, 0.0

    cf_mats, precision_list, recall_list = list(), list(), list()

    for batch_idx, batch in enumerate(data_loader):
        n_iter += 1

        y_true = batch['labels'].cpu().numpy()
        d_batch = len(y_true)
        des_sent_info = batch['des_sent_info'].cpu().numpy()
        n_samples += np.sum(des_sent_info[:, -1])

        # logger.info('batch_size: {0}'.format(y_true.shape[0]))

        if synthese:
            hyp_scores = np.tile(y_pred_vec, (d_batch, 1))
            fids = batch['fids'].cpu().numpy()
            eval_args = {
                'hyp_scores': hyp_scores,
                'fids': fids,
                'is_hiernet': True
            }
            eval_res = metrics.metric_eval_for_syn_doc(**eval_args)
        else:
            hyp_scores = np.tile(y_pred_vec, (d_batch, max_n_sents, 1))
            eval_args = {
                'y_true': y_true,
                'hyp_scores': hyp_scores,
                'des_sent_info': des_sent_info,
            }
            eval_res = metrics.metric_eval(**eval_args)

        cf_mats.append(eval_res['cf_mat_list'])
        precision_list.extend(eval_res['precision_list'])
        recall_list.extend(eval_res['recall_list'])
        total_hamming += eval_res['hamming']

    cls_f1, avg_f1 = metrics.compute_f1_with_confusion_mats(cf_mats)
    example_based_f1 = metrics.compute_example_based_f1(
        precision_list=precision_list, recall_list=recall_list)
    hamming = total_hamming / n_samples

    eval_log_info = {
        'example_based_f1': example_based_f1,
        'avg_f1': avg_f1,
        'cls_f1': cls_f1,
        'hamming': hamming,
    }

    res_str = 'example_based_f1: {example_based_f1:.6f}, ' \
              'avg_f1: {avg_f1:.6f}, cls_f1: {cls_f1}, hamming: {hamming:.6f}'

    logger.info(res_str.format(**eval_log_info))
Example No. 24
def dump_retrieval(fp, retrieved_items):
    retrieve_records = ['\t'.join(items) for items in retrieved_items]
    n_sents = rank_sent.dump_rank_records(rank_records=retrieve_records,
                                          out_fp=fp,
                                          with_rank_idx=False)

    logger.info('successfully dumped {0} retrieved items to {1}'.format(
        n_sents, fp))
Example No. 25
def validate(data_type, window):
    dump_fp = join(path_parser.squad_proc, 'window_{}'.format(window),
                   '{}.tsv'.format(data_type))
    with io.open(dump_fp, encoding='utf-8') as dump_f:
        line = dump_f.readline()  # spot-check the first line only
        if len(line.split('\t')) != 5:
            raise ValueError('Invalid line: {}'.format(line))
    logger.info('Validated!')
Example No. 26
    def sample_human_eval_fns(self, max_dom_doc=2):
        selected_fns = []
        dict_dom_n_files = dict()
        for dom in self.doms_final:
            dict_dom_n_files[dom] = 0

        root_path = path_parser.dataset_test
        target_path = path_parser.dataset_test_human_eval

        fns = [fn for fn in listdir(root_path) if
               not fn.startswith('.') and len(fn.split('_')) >= 2 and isfile(join(root_path, fn))]

        logger.info('#files: {}'.format(len(fns)))

        existing_fns = [fn for fn in listdir(target_path) if
                        not fn.startswith('.') and len(fn.split('_')) >= 2 and isfile(join(target_path, fn))]

        for fn in existing_fns:
            labels = fn.split('_')[1:]
            # logger.info('labels: {}'.format(labels))
            for label in labels:
                dict_dom_n_files[label] += 1

        for k, v in dict_dom_n_files.items():
            logger.info('Existing: {0} - {1}'.format(k, v))

        if min(dict_dom_n_files.values()) == max_dom_doc:
            logger.error('Already full!')
            raise FileExistsError

        random.shuffle(fns)

        for fn in fns:
            labels = fn.split('_')[1:]
            # logger.info('labels: {0}'.format(labels))

            ahead_overflow = False
            for label in labels:
                if dict_dom_n_files[label] == max_dom_doc:
                    ahead_overflow = True
                    break

            if not ahead_overflow and self.check_existence_of_paras(fp=join(root_path, fn)):
                selected_fns.append(fn)
                for label in labels:
                    dict_dom_n_files[label] += 1

            if min(dict_dom_n_files.values()) == max_dom_doc:
                break

        for fn in selected_fns:
            old_path = join(root_path, fn)
            new_path = join(target_path, fn)
            sp.call(['cp', old_path, new_path])
Example No. 27
    def __init__(self, model_out_fp_1, model_out_fp_2, gold_fp, iter=10000):
        self.obs_1 = get_obs(fp=model_out_fp_1)
        self.obs_2 = get_obs(fp=model_out_fp_2)
        self.gold = get_obs(fp=gold_fp)
        assert self.obs_1.shape == self.obs_2.shape == self.gold.shape
        self.n_obs = self.obs_1.shape[0]
        logger.info('n_obs: {0}'.format(self.n_obs))

        self.iter = iter
Example No. 28
def _prune_rank_items(rank_items, threshold=1e-10):
    if float(rank_items[-1][1]) > threshold:
        logger.info('Prune ratio: 0.00')
        return rank_items

    for i in range(len(rank_items)):
        if float(rank_items[i][1]) <= threshold:
            logger.info('Prune ratio: {0:.2f}'.format(
                float(i) / len(rank_items)))
            return rank_items[:i]
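A toy run, assuming rank items are (sentence, score, ...) tuples sorted by descending score, as the indexing above suggests:

rank_items = [('s1', '0.42'), ('s2', '0.10'), ('s3', '0.0'), ('s4', '0.0')]
pruned = _prune_rank_items(rank_items)
print(pruned)  # [('s1', '0.42'), ('s2', '0.10')] -- logs 'Prune ratio: 0.50'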
Example No. 29
def get_bert_in_func():
    if config.meta_model_name == 'bert_qa':
        from frame.bert_qa import bert_input
        bert_in_func = bert_input.build_bert_x
    else:
        from data import bert_input_sep
        bert_in_func = bert_input_sep.build_bert_x_sep

    logger.info('Using bert_in_func for meta model: {}'.format(config.meta_model_name))
    return bert_in_func
Example No. 30
def _dump_passages(year, cid, passage_objs):
    cc_dp = join(path_parser.data_passages, year, cid)
    if not exists(cc_dp):  # create the output dir if it does not exist
        os.mkdir(cc_dp)

    for po in passage_objs:
        with open(join(cc_dp, po.pid), 'wb') as f:
            dill.dump(po, f)

    logger.info('[_dump_passages] Dump {} passage objects to {}'.format(len(passage_objs), cc_dp))
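Passage objects dumped this way can presumably be read back with dill; a minimal counterpart sketch (the helper name _load_passages is hypothetical; path_parser, join, listdir, and dill are the same module-level names used above):

def _load_passages(year, cid):
    """Counterpart of _dump_passages: deserialize every passage object for a cid."""
    cc_dp = join(path_parser.data_passages, year, cid)
    passage_objs = []
    for pid in listdir(cc_dp):
        with open(join(cc_dp, pid), 'rb') as f:
            passage_objs.append(dill.load(f))
    return passage_objs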