Exemplo n.º 1
0
def build_components_e2e():
    """Build graph components for every test cluster and dump them to disk.

    The dump locations are resolved through ``graph_io`` in write mode for
    ``MODEL_NAME``. For each entry returned by
    ``general_tools.build_test_cid_query_dicts`` the components produced by
    ``_build_components`` (``sim_mat``, ``rel_vec``, ``sid2abs``) are dumped
    under that entry's ``cid``.
    """
    comp_root = graph_io.get_summ_comp_root(model_name=MODEL_NAME,
                                            n_iter=None,
                                            mode='w')
    sim_mat_dp = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dp = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dp = graph_io.get_sid2abs_dp(comp_root, mode='w')

    # Report the resolved dump locations once, before any work is done.
    logger.info('sim_mat_dp: {}'.format(sim_mat_dp))
    logger.info('rel_vec_dp: {}'.format(rel_vec_dp))
    logger.info('sid2abs_dp: {}'.format(sid2abs_dp))

    cid_query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=CONCAT_TITLE_NARR,
        query_type=QUERY_TYPE,
    )

    for query_params in tqdm(cid_query_dicts):
        cid = query_params['cid']
        logger.info('cid: {}'.format(cid))

        built = _build_components(**query_params)
        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dp, cid=cid)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dp, cid=cid)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dp, cid=cid)
def build_components_e2e():
    """Build graph components for every test cluster and dump them to disk.

    The dump locations are resolved through ``graph_io`` in write mode for
    ``centrality_ensemble_config.CENTRALITY_MODEL_NAME_BASIC``. For each
    entry of ``test_cid_query_dicts`` the components produced by
    ``_build_components`` (``sim_mat``, ``rel_vec``, ``sid2abs``) are dumped
    under that entry's ``cid``.

    NOTE(review): ``test_cid_query_dicts`` is not defined inside this
    function; presumably it is a module-level name -- confirm.
    """
    comp_root = graph_io.get_summ_comp_root(
        model_name=centrality_ensemble_config.CENTRALITY_MODEL_NAME_BASIC,
        n_iter=None,
        mode='w',
    )
    sim_mat_dp = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dp = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dp = graph_io.get_sid2abs_dp(comp_root, mode='w')

    # Report the resolved dump locations once, before any work is done.
    logger.info('sim_mat_dp: {}'.format(sim_mat_dp))
    logger.info('rel_vec_dp: {}'.format(rel_vec_dp))
    logger.info('sid2abs_dp: {}'.format(sid2abs_dp))

    for query_params in tqdm(test_cid_query_dicts):
        cid = query_params['cid']
        logger.info('cid: {}'.format(cid))

        built = _build_components(**query_params)
        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dp, cid=cid)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dp, cid=cid)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dp, cid=cid)
Exemplo n.º 3
0
def score_end2end(model_name, n_iter=None, damp=0.85, use_rel_vec=True, cc_ids=None):
    """Score the sentences of each cluster and dump the resulting sid2score.

    :param model_name: model whose summary components are loaded.
    :param n_iter: forwarded to ``graph_io.get_summ_comp_root``.
    :param damp: forwarded to ``_score_graph_initially``.
    :param use_rel_vec: when truthy, the transposed ``rel_vec`` component is
        passed to the scorer; otherwise ``None`` is passed.
    :param cc_ids: cluster ids to score; when falsy,
        ``tools.get_test_cc_ids()`` is used.
    :return: None. Scores are written via ``graph_io.dump_sid2score``.
    """
    read_mode = 'r'

    # one model has only one suit of summary components but different ranking sys
    comp_root = graph_io.get_summ_comp_root(model_name=model_name,
                                            n_iter=n_iter,
                                            mode=read_mode)

    # Components are read; only the score directory is opened for writing.
    component_dps = {
        'sim_mat_dp': graph_io.get_sim_mat_dp(comp_root, mode=read_mode),
        'rel_vec_dp': graph_io.get_rel_vec_dp(comp_root, mode=read_mode),
        'sid2abs_dp': graph_io.get_sid2abs_dp(comp_root, mode=read_mode),
    }
    sid2score_dp = graph_io.get_sid2score_dp(comp_root, mode='w')

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        loaded = graph_io.load_components(**component_dps, cid=cid)

        # Invert sid -> abs so the scorer can look sentence ids up by abs.
        abs2sid = {abs_key: sid for sid, abs_key in loaded['sid2abs'].items()}

        sim_mat = loaded['sim_mat']
        if use_rel_vec:
            rel_vec = loaded['rel_vec'].transpose()
        else:
            rel_vec = None

        sid2score = _score_graph_initially(sim_mat=sim_mat,
                                           rel_vec=rel_vec,
                                           cid=cid,
                                           damp=damp,
                                           abs2sid=abs2sid)
        graph_io.dump_sid2score(sid2score=sid2score, sid2score_dp=sid2score_dp, cid=cid)

    logger.info('[GRAPH RANK] Finished. Scores were dumped to: {}'.format(sid2score_dp))
Exemplo n.º 4
0
def build_components_e2e():
    """Build graph components for every test cluster and dump them to disk.

    The dump locations are resolved through ``graph_io`` in write mode for
    ``MODEL_NAME``. For each entry returned by
    ``general_tools.build_test_cid_query_dicts`` the components produced by
    ``_build_components`` (``sim_mat``, ``rel_vec``, ``sid2abs``) are dumped
    under that entry's ``cid``, and the three dump locations are logged
    after every cluster.
    """
    comp_root = graph_io.get_summ_comp_root(model_name=MODEL_NAME,
                                            n_iter=None,
                                            mode='w')
    sim_mat_dp = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dp = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dp = graph_io.get_sid2abs_dp(comp_root, mode='w')

    # Report the resolved dump locations once, before any work is done.
    logger.info('sim_mat_dp: {}'.format(sim_mat_dp))
    logger.info('rel_vec_dp: {}'.format(rel_vec_dp))
    logger.info('sid2abs_dp: {}'.format(sid2abs_dp))

    cid_query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=False,
        query_type=centrality_config.QUERY_TYPE,
    )

    for query_params in tqdm(cid_query_dicts):
        built = _build_components(**query_params)
        cid = query_params['cid']

        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dp, cid=cid)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dp, cid=cid)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dp, cid=cid)

        # Logged once per cluster, after all three dumps have been issued.
        logger.info('[BUILD GRAPH COMPONENT] dumping sim mat file to: {0}'.format(sim_mat_dp))
        logger.info('[BUILD GRAPH COMPONENT] dumping rel vec file to: {0}'.format(rel_vec_dp))
        logger.info('[BUILD GRAPH COMPONENT] dumping sid2abs file to: {0}'.format(sid2abs_dp))
Exemplo n.º 5
0
def build_components_e2e():
    """Build graph components for every test cluster and dump them to disk.

    The dump locations are resolved through ``graph_io`` in write mode. For
    each entry of ``test_cid_query_dicts`` the components produced by
    ``_build_components`` (``sim_mat``, ``rel_vec``, ``sid2abs``) are dumped
    under that entry's ``cid``.

    NOTE(review): ``model_name`` and ``test_cid_query_dicts`` are not defined
    inside this function; presumably they are module-level names -- confirm.
    """
    comp_root = graph_io.get_summ_comp_root(model_name=model_name,
                                            n_iter=None,
                                            mode='w')
    sim_mat_dp = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dp = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dp = graph_io.get_sid2abs_dp(comp_root, mode='w')

    for query_params in tqdm(test_cid_query_dicts):
        built = _build_components(**query_params)
        cid = query_params['cid']

        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dp, cid=cid)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dp, cid=cid)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dp, cid=cid)
Exemplo n.º 6
0
def rank_end2end(model_name,
                 diversity_param_tuple,
                 component_name=None,
                 n_iter=None,
                 rank_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True,
                 cc_ids=None):
    """
    Rank the sentences of each cluster with a diversity penalty and dump
    one rank-record file per cluster into ``rank_dp``.

    :param model_name: model name used to derive ``rank_dp`` and, when
        ``component_name`` is not given, to locate the summary components.
    :param diversity_param_tuple: ``(diversity_weight, diversity_algorithm)``;
        only the ``'wan'`` algorithm is supported, with the weight passed
        to it as ``omega``.
    :param component_name: if given, used instead of ``model_name`` to
        locate the summary components.
    :param n_iter: forwarded to ``graph_io.get_summ_comp_root`` and
        ``tools.get_rank_dp``.
    :param rank_dp: output directory; derived via ``tools.get_rank_dp`` when
        falsy. It must not exist yet and is created by this function.
    :param retrieved_dp: if given, sentences are loaded from it via
        ``load_retrieved_sentences``; otherwise via ``dataset_parser``.
    :param rm_dialog: only useful when retrieved_dp=None
    :param cc_ids: cluster ids to rank; when falsy,
        ``tools.get_test_cc_ids()`` is used.
    :return: None. Rankings are written via ``rank_sent.dump_rank_records``.
    :raises ValueError: if ``diversity_algorithm`` is not ``'wan'`` or if
        ``rank_dp`` already exists.
    """
    diversity_weight, diversity_algorithm = diversity_param_tuple

    # Fail fast: validating only inside the per-cluster loop would leave an
    # empty rank_dp behind, and the next run would then abort with
    # 'rank_dp exists'.
    if diversity_algorithm != 'wan':
        raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

    dp_mode = 'r'
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
    }

    # todo: double check this condition; added later for avoiding bug for centrality-tfidf.
    # one model has only one suit of summary components but different ranking sys
    dp_params['model_name'] = component_name if component_name else model_name

    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    sim_mat_dp = graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode)
    rel_vec_dp = graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode)
    sid2abs_dp = graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode)
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    if not rank_dp:
        rank_dp = tools.get_rank_dp(model_name=model_name,
                                    n_iter=n_iter,
                                    diversity_param_tuple=diversity_param_tuple)

    # Refuse to write into an existing location rather than mixing rankings.
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    dps = {
        'sim_mat_dp': sim_mat_dp,
        'rel_vec_dp': rel_vec_dp,
        'sid2abs_dp': sid2abs_dp,
    }

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        components = graph_io.load_components(**dps, cid=cid)
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        elif 'tdqfs' in config.test_year:
            original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
        else:
            original_sents, _ = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents

        # The algorithm was validated up front, so 'wan' is the only case here.
        rank_records = _rank_with_diversity_penalty_wan(
            sid2score=sid2score,
            sid2abs=components['sid2abs'],
            sim_mat=components['sim_mat'],
            original_sents=original_sents,
            omega=diversity_weight,
        )

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))