Exemplo n.º 1
0
def build_sim_items_e2e(cid,
                        query,
                        mask_intra,
                        max_ns_doc=None,
                        retrieved_dp=None,
                        sentence_rep='tfidf',
                        rm_dialog=True):
    """
        Build the TF-IDF similarity items (sentence-sentence matrix and
        query relevance scores) for one cluster.

    :param cid: id of the cluster whose sentences are loaded.
    :param query: query forwarded to the TF-IDF similarity computation.
    :param mask_intra: forwarded unchanged to _compute_sim_mat_tfidf.
    :param max_ns_doc: forwarded to dataset_parser.cid2sents; only used when retrieved_dp is not given.
    :param retrieved_dp: if truthy, sentences are loaded with load_retrieved_sentences instead of the dataset parser.
    :param sentence_rep: sentence representation; only 'tfidf' is supported.
    :param rm_dialog: forwarded to dataset_parser.cid2sents; only used when retrieved_dp is not given.
    :return: dict with keys 'doc_sim_mat', 'rel_scores', 'processed_sents' and 'original_sents'.
    :raises ValueError: if sentence_rep is not 'tfidf'.
    """
    # Validate before loading anything: an assert would be stripped under
    # `python -O`, and checking first avoids loading sentences for a call
    # that cannot succeed.
    if sentence_rep != 'tfidf':
        raise ValueError('Invalid sentence_rep: {}'.format(sentence_rep))

    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        original_sents, processed_sents = dataset_parser.cid2sents(
            cid, rm_dialog=rm_dialog,
            max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    res = _compute_sim_mat_tfidf(processed_sents=processed_sents,
                                 query=query,
                                 mask_intra=mask_intra)

    sim_items = {
        'doc_sim_mat': res['doc_sim_mat'],
        'rel_scores': res['rel_scores'],
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }

    return sim_items
Exemplo n.º 2
0
    def __init__(self, cid, query, retrieve_dp, transform=None):
        """
            Load the retrieved sentences of a cluster and keep the query
            and the optional transform alongside them.

        :param cid: cluster id handed to load_retrieved_sentences.
        :param query: stored as-is on the instance.
        :param retrieve_dp: directory handed to load_retrieved_sentences as retrieved_dp.
        :param transform: stored as-is on the instance.
        """
        super(ClusterDataset, self).__init__()

        # Only the original (unprocessed) sentences are needed here; the
        # first element of that result is what the dataset keeps.
        retrieved_original, _processed = load_retrieved_sentences(
            retrieved_dp=retrieve_dp,
            cid=cid,
        )
        self.sentences = retrieved_original[0]

        self.query = query
        self.yy = 0.0  # constant value stored for every instance
        self.transform = transform
Exemplo n.º 3
0
def build_sim_items_e2e_tfidf_with_lexrank(cid,
                                           query,
                                           max_ns_doc=None,
                                           retrieved_dp=None,
                                           rm_dialog=True):
    """
        Build similarity items with LexRank's TF-IDF similarity matrix.

        LexRank is initialized with the document-wise organized sentences
        to get true IDF; the query is then appended as the last sentence so
        that its row of the matrix gives the relevance scores.

    :param cid: id of the cluster whose sentences are loaded.
    :param query: appended to the flattened sentence list as its last item.
    :param max_ns_doc: forwarded to dataset_parser.cid2sents (non-tdqfs, non-retrieved case only).
    :param retrieved_dp: if truthy, sentences come from load_retrieved_sentences.
    :param rm_dialog: forwarded to dataset_parser.cid2sents (non-tdqfs, non-retrieved case only).
    :return: dict with keys 'doc_sim_mat', 'rel_scores', 'processed_sents' and 'original_sents'.
    """
    # Pick the sentence source: retrieved dump, tdqfs parser or default parser.
    if retrieved_dp:
        sents_pair = load_retrieved_sentences(retrieved_dp=retrieved_dp,
                                              cid=cid)
    elif 'tdqfs' in config.test_year:
        sents_pair = dataset_parser.cid2sents_tdqfs(cid)
    else:
        # 2d lists, docs => sents
        sents_pair = dataset_parser.cid2sents(cid,
                                              rm_dialog=rm_dialog,
                                              max_ns_doc=max_ns_doc)
    original_sents, processed_sents = sents_pair

    lexrank_model = LexRank(processed_sents, stopwords=STOPWORDS['en'])

    # Flatten to a 1d sentence list; work on a deep copy so appending the
    # query cannot affect the sentences held in processed_sents.
    flat_sents = copy.deepcopy(
        list(itertools.chain.from_iterable(processed_sents)))
    flat_sents.append(query)

    full_sim = lexrank_model.get_tfidf_similarity_matrix(sentences=flat_sents)

    # The last row/column belongs to the query: strip it from the
    # sentence-sentence matrix and use its row as the relevance scores.
    return {
        'doc_sim_mat': full_sim[:-1, :-1],
        'rel_scores': full_sim[-1, :-1],
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
Exemplo n.º 4
0
def build_rel_scores_tf(cid, query, max_ns_doc=None, retrieved_dp=None):
    """
        Compute TF relevance scores of a cluster's sentences against a query.

    :param cid: id of the cluster whose sentences are loaded.
    :param query: query forwarded to _compute_rel_scores_tf.
    :param max_ns_doc: forwarded to dataset_parser.cid2sents; only used when retrieved_dp is not given.
    :param retrieved_dp: if truthy, sentences come from load_retrieved_sentences.
    :return: dict with keys 'rel_scores', 'processed_sents' and 'original_sents'.
    """
    if not retrieved_dp:
        # 2d lists, docs => sents
        sents_pair = dataset_parser.cid2sents(cid, max_ns_doc=max_ns_doc)
    else:
        sents_pair = load_retrieved_sentences(retrieved_dp=retrieved_dp,
                                              cid=cid)
    original_sents, processed_sents = sents_pair

    return {
        'rel_scores': _compute_rel_scores_tf(processed_sents, query),
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
Exemplo n.º 5
0
def rel_scores2rank():
    """
        Turn the stored relevance scores of every cluster into ranking
        records and dump them, one file per cluster, under rank_dp.

    :raises ValueError: if rank_dp already exists.
    """
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        scores = load_rel_scores(cid=cid, rel_scores_dp=rel_scores_dp)

        # Sentence indices ordered from the highest score to the lowest;
        # every sentence id is written with the fixed '0_' prefix.
        descending_ids = np.argsort(scores)[::-1].tolist()
        sid_score_list = [('0_{}'.format(idx), scores[idx])
                          for idx in descending_ids]

        retrieved_original, _ = load_retrieved_sentences(
            retrieved_dp=ir_rec_dp, cid=cid)
        records = rank_sent.get_rank_records(sid_score_list,
                                             sents=retrieved_original)

        n_dumped = rank_sent.dump_rank_records(rank_records=records,
                                               out_fp=join(rank_dp, cid),
                                               with_rank_idx=False)
        logger.info('Dump {} ranking records'.format(n_dumped))
Exemplo n.º 6
0
def rank_end2end(model_name,
                 diversity_param_tuple,
                 component_name=None,
                 n_iter=None,
                 rank_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True,
                 cc_ids=None):
    """
        Rank the sentences of each cluster with a diversity penalty and dump
        the ranking records, one file per cluster, under rank_dp.

    :param model_name: model name; used for the summary components (unless
        component_name is given) and for the default rank_dp.
    :param diversity_param_tuple: (diversity_weight, diversity_algorithm);
        only the 'wan' algorithm is supported.
    :param component_name: if given, summary components are looked up under
        this name instead of model_name.
    :param n_iter: forwarded to graph_io.get_summ_comp_root and tools.get_rank_dp.
    :param rank_dp: output directory; built with tools.get_rank_dp when falsy.
        It must not exist yet.
    :param retrieved_dp: if truthy, original sentences come from load_retrieved_sentences.
    :param rm_dialog: only useful when retrieved_dp=None
    :param cc_ids: cluster ids to rank; tools.get_test_cc_ids() is used when falsy.
    :return: None
    :raises ValueError: if diversity_algorithm is not 'wan', or if rank_dp
        already exists.
    """
    diversity_weight, diversity_algorithm = diversity_param_tuple

    # Validate before any side effect. Checking only inside the cluster loop
    # would leave an empty rank_dp behind, and that directory would make the
    # next run fail with 'rank_dp exists'.
    if diversity_algorithm != 'wan':
        raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

    dp_mode = 'r'
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
    }

    # todo: double check this condition; added later for avoiding bug for centrality-tfidf.
    # # one model has only one suit of summary components but different ranking sys
    if component_name:
        dp_params['model_name'] = component_name
    else:
        dp_params['model_name'] = model_name

    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    sim_mat_dp = graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode)
    rel_vec_dp = graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode)
    sid2abs_dp = graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode)
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    if not rank_dp:
        rank_dp_params = {
            'model_name': model_name,
            'n_iter': n_iter,
            'diversity_param_tuple': diversity_param_tuple,
        }

        rank_dp = tools.get_rank_dp(**rank_dp_params)

    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    # sid2score_dp is deliberately not part of dps: it is passed to
    # graph_io.load_sid2score, not to graph_io.load_components.
    dps = {
        'sim_mat_dp': sim_mat_dp,
        'rel_vec_dp': rel_vec_dp,
        'sid2abs_dp': sid2abs_dp,
    }

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        comp_params = {
            **dps,
            'cid': cid,
        }
        components = graph_io.load_components(**comp_params)
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        else:
            if 'tdqfs' in config.test_year:
                original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
            else:
                original_sents, _ = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents

        # 'wan' is the only algorithm that passes the validation above.
        diversity_params = {
            'sid2score': sid2score,
            'sid2abs': components['sid2abs'],
            'sim_mat': components['sim_mat'],
            'original_sents': original_sents,
            'omega': diversity_weight,
        }
        rank_records = _rank_with_diversity_penalty_wan(**diversity_params)

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))
Exemplo n.º 7
0
    def __init__(self,
                 cid,
                 rank_fp,
                 text_dp,
                 cos_threshold,
                 max_n_summary_words,
                 rel_sents_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True):
        """
            Set up sentence selection for one cluster.

            Before generating summaries,
            rank sentences in a cluster first,
            and save the rankings (see model_exec.py).

        :param cid: cluster id; also used as the file name under text_dp
            (and under rel_sents_dp when that is given).
        :param rank_fp: path of the ranking file; must already exist.
        :param text_dp: directory where the summary of this cluster is dumped.
        :param cos_threshold: stored on the instance as-is.
        :param max_n_summary_words: stored on the instance when truthy.
        :param rel_sents_dp: if given, sentences are parsed from the file
            rel_sents_dp/cid. Mutually exclusive with retrieved_dp.
        :param retrieved_dp: if given, sentences are loaded with
            load_retrieved_sentences. Mutually exclusive with rel_sents_dp.
        :param rm_dialog: only useful when retrieved_dp=None
        :raises ValueError: if rank_fp does not exist, or if both
            rel_sents_dp and retrieved_dp are specified.
        """
        self.cid = cid
        self.cos_threshold = cos_threshold

        self.word_tokenize = nltk.tokenize.word_tokenize

        # fps for rank and text
        self.rank_fp = rank_fp
        if not exists(self.rank_fp):
            raise ValueError('rank_fp does not exist: {}'.format(self.rank_fp))

        self.text_fp = join(text_dp, cid)  # for dumping summaries

        # 2|3-d list organized by: docs => paragraphs => sents
        # The two sentence sources below are mutually exclusive.
        if rel_sents_dp and retrieved_dp:
            raise ValueError(
                'Specify only one of rel_sents_dp and retrieved_dp!')

        if rel_sents_dp:
            # Sentences come from a per-cluster relevant-sentences file.
            self.use_filter_sents = True
            rel_sents_fp = join(rel_sents_dp, cid)
            self.original_sents, self.processed_sents = dataset_parser.parse_rel_sents_file(
                rel_sents_fp)  # 1d sentence lists

        elif retrieved_dp:
            # Sentences come from a retrieval dump.
            self.use_filter_sents = False
            self.original_sents, self.processed_sents = load_retrieved_sentences(
                retrieved_dp=retrieved_dp, cid=cid)

        else:
            # Fall back to the dataset parser; which one depends on config.test_year.
            self.use_filter_sents = False

            if 'tdqfs' in config.test_year:
                self.original_sents, self.processed_sents = dataset_parser.cid2sents_tdqfs(
                    cid)
            else:
                self.original_sents, self.processed_sents = dataset_parser.cid2sents(
                    cid, rm_dialog=rm_dialog)

        # NOTE(review): the attribute is only assigned when the argument is
        # truthy, yet the log call below reads it unconditionally. Presumably
        # a class-level default exists outside this view; otherwise a falsy
        # argument raises AttributeError here. Confirm.
        if max_n_summary_words:
            self.max_n_summary_words = max_n_summary_words

        logger.info('[Selector.__init__] max_nw for {}: {}'.format(
            cid, self.max_n_summary_words))

        self.summary_sent_words = []  # 2-d list organized by: sents => words