def standard_workflow(self,
                          train_data_source,
                          val_data_source=None,
                          test_data_source=None,
                          epochs=1,
                          logging_period=10,
                          eval_train=False,
                          after_epoch_func=None):
        """
        Runs a workflow of steps such as training and evaluation. It executes
        a very general workflow, where eval flags can be assigned in order to
        perform evaluation on train, val, and test data-sources.
        
        :param train_data_source: self-explanatory.
        :param val_data_source: self-explanatory.
        :param test_data_source: self-explanatory.
        :param epochs: self-explanatory.
        :param logging_period: how often to log the loss of the model
        :param eval_train: whether to eval performance on the training
                           data-source.
        :param after_epoch_func: a function that takes as input 'epoch', and is
                                 executed after completion of each epoch, except
                                 the last one. E.g. model saving.
        """
        if val_data_source:
            metrs = self.eval_on_intern_metrs(data_source=val_data_source)
            logger.info(metrics_to_str(metrs, "Validation"))

        epoch = 0
        for epoch in range(1, epochs + 1):
            logger.info('Epoch %d/%d' % (epoch, epochs))
            self.train(data_source=train_data_source,
                       epoch=epoch,
                       logging_steps=logging_period)

            if eval_train:
                metrs = self.eval_on_intern_metrs(
                    data_source=train_data_source, epoch=epoch)
                if metrs:
                    logger.info(metrics_to_str(metrs, "Training"))

            if val_data_source:
                metrs = self.eval_on_intern_metrs(data_source=val_data_source)
                logger.info(metrics_to_str(metrs, "Validation"))

            if epoch != epochs and after_epoch_func:
                after_epoch_func(epoch)

        if test_data_source:
            metrs = self.eval_on_intern_metrs(data_source=test_data_source,
                                              epoch=epoch)
            logger.info(metrics_to_str(metrs, "Testing"))
def evaluate_summs(gen_summ_file_path, gold_summ_file_path, gold_summ_fnames):
    """
    Evaluates generated versus true summaries using Google ROUGE script.

    :param gen_summ_file_path: txt file path to generated summaries. Each
        summary should be on a separate line.
    :param gold_summ_file_path: csv file path to gold summaries.
    :param gold_summ_fnames: list of CSV columns to gold summaries.
    """

    gen_summs = read_text_file(gen_summ_file_path)
    rouge_scorer = GoogleRouge()

    gold_ds = pd.read_csv(gold_summ_file_path,
                          sep="\t",
                          quotechar="\'",
                          encoding='utf-8')
    assert len(gen_summs) == len(gold_ds)

    gold_summs = [[du[gsumm_fname] for gsumm_fname in gold_summ_fnames]
                  for _, du in gold_ds.iterrows()]

    for _gen_summ, _gold_summs in zip(gen_summs, gold_summs):
        rouge_scorer.accum([_gen_summ], [_gold_summs])
    res = rouge_scorer.aggr()

    for k, v in res.items():
        print("%s %s" % (k, metrics_to_str(v)))
    def train(self, data_source, logging_steps=10, **kwargs):
        """
        Performs a single epoch training on the passed `data_source`.

        :param data_source: self-explanatory.
        :param logging_steps: how often to log training produced batch metrics.
        """
        empty_cache()
        logger.info("Training data source: %s" % data_source)
        total_batches = 0
        total_revs = 0
        start = time()

        data_chunk_iter = self.train_data_pipeline.iter(**data_source)

        for indx, batch in enumerate(data_chunk_iter, 1):
            c_lambd = self.c_kl_ann(increment_indx=True)
            z_lambd = self.z_kl_ann(increment_indx=True)
            metrics = self.imodel.train(batch,
                                        c_lambd=c_lambd,
                                        z_lambd=z_lambd)
            total_revs += len(batch[ModelF.REV])
            if indx % logging_steps == 0:
                mess = metrics_to_str(metrics, prefix="Chunk # %d" % indx)
                logger.info(mess)
            total_batches += 1

        logger.info("Epoch training time elapsed: %.2f (s)." %
                    (time() - start))
        logger.info("Total reviews: %d." % total_revs)
    def train(self, data_source, logging_steps=10, **kwargs):
        """
        Performs a single epoch training on the passed data_source.

        :param data_source: self-explanatory.
        :param logging_steps: self-explanatory.
        """
        logger.info("Training data source: %s" % data_source)
        start = time()
        for i, batch in enumerate(self.train_data_pipeline.iter(**data_source),
                                  1):
            metrics = self.imodel.train(batch=batch, **kwargs)
            if i % logging_steps == 0:
                mess = metrics_to_str(metrics, prefix="Chunk # %d" % i)
                logger.info(mess)
        logger.info("Epoch training time elapsed: %.2f (s)" % (time() - start))
    def eval(self, data_source, output_file_path=None):
        """
        Assumes that batches contain SUMMS that are lists of sublists,
        where is sublist contain a fixed number of summary strings. I.e.
        summaries should not be tokenized.

        :param data_source:
        :param output_file_path:
        """
        output_dc = DataChunk(
            **{
                OutputF.GOLD_SUMMS: [],
                OutputF.GEN_SUMM: [],
                OutputF.GROUP_ID: [],
                OutputF.CAT: [],
                OutputF.ROUGE: [],
                OutputF.REV: []
            })
        rouge_evaluator = Rouge()
        skipped_summs = 0

        for batch in self.data_pipeline.iter(**data_source):
            # notice that each product has K true summaries created by
            # annotators
            true_summs = batch[ModelF.SUMMS]
            prod_ids = batch[ModelF.SUMM_GROUP_ID]
            cats = batch[ModelF.SUMM_CAT]

            # getting group reviews that were used as input to produce summaries
            inp_revs = self.revs_formatter_func(batch[ModelF.REV])
            group_rev_indxs = batch[ModelF.GROUP_REV_INDXS]
            group_rev_indxs_mask = batch[ModelF.GROUP_REV_INDXS_MASK]
            group_revs = get_group_reviews(inp_revs, group_rev_indxs,
                                           group_rev_indxs_mask)

            gen_summs = self.summs_gen_func(batch)

            assert (len(true_summs) == len(gen_summs))

            # accumulating ROUGE statistics
            res = []
            for gen_summ, _true_summs in zip(gen_summs, true_summs):

                if len(gen_summ) == 0:
                    skipped_summs += 1
                    res.append(None)
                    continue

                # extra [] wrapping is needed as the accum method is batch based
                r_avg, _, r_max, _ = rouge_evaluator.accum(
                    hypotheses=[gen_summ], references=[_true_summs])
                if self.avg_rouge:
                    curr_rouge = r_avg
                else:
                    curr_rouge = r_max
                res.append(curr_rouge)

            # splitting by the sentence for better visualization
            if self.sent_splitter:
                group_revs = self.split_group_seqs_by_sents(group_revs)
                true_summs = self.split_group_seqs_by_sents(true_summs)
                gen_summs = [self.sent_splitter(summ) for summ in gen_summs]

            # storing the output batch for later dumping
            output_dc[OutputF.GOLD_SUMMS] += true_summs
            output_dc[OutputF.GEN_SUMM] += gen_summs
            output_dc[OutputF.REV] += group_revs
            output_dc[OutputF.CAT] += list(cats)
            output_dc[OutputF.GROUP_ID] += list(prod_ids)
            output_dc[OutputF.ROUGE] += res

        # running analytics
        if self.analytics_func:
            if self.sent_splitter:
                # performing a preliminary merge of sentences
                summs_to_analyze = [
                    " ".join(sents) for sents in output_dc[OutputF.GEN_SUMM]
                ]
            else:
                summs_to_analyze = output_dc[OutputF.GEN_SUMM]
            res = self.analytics_func(summs_to_analyze)
            logger.info("Ran analytics of generated summaries.")
            logger.info(" ".join("%s: %.3f" % (k, v) for k, v in res))

        final_metrs = rouge_evaluator.aggr(avg=self.avg_rouge)
        logger.info("ROUGE scores (avg_rouge=%s): " % self.avg_rouge)
        for k, v in final_metrs.items():
            logger.info("%s based avg. %s." % (k, metrics_to_str(v)))

        # converting lists to numpy object arrays element-by-element, so that
        # numpy does not attempt to collapse nested lists of unequal length
        for k in output_dc:
            length = len(output_dc[k])
            cont = np.zeros(length, dtype='object')
            for indx in range(length):
                cont[indx] = output_dc[k][indx]
            output_dc[k] = cont

        if output_file_path:
            gr_fields = [OutputF.CAT, OutputF.GROUP_ID]
            safe_mkfdir(output_file_path)
            output_file = codecs.open(output_file_path, 'w')
            output_dc.to_json(f=output_file, grouping_fnames=gr_fields)
            logger.info("Wrote the eval output to: "
                        "'%s'." % output_file_path)
        logger.info("Not generated %d summaries." % skipped_summs)