Example #1
    def test_order(self):
        """Testing production of chunks in a different order from the stream."""
        data_sizes = [200, 545]
        data_attrs_numbers = [5, 8, 2, 1, 15]
        inp_chunk_sizes = [1, 2, 3, 4, 5]
        buffer_sizes = [2, 38, 1000]

        for data_size, data_attrs_number, buffer_size, input_chunk_size in \
                itertools.product(data_sizes, data_attrs_numbers, buffer_sizes,
                                  inp_chunk_sizes):
            data = generate_data_chunk(data_attrs_number, data_size)
            inp_data_chunks = create_list_of_data_chunks(
                data, input_chunk_size)

            chunk_collector = ChunkShuffler(buffer_size=buffer_size)
            accum = ChunkAccumulator(collector=chunk_collector)

            actual_chunks = list(accum.iter(inp_data_chunks))
            actual_ds = concat_chunks(*actual_chunks)

            self.assertNotEqual(data, actual_ds)
            self.assertEqual(len(data), len(actual_ds))
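
For reference, the property this test asserts can be reproduced with plain lists: a bounded shuffle buffer reorders items but neither drops nor duplicates them. The sketch below is a hypothetical stand-in for ChunkShuffler, not its actual implementation.

import random

def buffered_shuffle(items, buffer_size, seed=0):
    """Yield items in a randomized order using a bounded buffer."""
    rng = random.Random(seed)
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) == buffer_size:
            rng.shuffle(buffer)
            yield from buffer
            buffer.clear()
    rng.shuffle(buffer)  # flush whatever is left in the buffer
    yield from buffer

data = list(range(10))
shuffled = list(buffered_shuffle(data, buffer_size=4))
assert sorted(shuffled) == data    # same items, same length,
assert len(shuffled) == len(data)  # only the order may change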

Example #2

    def gen_and_save_summs(self, data_source, output_file_path):
        """
        Generates summaries by running the model and writes them, along with
        other attributes, to a json file.

        :param data_source: kwargs for the validation data pipeline's iterator
            (consumed as self.val_data_pipeline.iter(**data_source)).
        :param output_file_path: path of the output json file.
        """
        safe_mkfdir(output_file_path)
        start_id = self.word_vocab[START].id
        end_id = self.word_vocab[END].id
        pad_id = self.word_vocab[PAD].id
        output_file = open(output_file_path, encoding='utf-8', mode='w')
        vocab_mapper = VocabMapper(
            {
                ModelF.REV: self.word_vocab,
                ModelF.GEN_SUMM: self.word_vocab,
                ModelF.GEN_REV: self.word_vocab
            },
            symbols_attr='token')
        chunk_coll = []

        for dc in self.val_data_pipeline.iter(**data_source):
            gen_revs, _, gen_summ, _ = self.imodel.predict(dc)

            # convert to a data-chunk to reuse its internal writing mechanism
            new_dc = DataChunk()
            for fn in [
                    ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID, ModelF.REV,
                    ModelF.GROUP_ID
            ]:
                new_dc[fn] = dc[fn]
            new_dc[ModelF.GEN_REV] = gen_revs
            new_dc[ModelF.GEN_SUMM] = gen_summ

            seq_fnames = [ModelF.GEN_SUMM, ModelF.GEN_REV, ModelF.REV]
            # converting PyTorch tensors to numpy arrays if present
            new_dc = convert_tensors_to_numpy(new_dc)
            for fn in seq_fnames:
                new_dc[fn] = format_seqs(new_dc[fn],
                                         start_id=start_id,
                                         end_id=end_id,
                                         pad_id=pad_id)
            # map token ids back to string tokens
            new_dc = vocab_mapper(new_dc)

            # convert all token sequences to sentence strings
            for fn in seq_fnames:
                new_dc[fn] = conv_seqs_to_sents(new_dc[fn])

            # group by product ids
            indxs = group_vals_by_keys(range(len(new_dc[ModelF.REV])),
                                       new_dc[ModelF.GROUP_ID]).values()

            for fn in [ModelF.GEN_REV, ModelF.REV]:
                new_dc[fn] = self._group_by_prods(indxs, new_dc[fn])

            del new_dc[ModelF.GROUP_ID]

            chunk_coll.append(new_dc)

        output_chunk = concat_chunks(*chunk_coll)

        output_chunk.to_json(
            f=output_file,
            grouping_fnames=[ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID])
        output_file.close()

        logger.info("Generated summaries and saved to: '%s'."
                    "" % output_file_path)

        # repetition analytics: generated summaries are lists of sentence
        # strings, so merge each into a single string before analysis
        all_gen_summ_strs = [
            " ".join(sents) for sents in output_chunk[ModelF.GEN_SUMM]
        ]

        an_metrics = ngram_seq_analysis(all_gen_summ_strs,
                                        tokenizer=self.tok_func,
                                        sent_splitter=self.sent_split_func,
                                        n_grams_to_comp=(2, 3, 4))

        logger.info("Ran analytics of generated summaries.")
        metrs_str = " ".join(["%s: %.3f" % (k, v) for k, v in an_metrics])
        logger.info(metrs_str)
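
The grouping step above relies on group_vals_by_keys. Assuming it is an order-preserving groupby that buckets parallel values under their keys, it could look roughly like this (a hypothetical sketch, not the project's actual helper):

def group_vals_by_keys(vals, keys):
    """Bucket vals under their parallel keys, preserving first-seen key order."""
    groups = {}  # Python 3.7+ dicts preserve insertion order
    for val, key in zip(vals, keys):
        groups.setdefault(key, []).append(val)
    return groups

group_vals_by_keys(range(5), ["a", "b", "a", "c", "b"])
# -> {'a': [0, 2], 'b': [1, 4], 'c': [3]}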
Example #3
    def _merge_chunks(self):
        """Merges the collected data-chunks into a single chunk and returns it."""
        return concat_chunks(*self._coll)
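
All three examples funnel into concat_chunks. Assuming a data-chunk behaves like a mapping from field names to sequences, its field-wise concatenation can be modeled as below (a simplified dict-of-lists sketch, not the real DataChunk API):

def concat_chunks(*chunks):
    """Concatenate dict-like chunks field by field."""
    merged = {}
    for chunk in chunks:
        for field, values in chunk.items():
            merged.setdefault(field, []).extend(values)
    return merged

c1 = {"rev": ["good"], "group_id": [1]}
c2 = {"rev": ["bad", "ok"], "group_id": [2, 2]}
assert concat_chunks(c1, c2) == {"rev": ["good", "bad", "ok"],
                                 "group_id": [1, 2, 2]}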