def test_vocabulary_mapper_multidim_lists(self):
        """Testing whether the mapper can map multi-dim lists."""
        target_field_name = "dummy"
        symbols_attr = "id"

        data_chunk = DataChunk(
            **{
                target_field_name:
                np.array(
                    [[["one"], ["two"]], [["three"], ["four", "five", "six"]]],
                    dtype="object")
            })
        exp_val = np.empty(2, dtype="object")
        exp_val[0] = np.array([[1], [2]])
        exp_val[1] = np.array([[3], [4, 5, 6]])
        expected_output_chunk = DataChunk(**{target_field_name: exp_val})

        # creating and populating a vocab
        vocab = Vocabulary()
        vocab.add_symbol("zero")
        vocab.add_symbol("one")
        vocab.add_symbol("two")
        vocab.add_symbol("three")
        vocab.add_symbol("four")
        vocab.add_symbol("five")
        vocab.add_symbol("six")

        mapper = VocabMapper({target_field_name: vocab},
                             symbols_attr=symbols_attr)
        actual_output_chunk = mapper(copy.deepcopy(data_chunk))

        self.assertTrue(actual_output_chunk == expected_output_chunk)
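
A minimal standalone sketch (assumed behavior, not VocabMapper's implementation) of the recursive token-to-id mapping the test above checks: every string in the nested structure is replaced by its vocabulary id.

def map_tokens_to_ids(value, token_to_id):
    # Recurse through nested lists/arrays and replace each token by its id.
    if isinstance(value, str):
        return token_to_id[value]
    return [map_tokens_to_ids(v, token_to_id) for v in value]

# map_tokens_to_ids([[["one"], ["two"]], [["three"], ["four"]]],
#                   {"one": 1, "two": 2, "three": 3, "four": 4})
# -> [[[1], [2]], [[3], [4]]]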
Example #2
    def test_vocabulary_mapper_mixed_field_values(self):
        """Testing whether the mapper can map multi-dim mixed field values."""
        target_field_name = "dummy"
        symbols_attr = "id"

        data_chunk = DataChunk(**{target_field_name: np.array([
            [["one"], np.array(["two", "one"])],
            [["three"], np.array(["four", "five", "six"])]
        ], dtype="object")})
        expected_output_chunk = DataChunk(**{target_field_name: np.array([
            [[1], np.array([2, 1])],
            [[3], np.array([4, 5, 6])]
        ], dtype="object")})

        # creating and populating a vocab
        vocab = Vocabulary()
        vocab.add_symbol("zero")
        vocab.add_symbol("one")
        vocab.add_symbol("two")
        vocab.add_symbol("three")
        vocab.add_symbol("four")
        vocab.add_symbol("five")
        vocab.add_symbol("six")

        mapper = VocabMapper({target_field_name: vocab},
                             symbols_attr=symbols_attr)
        actual_output_chunk = mapper(data_chunk)

        self.assertTrue(actual_output_chunk == expected_output_chunk)
    def test_vocabulary_mapper(self):
        """Testing whether the mapper allows to map back and forth field values.
        """
        data_path = 'mldp/tests/data/mock_data.csv'
        target_fields = ["first_name", "last_name", "email", "gender"]

        reader = CsvReader(sep=',')
        vocab = Vocabulary(reader)

        for target_field in target_fields:
            vocab.create(data_source={"data_path": data_path},
                         data_fnames=target_field)

            data = read_data_from_csv_file(data_path)
            data_original = copy.deepcopy(data)

            mapper_to = VocabMapper({target_field: vocab}, "id")
            mapper_back = VocabMapper({target_field: vocab}, "token")

            data = mapper_to(data)
            data = mapper_back(data)

            self.assertTrue(
                (data[target_field] == data_original[target_field]).all())
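
The round trip checked above can be illustrated with a self-contained sketch (the dict-based vocabulary below is an assumption, not the Vocabulary class used in the test): tokens are mapped to ids and then mapped back to the original tokens.

token_to_id = {"alice": 0, "bob": 1}
id_to_token = {i: t for t, i in token_to_id.items()}

names = ["alice", "bob", "alice"]
ids = [token_to_id[n] for n in names]      # forward mapping (symbols_attr="id")
restored = [id_to_token[i] for i in ids]   # backward mapping (symbols_attr="token")
assert restored == names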
Example #4
def assemble_train_pipeline(word_vocab,
                            max_groups_per_batch=1,
                            min_revs_per_group=None,
                            max_revs_per_group=10,
                            seed=None,
                            workers=1):
    """
    This pipeline is specific to the preprocessed Amazon and Yelp reviews.
    It creates a flow of transformation steps that modify the data until the
    final form, PyTorch tensors, is reached.

    :param word_vocab: vocabulary object with words/tokens.
    :param max_groups_per_batch: number of groups each batch should have.
    :param min_revs_per_group: minimum number of reviews a group must have in
                               order not to be discarded.
    :param max_revs_per_group: self-explanatory.
    :param seed: fixes the random state so that the same subsamples and
                 shuffles are produced every epoch. This has to do purely with
                 multi-processing issues in combination with numpy.
    :param workers: number of worker processes that run the pipeline.
    """
    assert START in word_vocab
    assert END in word_vocab

    group_files_shuffler = GroupFileShuffler()

    reader = CsvReader(sep='\t',
                       engine='python',
                       chunk_size=None,
                       encoding='utf-8',
                       quoting=QUOTE_NONE,
                       timeout=None,
                       worker_threads_num=1)

    fname_renamer = FieldRenamer({
        InpDataF.REV_TEXT: ModelF.REV,
        InpDataF.GROUP_ID: ModelF.GROUP_ID
    })

    unit_sampler = UnitSampler(id_fname=ModelF.GROUP_ID,
                               sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to add a postfix to each group_id so that summaries can be associated
    # with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.GROUP_ID)

    # to avoid having the same product/business appear in the same merged
    # data-chunk, buffer a small number of chunks, shuffle, and release them
    chunk_shuffler = ChunkAccumulator(ChunkShuffler(buffer_size=500))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes them along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    sorter = ChunkSorter(ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    summ_rev_indxs_creator = SummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID, category_fname=ModelF.CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=group_files_shuffler,
                               worker_processes_num=workers,
                               seed=seed,
                               error_on_invalid_chunk=False,
                               timeout=None)

    pipeline.add_step(fname_renamer)

    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)
    pipeline.add_step(chunk_accum)

    # entry transformations
    pipeline.add_step(tokenizer)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(sorter)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    return pipeline
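
A hypothetical usage sketch of the assembled pipeline; the vocabulary and the data path are placeholders that must exist beforehand, but the iteration call mirrors the pipeline.iter(**data_source) pattern used elsewhere in this file.

# word_vocab is assumed to be built/loaded already and to contain START/END/PAD
pipeline = assemble_train_pipeline(word_vocab,
                                   max_groups_per_batch=8,
                                   max_revs_per_group=10,
                                   seed=42,
                                   workers=2)
for batch in pipeline.iter(data_path="data/train"):
    # each batch is a data-chunk whose fields (ModelF.REV, ModelF.REV_MASK, ...)
    # arrive as padded PyTorch tensors
    pass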
Example #5
def assemble_tuning_pipeline(word_vocab,
                             max_groups_per_batch=1,
                             tok_func=None,
                             lowercase=False):
    """
    The pipeline yields tokenized reviews and summaries that can be used for
    training (fine-tuning of the model).
    """
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       chunk_size=None,
                       use_lists=True,
                       quoting=QUOTE_NONE)

    chunk_accum = ChunkAccumulator(new_size=max_groups_per_batch)

    ama_spec_trans = AmazonTransformer(fnames_to_copy=[
        GoldDataF.PROD_ID,
        GoldDataF.CAT,
    ])
    summ_mapper = SummMapper(fname=ModelF.SUMMS,
                             new_indx_fname=ModelF.SUMM_GROUP_INDX)

    token_processor = TokenProcessor(fnames=[ModelF.REV, ModelF.SUMM],
                                     tok_func=tok_func,
                                     lowercase=lowercase)

    vocab_mapper = VocabMapper({
        ModelF.REV: word_vocab,
        ModelF.SUMM: word_vocab
    })

    fname_renamer = FieldRenamer({
        GoldDataF.PROD_ID: ModelF.GROUP_ID,
        GoldDataF.CAT: ModelF.CAT,
        ModelF.SUMMS: ModelF.SUMM
    })

    seq_wrapper = SeqWrapper(fname=[ModelF.REV, ModelF.SUMM],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV, ModelF.SUMM],
                    new_mask_fname=[ModelF.REV_MASK, ModelF.SUMM_MASK],
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    # rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
    #                        group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
    #                        rev_mask_fname=ModelF.REV_MASK)

    # props
    len_prop = SummLenProp(summ_fname=ModelF.SUMM,
                           rev_fname=ModelF.REV,
                           group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                           new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.SUMM, new_fname=ModelF.POV_PROP)
    rouge_prop = SummRougeProp(summ_fname=ModelF.SUMM,
                               rev_fname=ModelF.REV,
                               group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                               summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                               new_fname=ModelF.ROUGE_PROP)
    rating_prop = DummyProp(fname=ModelF.SUMM,
                            new_fname=ModelF.RATING_PROP,
                            fval=0.)

    np_formatter = NumpyFormatter([
        ModelF.LEN_PROP, ModelF.RATING_PROP, ModelF.POV_PROP, ModelF.ROUGE_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    # pipeline.add_step(shuffler)
    pipeline.add_step(chunk_accum)

    pipeline.add_step(ama_spec_trans)
    pipeline.add_step(summ_mapper)

    pipeline.add_step(fname_renamer)

    pipeline.add_step(indxs_creator)

    # props
    pipeline.add_step(rating_prop)
    pipeline.add_step(rouge_prop)

    pipeline.add_step(token_processor)

    # the props below require tokenization
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(vocab_mapper)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)

    pipeline.add_step(np_formatter)

    return pipeline
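
The SeqWrapper and Padder steps used above can be pictured with a small standalone sketch (assumed behavior, not the steps' implementation): each id sequence is wrapped with start/end symbols, right-padded to the longest sequence, and a binary mask marks the real tokens.

import numpy as np

def wrap_and_pad(seqs, start_id, end_id, pad_id):
    # Wrap each sequence, pad on the right, and build a 1/0 mask.
    wrapped = [[start_id] + list(s) + [end_id] for s in seqs]
    max_len = max(len(s) for s in wrapped)
    padded = np.full((len(wrapped), max_len), pad_id, dtype="int64")
    mask = np.zeros((len(wrapped), max_len), dtype="float32")
    for i, s in enumerate(wrapped):
        padded[i, :len(s)] = s
        mask[i, :len(s)] = 1.
    return padded, mask

# wrap_and_pad([[5, 6], [7]], start_id=1, end_id=2, pad_id=0)
# -> [[1, 5, 6, 2], [1, 7, 2, 0]] and the matching mask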
Example #6
def assemble_eval_pipeline(word_vocab,
                           max_groups_per_chunk=1,
                           tok_func=None,
                           lowercase=False):
    """Assembles a data-pipeline for eval. against gold summaries."""
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       chunk_size=max_groups_per_chunk,
                       use_lists=True,
                       quoting=QUOTE_NONE)

    rouge_prop = SummEvalRougeKnob(
        hyp_fnames=[GoldDataF.SUMM1, GoldDataF.SUMM2, GoldDataF.SUMM3],
        ref_fnames=GoldDataF.REVS,
        new_fname=ModelF.ROUGE_PROP)

    field_dupl = FieldDuplicator({
        GoldDataF.SUMM1: TOK_SUMM1,
        GoldDataF.SUMM2: TOK_SUMM2,
        GoldDataF.SUMM3: TOK_SUMM3
    })
    tokenizer = TokenProcessor(fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3] +
                               GoldDataF.REVS,
                               tok_func=tok_func,
                               lowercase=lowercase)
    field_dropper = FieldDropper([TOK_SUMM1, TOK_SUMM2, TOK_SUMM3])

    rating_prop = DummyProp(fname=GoldDataF.PROD_ID,
                            new_fname=ModelF.RATING_PROP,
                            fval=0.)
    len_prop = SummEvalLenProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               rev_fnames=GoldDataF.REVS,
                               new_fname=ModelF.LEN_PROP)
    pov_prop = SummEvalPOVProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               new_fname=ModelF.POV_PROP)

    # summaries are left as strings; only reviews are mapped to ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    dataset_spec_trans = AmazonTransformer([
        GoldDataF.PROD_ID, GoldDataF.CAT, ModelF.ROUGE_PROP, ModelF.LEN_PROP,
        ModelF.RATING_PROP, ModelF.POV_PROP
    ])

    fname_renamer = FieldRenamer({
        GoldDataF.PROD_ID: ModelF.GROUP_ID,
        GoldDataF.CAT: ModelF.CAT
    })

    seq_wrapper = SeqWrapper(fname=[ModelF.REV],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV],
                    new_mask_fname=[ModelF.REV_MASK],
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    np_formatter = NumpyFormatter([
        ModelF.ROUGE_PROP, ModelF.LEN_PROP, ModelF.RATING_PROP, ModelF.POV_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rouge_prop)
    pipeline.add_step(rating_prop)

    # props that require tokenization
    pipeline.add_step(field_dupl)
    pipeline.add_step(tokenizer)
    pipeline.add_step(pov_prop)
    pipeline.add_step(len_prop)
    pipeline.add_step(field_dropper)

    pipeline.add_step(dataset_spec_trans)

    pipeline.add_step(vocab_mapper)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)

    pipeline.add_step(padder)

    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)

    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab,
                           max_groups_per_chunk=1,
                           dataset='yelp',
                           tokenization_func=lambda x: x.split()):
    """Assembles the pipeline for evaluation on the YELP and Amazon eval set."""
    assert dataset in ['yelp', 'amazon']

    if dataset == 'yelp':
        fields_obj = YelpEvalF
        fname_renamer = FieldRenamer({fields_obj.BUSINESS_ID: ModelF.GROUP_ID})
        dataset_spec_trans = YelpTransformer()
    else:
        fields_obj = AmazonEvalF
        fname_renamer = FieldRenamer({
            fields_obj.PROD_ID: ModelF.GROUP_ID,
            fields_obj.CAT: ModelF.CAT
        })
        dataset_spec_trans = AmazonTransformer()

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       quotechar='\'',
                       chunk_size=max_groups_per_chunk)

    # notice that I do not tokenize summaries, I leave them as they are!
    token_processor = TokenProcessor(fnames=fields_obj.REVS,
                                     tokenization_func=tokenization_func)
    # notice that I don't convert summary tokens to ids
    vocab_mapper = VocabMapper({fn: word_vocab for fn in fields_obj.REVS})

    seq_wrapper = SeqWrapper(ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[
                             ModelF.REV, ModelF.REV_MASK, ModelF.CAT,
                             ModelF.GROUP_ID
                         ])

    indxs_creator = GoldSummRevIndxsCreator(group_id_fname=ModelF.GROUP_ID)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    formatter = PyTorchFormatter()

    pipeline = Pipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)

    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(fname_renamer)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)

    pipeline.add_step(formatter)

    return pipeline
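
The ChunkSorter step can be pictured with a minimal sketch (the descending order is an assumption made here for illustration): parallel fields are reordered by the computed review lengths.

import numpy as np

rev_len = np.array([3, 7, 5])
revs = np.array(["a b c", "d e f g h i j", "k l m n o"], dtype="object")
group_id = np.array(["g1", "g2", "g1"], dtype="object")

order = np.argsort(-rev_len)  # longest reviews first (assumed direction)
rev_len, revs, group_id = rev_len[order], revs[order], group_id[order]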
Example #8
def assemble_infer_pipeline(word_vocab,
                            max_groups_per_chunk=1,
                            max_reviews=10,
                            tokenization_func=lambda x: x.split()):
    """Assembles a simple inference pipeline for summary generation. Assumes
    that csv files are read where reviews have the following column names:
    'rev1', 'rev2', ..., 'revN', each review separated by \t.

    Args:
        word_vocab: word vocabulary to convert words to ids.
        max_groups_per_chunk: self-explanatory.
        max_reviews: the maximum number of reviews to load per group. Columns
            in the CSV file should be `rev1`, ..., `revN`.
        tokenization_func: self-explanatory.
    """
    rev_fnames = [
        f'{InfDataF.REV_PREFIX}{i}' for i in range(1, max_reviews + 1)
    ]

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       quotechar='\'',
                       chunk_size=max_groups_per_chunk)

    rev_flattener = ReviewFlattener(group_id_fname=InfDataF.GROUP_ID,
                                    rev_fnames=rev_fnames)

    token_processor = TokenProcessor(fnames=ModelF.REV,
                                     tokenization_func=tokenization_func)
    # notice that I don't convert summary tokens to ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.GROUP_ID])

    # re-using the gold-summary index creation step
    summ_rev_indx_creator = GoldSummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID)

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rev_flattener)
    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)

    pipeline.add_step(summ_rev_indx_creator)

    return pipeline
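
A toy input file in the layout the docstring above describes; the exact group-id column name is an assumption (InfDataF.GROUP_ID in the code), while the tab-separated rev1..revN columns follow the docstring.

import csv

rows = [
    {"group_id": "prod_1", "rev1": "great product", "rev2": "works well"},
    {"group_id": "prod_2", "rev1": "did not like it", "rev2": "too expensive"},
]
with open("toy_reviews.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["group_id", "rev1", "rev2"],
                            delimiter="\t")
    writer.writeheader()
    writer.writerows(rows)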
    def gen_and_save_summs(self, data_source, output_file_path):
        """
        Generates summaries by running the model and writes them along with other
        attributes to a json file.

        :param data_source: self-explanatory.
        :param output_file_path: self-explanatory.
        """
        safe_mkfdir(output_file_path)
        start_id = self.word_vocab[START].id
        end_id = self.word_vocab[END].id
        pad_id = self.word_vocab[PAD].id
        output_file = open(output_file_path, encoding='utf-8', mode='w')
        vocab_mapper = VocabMapper(
            {
                ModelF.REV: self.word_vocab,
                ModelF.GEN_SUMM: self.word_vocab,
                ModelF.GEN_REV: self.word_vocab
            },
            symbols_attr='token')
        chunk_coll = []

        for i, dc in enumerate(self.val_data_pipeline.iter(**data_source), 1):
            gen_revs, _, gen_summ, _ = self.imodel.predict(dc)

            # converting to a data-chunk to use the internal writing
            # mechanism
            new_dc = DataChunk()
            for fn in [
                    ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID, ModelF.REV,
                    ModelF.GROUP_ID
            ]:
                new_dc[fn] = dc[fn]
            new_dc[ModelF.GEN_REV] = gen_revs
            new_dc[ModelF.GEN_SUMM] = gen_summ

            seq_fnames = [ModelF.GEN_SUMM, ModelF.GEN_REV, ModelF.REV]
            # converting PyTorch tensors to numpy arrays if present
            new_dc = convert_tensors_to_numpy(new_dc)
            for fn in seq_fnames:
                new_dc[fn] = format_seqs(new_dc[fn],
                                         start_id=start_id,
                                         end_id=end_id,
                                         pad_id=pad_id)
            new_dc = vocab_mapper(new_dc)

            # convert all seqs to strings
            for fn in seq_fnames:
                new_dc[fn] = conv_seqs_to_sents(new_dc[fn])

            # group by product ids
            indxs = group_vals_by_keys(range(len(new_dc[ModelF.REV])),
                                       new_dc[ModelF.GROUP_ID]).values()

            for fn in [ModelF.GEN_REV, ModelF.REV]:
                new_dc[fn] = self._group_by_prods(indxs, new_dc[fn])

            del new_dc[ModelF.GROUP_ID]

            chunk_coll.append(new_dc)

        output_chunk = concat_chunks(*chunk_coll)

        output_chunk.to_json(
            f=output_file,
            grouping_fnames=[ModelF.SUMM_CAT, ModelF.SUMM_GROUP_ID])

        logger.info("Generated summaries and saved to: '%s'."
                    "" % output_file_path)

        # analytics for repetition checking
        # because generated summaries contain lists of strings, they need to be
        # merged together before running analytics
        all_gen_summ_strs = [
            " ".join(sents) for sents in output_chunk[ModelF.GEN_SUMM]
        ]

        an_metrics = ngram_seq_analysis(all_gen_summ_strs,
                                        tokenizer=self.tok_func,
                                        sent_splitter=self.sent_split_func,
                                        n_grams_to_comp=(2, 3, 4))

        logger.info("Ran analytics of generated summaries.")
        metrs_str = " ".join(["%s: %.3f" % (k, v) for k, v in an_metrics])
        logger.info(metrs_str)
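
The grouping step above relies on the project helper group_vals_by_keys; a minimal stand-in showing the assumed behavior (positions collected per group id, first-seen order preserved) could look like this:

from collections import OrderedDict

def group_indices_by_key(indices, keys):
    # Hypothetical equivalent of group_vals_by_keys, for illustration only.
    groups = OrderedDict()
    for idx, key in zip(indices, keys):
        groups.setdefault(key, []).append(idx)
    return groups

# group_indices_by_key(range(4), ["p1", "p2", "p1", "p2"])
# -> OrderedDict([('p1', [0, 2]), ('p2', [1, 3])])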
Example #10
    def test_how_to_apply_run(self):

        data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

        # paths where vocabs will be saved and later loaded from
        words_vocab_file_path = os.path.join(self.tutorials_path,
                                             "data/vocabs/words.txt")
        labels_vocab_file_path = os.path.join(self.tutorials_path,
                                              'data/vocabs/labels.txt')

        # creating step objects
        twitter_tokenizer = TweetTokenizer()
        preprocessor = TwitterFilesPreprocessor(
            input_cols_number=3,
            tweets_indx=2,
            add_header=['ids', 'labels', 'tweets'])
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(
            fnames="tweets",
            tok_func=twitter_tokenizer.tokenize,
            tok_cleaning_func=twitter_text_cleaner,
            lowercase=True)

        # data pipeline for vocabularies creation
        vocab_data_pipeline = Pipeline(reader=csv_reader,
                                       preprocessor=preprocessor,
                                       worker_processes_num=0,
                                       name_prefix="vocabs")
        vocab_data_pipeline.add_step(fields_selector)
        vocab_data_pipeline.add_step(token_processor)

        # creating or loading vocabs
        words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
        words_vocab.load_or_create(words_vocab_file_path,
                                   data_source={"data_path": data_path},
                                   data_fnames="tweets")

        labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
        labels_vocab.load_or_create(labels_vocab_file_path,
                                    data_source={"data_path": data_path},
                                    data_fnames="labels")

        print(words_vocab)

        print(labels_vocab)

        print(vocab_data_pipeline)

        # extra steps for training and evaluation
        mapper = VocabMapper(field_names_to_vocabs={
            "tweets": words_vocab,
            "labels": labels_vocab
        })
        padder = Padder(fname="tweets",
                        new_mask_fname="tweets_mask",
                        pad_symbol=words_vocab[PAD].id)
        formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                            labels_field_name="labels",
                                            classes_number=len(labels_vocab))

        # building the actual pipeline
        dev_data_pipeline = Pipeline(reader=csv_reader,
                                     preprocessor=preprocessor,
                                     worker_processes_num=1,
                                     name_prefix="dev")
        dev_data_pipeline.add_step(fields_selector)
        dev_data_pipeline.add_step(token_processor)
        dev_data_pipeline.add_step(mapper)
        dev_data_pipeline.add_step(padder)
        dev_data_pipeline.add_step(formatter)

        print(dev_data_pipeline)

        epochs = 2

        i_model = ISentiLSTM(dev_data_pipeline)
        i_model.init_model(words_vocab_size=len(words_vocab),
                           input_dim=50,
                           lstm_hidden_dim=120,
                           number_of_classes=len(labels_vocab),
                           mask_symbol=words_vocab[PAD].id)
Example #11
def assemble_unsup_pipeline(word_vocab,
                            max_groups_per_batch=1,
                            reader_threads=5,
                            min_revs_per_group=None,
                            max_revs_per_group=10,
                            worker_num=1,
                            seed=None,
                            tok_func=None,
                            lowercase=True,
                            max_len=None,
                            shuffler_buffer_size=250):
    """Creates a data-pipeline that yields batches for to train the unsup. model.

    Creates a flow of data transformation steps that modify the data until the
    final form, PyTorch tensors, is reached.

    Args:
        word_vocab: vocabulary object with words/tokens.
        max_groups_per_batch: number of groups each batch should have.
        min_revs_per_group: minimum number of reviews a group must have in
            order not to be discarded.
        max_revs_per_group: self-explanatory.
        seed: used to obtain the same data subsamples/shuffles every epoch.
        max_len: if passed, will filter out all reviews that are longer than
            the threshold.
    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """
    assert START in word_vocab and END in word_vocab

    file_shuffler = FileShuffler()

    # TODO: explain how grouping works here - each file has reviews of a group

    reader = CsvReader(sep='\t',
                       engine='c',
                       chunk_size=None,
                       encoding='utf-8',
                       quoting=QUOTE_NONE,
                       buffer_size=200,
                       timeout=None,
                       worker_threads_num=reader_threads,
                       use_lists=True)

    fname_renamer = FieldRenamer({
        InpDataF.REV_TEXT: ModelF.REV,
        InpDataF.GROUP_ID: ModelF.REV_GROUP_ID,
        InpDataF.RATING: ModelF.REV_RATING,
        InpDataF.RATING_DEV: ModelF.RATING_PROP,
        InpDataF.CAT: ModelF.REV_CAT
    })

    unit_sampler = UnitSampler(id_fname=ModelF.REV_GROUP_ID,
                               sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)

    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to add a postfix to each group_id so that summaries can be associated
    # with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.REV_GROUP_ID)

    # property and related steps
    len_prop = LenProp(len_fname=ModelF.REV_LEN, new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.REV, new_fname=ModelF.POV_PROP)

    rouge_field_merger = FieldMerger(
        merge_fnames=[InpDataF.ROUGE1, InpDataF.ROUGE2, InpDataF.ROUGEL],
        new_fname=ModelF.ROUGE_PROP)

    # to avoid having the same product/business appear in the same merged
    # data-chunk, buffer a small number of chunks, shuffle, and release them
    chunk_shuffler = ChunkAccumulator(
        ChunkShuffler(buffer_size=shuffler_buffer_size))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes them along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch, strict=True)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV,
                               tok_func=tok_func,
                               lowercase=lowercase)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].token,
                             end_el=word_vocab[END].token)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    summ_rev_indxs_creator = GroupRevIndxsCreator(
        rev_group_id_fname=ModelF.REV_GROUP_ID, rev_cat_fname=ModelF.REV_CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    # extra steps for the loss associated with probability mass
    un_word_cal = UniqueWordCalc(
        new_fname=ModelF.OTHER_REV_UWORDS,
        rev_fname=ModelF.REV,
        other_rev_indxs_fname=ModelF.OTHER_REV_INDXS,
        other_rev_indxs_mask_fname=ModelF.OTHER_REV_INDXS_MASK)
    un_word_padder = Padder(fname=ModelF.OTHER_REV_UWORDS,
                            new_mask_fname=ModelF.OTHER_REV_UWORDS_MASK,
                            pad_symbol=word_vocab[PAD].id,
                            padding_mode='right')

    numpy_formatter = NumpyFormatter(fnames=[
        ModelF.ROUGE_PROP, ModelF.RATING_PROP, ModelF.LEN_PROP, ModelF.POV_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=file_shuffler,
                               worker_processes_num=worker_num,
                               seed=seed,
                               output_buffer_size=50,
                               error_on_invalid_chunk=False,
                               timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(rouge_field_merger)
    pipeline.add_step(tokenizer)

    if max_len:
        pipeline.add_step(TextLenFilter(fname=ModelF.REV, max_len=max_len))

    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)

    pipeline.add_step(chunk_shuffler)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)

    # properties
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(chunk_accum)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    # adding steps for unique word count computation
    pipeline.add_step(un_word_cal)
    pipeline.add_step(un_word_padder)

    pipeline.add_step(numpy_formatter)

    return pipeline