Example #1
    def test_multi_threaded_reader_output(self):
        """
        Check whether the multi-threaded reader produces the correct output.
        """
        data_paths = [
            'mldp/tests/data/small_chunks/chunk1.csv',
            'mldp/tests/data/small_chunks/chunk2.csv',
            'mldp/tests/data/small_chunks/chunk3.csv'
        ]
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size,
                           worker_threads_num=3,
                           sep=',')

        expected_data = read_data_from_csv_file(data_paths)

        actual_data_chunks = DataChunk()
        for data_chunk in reader.iter(data_path=data_paths):
            for key in data_chunk.keys():
                if key not in actual_data_chunks:
                    actual_data_chunks[key] = np.array([])
                actual_data_chunks[key] = np.concatenate(
                    [actual_data_chunks[key], data_chunk[key]])
        self.compare_unsorted_data_chunks(dc1=expected_data,
                                          dc2=actual_data_chunks,
                                          sort_key='id')
Example #2
    def test_valid_paths(self):
        """
        Passes intentionally invalid input to the reader and expects it to
        raise an error.
        """
        data_paths = ["a", "b", 123123123, 123.12313]
        reader = CsvReader()

        with self.assertRaises(ValueError):
            itr = reader.iter(data_path=data_paths)
            next(iter(itr))
Example #3
    def test_invalid_steps(self):
        """Testing whether an error is raised if an invalid step is present."""
        data_path = 'mldp/tests/data/news.csv'
        data_source = {'data_path': data_path}

        inv_reader = InvalidCsvReader()
        val_reader = CsvReader()

        val_transf1 = FieldSelector("text")
        val_transf2 = TokenProcessor(fnames='text')
        inv_transf1 = InvalidTransformer()
        accum = ChunkAccumulator(new_size=3)
        formatter = PandasFormatter()

        # try the invalid reader with only valid steps
        dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
        for vs in [val_transf1, val_transf2, accum, formatter]:
            dp.add_step(vs)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass

        # try the valid reader with an invalid step among the valid ones
        steps = [val_transf1, val_transf2, inv_transf1, accum]
        for st in permutations(steps):
            dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
            for s in st:
                dp.add_step(s)
            dp.add_step(formatter)
            with self.assertRaises(DataChunkError):
                for _ in dp.iter(**data_source):
                    pass
Example #4
    def test_readme_example(self):
        from mltoolkit.mldp.pipeline import Pipeline
        from mltoolkit.mldp.steps.readers import CsvReader
        from mltoolkit.mldp.steps.transformers.nlp import TokenProcessor, Padder
        from mltoolkit.mldp.steps.transformers.field import FieldSelector

        data_path = "mltoolkit/mldp/tests/data/tweets.csv"

        # creating steps
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(fnames="tweets",
                                         tok_func=lambda x: x.split(),
                                         lowercase=True)
        padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                        pad_symbol="<PAD>")

        # creating the pipeline
        pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
        pipeline.add_step(fields_selector)
        pipeline.add_step(token_processor)
        pipeline.add_step(padder)

        # iterate over data chunks
        for data_chunk in pipeline.iter(data_path=data_path):
            pass

        # generate documentation and print it
        print(pipeline)
Example #5
    def test_simple_scenario(self):
        """
        Runs the pipeline with different numbers of worker processes; the test
        is considered successful if iteration completes and yields data.
        """
        data_path = 'mldp/tests/data/small_chunks'
        field_names = ['first_name', 'email']
        worker_processes_nums = [0, 1, 2, 3, 4]

        reader = CsvReader(sep=",")

        for wpn in worker_processes_nums:

            dev_data_pipeline = Pipeline(reader=reader,
                                         worker_processes_num=wpn)
            dev_data_pipeline.add_step(FieldSelector(field_names))
            dev_data_pipeline.add_step(ChunkAccumulator(new_size=3))
            dev_data_pipeline.add_step(PandasFormatter())

            flag = False
            for data_chunk in dev_data_pipeline.iter(data_path=data_path):
                flag = True
                self.assertTrue(len(data_chunk) > 0)

            self.assertTrue(flag)
Example #6
def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses model on tokenized csv files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE,
                       sep='\t',
                       engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
Example #7
    def test_output(self):
        """Checking if read data-chunks are valid."""
        data_path = 'mltoolkit/mldp/tests/data/small_chunks/chunk2.csv'
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1, sep=',',
                           encoding='utf-8', use_lists=False)

        data = read_data_from_csv_file(data_path, encoding='utf-8')
        expected_chunks = create_list_of_data_chunks(data,
                                                     chunk_size=chunk_size)

        itr = reader.iter(data_path=data_path)
        i = 0
        for (actual_chunk, expected_chunk) in zip(itr, expected_chunks):
            self.assertTrue(actual_chunk == expected_chunk)
            i += 1

        self.assertTrue(i == len(expected_chunks) and len(expected_chunks) > 0)
Example #8
def assemble_vocab_pipeline(text_fname, sep='\t', encoding='utf-8',
                            tok_func=None, lowercase=True):
    """Assembler for the vocabulary pipeline based on a CSV reader."""
    reader = CsvReader(sep=sep, encoding=encoding, quoting=QUOTE_NONE)
    token_processor = TokenProcessor(fnames=text_fname, lowercase=lowercase,
                                     tok_func=tok_func)
    # creating vocabulary pipeline
    vocab_pipeline = Pipeline(reader=reader)
    vocab_pipeline.add_step(token_processor)
    return vocab_pipeline
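
A minimal usage sketch of the assembler above, following the Vocabulary API used in example #19; the vocabulary path, data path, and field name are hypothetical placeholders:

# Hypothetical usage: build a word vocabulary over the 'text' column of a
# tab-separated file, re-using the pipeline assembled above.
vocab_pipeline = assemble_vocab_pipeline(text_fname='text', sep='\t')
word_vocab = Vocabulary(vocab_pipeline, name_prefix='words')
word_vocab.load_or_create('vocabs/words.txt',
                          data_source={'data_path': 'data/train.csv'},
                          data_fnames='text')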
Example #9
    def test_invalid_pipeline(self):
        """
        Tries to create an invalid data processing pipeline, and expects to
        get an error.
        """
        reader = CsvReader()
        with self.assertRaises(ValueError):
            data_pipeline = Pipeline(reader)
            data_pipeline.add_step(FieldSelector(["dummy"]))
            data_pipeline.add_step(PandasFormatter())
            data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
Example #10
    def test_empty_chunks(self):
        """Testing whether empty chunks do not reach user."""
        data_path = 'mltoolkit/mldp/tests/data/small_chunks'
        field_names = ['first_name', 'email']
        reader = CsvReader(chunk_size=1, sep=",")
        empty_chunk_transformer = EmptyChunkTransformer(max_count=3)

        dev_data_pipeline = Pipeline(reader=reader)
        dev_data_pipeline.add_step(empty_chunk_transformer)
        dev_data_pipeline.add_step(FieldSelector(field_names))

        flag = False
        for dc in dev_data_pipeline.iter(data_path=data_path):
            flag = True
            self.assertFalse(equal_to_constant(dc, EMPTY_CHUNK))

        self.assertTrue(flag)
Example #11
    def test_vocabulary_mapper(self):
        """Testing whether the mapper can map field values to ids and back."""
        data_path = 'mldp/tests/data/mock_data.csv'
        target_fields = ["first_name", "last_name", "email", "gender"]

        reader = CsvReader(sep=',')
        vocab = Vocabulary(reader)

        for target_field in target_fields:
            vocab.create(data_source={"data_path": data_path},
                         data_fnames=target_field)

            data = read_data_from_csv_file(data_path)
            data_original = copy.deepcopy(data)

            mapper_to = VocabMapper({target_field: vocab}, "id")
            mapper_back = VocabMapper({target_field: vocab}, "token")

            data = mapper_to(data)
            data = mapper_back(data)

            self.assertTrue(
                (data[target_field] == data_original[target_field]).all())
Example #12
def assemble_train_pipeline(word_vocab,
                            max_groups_per_batch=1,
                            min_revs_per_group=None,
                            max_revs_per_group=10,
                            seed=None,
                            workers=1):
    """
    This pipeline is specific to the preprocessed Amazon and Yelp reviews.
    Creates a flow of transformation steps that modify the data until it
    reaches its final form as PyTorch tensors.

    :param word_vocab: vocabulary object with words/tokens.
    :param max_groups_per_batch: number of groups each batch should have.
    :param min_revs_per_group: minimum number of reviews a group must have in
                               order not to be discarded.
    :param max_revs_per_group: maximum number of reviews to sample per group.
    :param seed: fixes the random seed so that the same sequence of batches is
                 produced every epoch; relevant when multi-processing is used,
                 due to how numpy seeding interacts with worker processes.
    :param workers: number of worker processes used to run the pipeline.
    """
    assert START in word_vocab
    assert END in word_vocab

    group_files_shuffler = GroupFileShuffler()

    reader = CsvReader(sep='\t',
                       engine='python',
                       chunk_size=None,
                       encoding='utf-8',
                       quoting=QUOTE_NONE,
                       timeout=None,
                       worker_threads_num=1)

    fname_renamer = FieldRenamer({
        InpDataF.REV_TEXT: ModelF.REV,
        InpDataF.GROUP_ID: ModelF.GROUP_ID
    })

    unit_sampler = UnitSampler(id_fname=ModelF.GROUP_ID,
                               sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.GROUP_ID)

    # to avoid having same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(ChunkShuffler(buffer_size=500))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    sorter = ChunkSorter(ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    summ_rev_indxs_creator = SummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID, category_fname=ModelF.CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=group_files_shuffler,
                               worker_processes_num=workers,
                               seed=seed,
                               error_on_invalid_chunk=False,
                               timeout=None)

    pipeline.add_step(fname_renamer)

    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)
    pipeline.add_step(chunk_accum)

    # entry transformations
    pipeline.add_step(tokenizer)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(sorter)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    return pipeline
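
A minimal sketch of how the assembled training pipeline might be consumed; word_vocab and the data path are placeholders, and the iteration call follows the pipeline.iter(data_path=...) pattern used in the other examples:

# Hypothetical consumption of the training pipeline assembled above.
pipeline = assemble_train_pipeline(word_vocab,
                                   max_groups_per_batch=8,
                                   max_revs_per_group=10,
                                   workers=2)
for batch in pipeline.iter(data_path='data/reviews/train'):
    # each batch is a data-chunk whose fields (e.g. ModelF.REV, ModelF.REV_MASK)
    # arrive as PyTorch tensors, as described in the docstring above
    pass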
Example #13
def assemble_tuning_pipeline(word_vocab,
                             max_groups_per_batch=1,
                             tok_func=None,
                             lowercase=False):
    """
    The pipeline yields tokenized reviews and summaries that can be used for
    training (fine-tuning of the model).
    """
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       chunk_size=None,
                       use_lists=True,
                       quoting=QUOTE_NONE)

    chunk_accum = ChunkAccumulator(new_size=max_groups_per_batch)

    ama_spec_trans = AmazonTransformer(fnames_to_copy=[
        GoldDataF.PROD_ID,
        GoldDataF.CAT,
    ])
    summ_mapper = SummMapper(fname=ModelF.SUMMS,
                             new_indx_fname=ModelF.SUMM_GROUP_INDX)

    token_processor = TokenProcessor(fnames=[ModelF.REV, ModelF.SUMM],
                                     tok_func=tok_func,
                                     lowercase=lowercase)

    vocab_mapper = VocabMapper({
        ModelF.REV: word_vocab,
        ModelF.SUMM: word_vocab
    })

    fname_renamer = FieldRenamer({
        GoldDataF.PROD_ID: ModelF.GROUP_ID,
        GoldDataF.CAT: ModelF.CAT,
        ModelF.SUMMS: ModelF.SUMM
    })

    seq_wrapper = SeqWrapper(fname=[ModelF.REV, ModelF.SUMM],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV, ModelF.SUMM],
                    new_mask_fname=[ModelF.REV_MASK, ModelF.SUMM_MASK],
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    # rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
    #                        group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
    #                        rev_mask_fname=ModelF.REV_MASK)

    # props
    len_prop = SummLenProp(summ_fname=ModelF.SUMM,
                           rev_fname=ModelF.REV,
                           group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                           new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.SUMM, new_fname=ModelF.POV_PROP)
    rouge_prop = SummRougeProp(summ_fname=ModelF.SUMM,
                               rev_fname=ModelF.REV,
                               group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                               summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                               new_fname=ModelF.ROUGE_PROP)
    rating_prop = DummyProp(fname=ModelF.SUMM,
                            new_fname=ModelF.RATING_PROP,
                            fval=0.)

    np_formatter = NumpyFormatter([
        ModelF.LEN_PROP, ModelF.RATING_PROP, ModelF.POV_PROP, ModelF.ROUGE_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    # pipeline.add_step(shuffler)
    pipeline.add_step(chunk_accum)

    pipeline.add_step(ama_spec_trans)
    pipeline.add_step(summ_mapper)

    pipeline.add_step(fname_renamer)

    pipeline.add_step(indxs_creator)

    # props
    pipeline.add_step(rating_prop)
    pipeline.add_step(rouge_prop)

    pipeline.add_step(token_processor)

    # the props below require tokenization
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(vocab_mapper)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)

    pipeline.add_step(np_formatter)

    return pipeline
Example #14
def assemble_eval_pipeline(word_vocab,
                           max_groups_per_chunk=1,
                           tok_func=None,
                           lowercase=False):
    """Assembles a data-pipeline for eval. against gold summaries."""
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       chunk_size=max_groups_per_chunk,
                       use_lists=True,
                       quoting=QUOTE_NONE)

    rouge_prop = SummEvalRougeKnob(
        hyp_fnames=[GoldDataF.SUMM1, GoldDataF.SUMM2, GoldDataF.SUMM3],
        ref_fnames=GoldDataF.REVS,
        new_fname=ModelF.ROUGE_PROP)

    field_dupl = FieldDuplicator({
        GoldDataF.SUMM1: TOK_SUMM1,
        GoldDataF.SUMM2: TOK_SUMM2,
        GoldDataF.SUMM3: TOK_SUMM3
    })
    tokenizer = TokenProcessor(fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3] +
                               GoldDataF.REVS,
                               tok_func=tok_func,
                               lowercase=lowercase)
    field_dropper = FieldDropper([TOK_SUMM1, TOK_SUMM2, TOK_SUMM3])

    rating_prop = DummyProp(fname=GoldDataF.PROD_ID,
                            new_fname=ModelF.RATING_PROP,
                            fval=0.)
    len_prop = SummEvalLenProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               rev_fnames=GoldDataF.REVS,
                               new_fname=ModelF.LEN_PROP)
    pov_prop = SummEvalPOVProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               new_fname=ModelF.POV_PROP)

    # summaries are not converted to token ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    dataset_spec_trans = AmazonTransformer([
        GoldDataF.PROD_ID, GoldDataF.CAT, ModelF.ROUGE_PROP, ModelF.LEN_PROP,
        ModelF.RATING_PROP, ModelF.POV_PROP
    ])

    fname_renamer = FieldRenamer({
        GoldDataF.PROD_ID: ModelF.GROUP_ID,
        GoldDataF.CAT: ModelF.CAT
    })

    seq_wrapper = SeqWrapper(fname=[ModelF.REV],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV],
                    new_mask_fname=[ModelF.REV_MASK],
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    np_formatter = NumpyFormatter([
        ModelF.ROUGE_PROP, ModelF.LEN_PROP, ModelF.RATING_PROP, ModelF.POV_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rouge_prop)
    pipeline.add_step(rating_prop)

    # props that require tokenization
    pipeline.add_step(field_dupl)
    pipeline.add_step(tokenizer)
    pipeline.add_step(pov_prop)
    pipeline.add_step(len_prop)
    pipeline.add_step(field_dropper)

    pipeline.add_step(dataset_spec_trans)

    pipeline.add_step(vocab_mapper)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)

    pipeline.add_step(padder)

    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)

    pipeline.add_step(np_formatter)

    return pipeline
Example #15
def assemble_eval_pipeline(word_vocab,
                           max_groups_per_chunk=1,
                           dataset='yelp',
                           tokenization_func=lambda x: x.split()):
    """Assembles the pipeline for evaluation on the YELP and Amazon eval set."""
    assert dataset in ['yelp', 'amazon']

    if dataset == 'yelp':
        fields_obj = YelpEvalF
        fname_renamer = FieldRenamer({fields_obj.BUSINESS_ID: ModelF.GROUP_ID})
        dataset_spec_trans = YelpTransformer()
    else:
        fields_obj = AmazonEvalF
        fname_renamer = FieldRenamer({
            fields_obj.PROD_ID: ModelF.GROUP_ID,
            fields_obj.CAT: ModelF.CAT
        })
        dataset_spec_trans = AmazonTransformer()

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       quotechar='\'',
                       chunk_size=max_groups_per_chunk)

    # notice that I do not tokenize summaries, I leave them as they are!
    token_processor = TokenProcessor(fnames=fields_obj.REVS,
                                     tokenization_func=tokenization_func)
    # notice that I don't convert summs tokens to ids
    vocab_mapper = VocabMapper({fn: word_vocab for fn in fields_obj.REVS})

    seq_wrapper = SeqWrapper(ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[
                             ModelF.REV, ModelF.REV_MASK, ModelF.CAT,
                             ModelF.GROUP_ID
                         ])

    indxs_creator = GoldSummRevIndxsCreator(group_id_fname=ModelF.GROUP_ID)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    formatter = PyTorchFormatter()

    pipeline = Pipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)

    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(fname_renamer)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)

    pipeline.add_step(formatter)

    return pipeline
Example #16
    def setUp(self):
        self.reader = CsvReader(sep=',')
Example #17
def assemble_infer_pipeline(word_vocab,
                            max_groups_per_chunk=1,
                            max_reviews=10,
                            tokenization_func=lambda x: x.split()):
    """Assembles a simple inference pipeline for summary generation. Assumes
    that csv files are read where reviews have the following column names:
    'rev1', 'rev2', ..., 'revN', each review separated by \t.

    Args:
        word_vocab: word vocabulary to convert words to ids.
        max_groups_per_chunk: self-explanatory.
        max_reviews: the maximum number of reviews to load per group. Columns
            in the CSV file should be `rev1`, ...., `revN`.
        tokenization_func: self-explanatory.
    """
    rev_fnames = [
        f'{InfDataF.REV_PREFIX}{i}' for i in range(1, max_reviews + 1)
    ]

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t',
                       encoding='utf-8',
                       engine='python',
                       quotechar='\'',
                       chunk_size=max_groups_per_chunk)

    rev_flattener = ReviewFlattener(group_id_fname=InfDataF.GROUP_ID,
                                    rev_fnames=rev_fnames)

    token_processor = TokenProcessor(fnames=ModelF.REV,
                                     tokenization_func=tokenization_func)
    # notice that I don't convert summs tokens to ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.GROUP_ID])

    # re-using the step
    summ_rev_indx_creator = GoldSummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID)

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rev_flattener)
    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)

    pipeline.add_step(summ_rev_indx_creator)

    return pipeline
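
A minimal usage sketch for the inference pipeline above; the input path and word_vocab are placeholders:

# Hypothetical usage: iterate over inference batches built from a tab-separated
# file with 'rev1', ..., 'revN' columns, as assumed by the docstring.
infer_pipeline = assemble_infer_pipeline(word_vocab,
                                         max_groups_per_chunk=4,
                                         max_reviews=8)
for batch in infer_pipeline.iter(data_path='data/infer_input.csv'):
    pass  # feed the batch to the summarizer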
Example #18
def assemble_unsup_pipeline(word_vocab,
                            max_groups_per_batch=1,
                            reader_threads=5,
                            min_revs_per_group=None,
                            max_revs_per_group=10,
                            worker_num=1,
                            seed=None,
                            tok_func=None,
                            lowercase=True,
                            max_len=None,
                            shuffler_buffer_size=250):
    """Creates a data-pipeline that yields batches for to train the unsup. model.

    Creates a flow of data transformation steps that modify the data until the
    final form is reached in terms of PyTorch tensors.

    Args:
        word_vocab: vocabulary object with words/tokens.
        max_groups_per_batch: number of groups each batch should have.
        min_revs_per_group: number of reviews a group should have in order not
            to be discarded.
        max_revs_per_group: self-explanatory.
        seed: used to use the same data subsamples/shuffles every epoch.
        max_len: if passed will filter out all reviews that a longer than the
            threshold.
    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """
    assert START in word_vocab and END in word_vocab

    file_shuffler = FileShuffler()

    # TODO: explain how grouping works here - each file has reviews of a group

    reader = CsvReader(sep='\t',
                       engine='c',
                       chunk_size=None,
                       encoding='utf-8',
                       quoting=QUOTE_NONE,
                       buffer_size=200,
                       timeout=None,
                       worker_threads_num=reader_threads,
                       use_lists=True)

    fname_renamer = FieldRenamer({
        InpDataF.REV_TEXT: ModelF.REV,
        InpDataF.GROUP_ID: ModelF.REV_GROUP_ID,
        InpDataF.RATING: ModelF.REV_RATING,
        InpDataF.RATING_DEV: ModelF.RATING_PROP,
        InpDataF.CAT: ModelF.REV_CAT
    })

    unit_sampler = UnitSampler(id_fname=ModelF.REV_GROUP_ID,
                               sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)

    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.REV_GROUP_ID)

    # property and related steps
    len_prop = LenProp(len_fname=ModelF.REV_LEN, new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.REV, new_fname=ModelF.POV_PROP)

    rouge_field_merger = FieldMerger(
        merge_fnames=[InpDataF.ROUGE1, InpDataF.ROUGE2, InpDataF.ROUGEL],
        new_fname=ModelF.ROUGE_PROP)

    # to avoid having same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(
        ChunkShuffler(buffer_size=shuffler_buffer_size))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch, strict=True)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV,
                               tok_func=tok_func,
                               lowercase=lowercase)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].token,
                             end_el=word_vocab[END].token)

    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)

    padder = Padder(fname=ModelF.REV,
                    new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id,
                    padding_mode='right')

    summ_rev_indxs_creator = GroupRevIndxsCreator(
        rev_group_id_fname=ModelF.REV_GROUP_ID, rev_cat_fname=ModelF.REV_CAT)

    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    # extra steps for the loss associated with probability mass
    un_word_cal = UniqueWordCalc(
        new_fname=ModelF.OTHER_REV_UWORDS,
        rev_fname=ModelF.REV,
        other_rev_indxs_fname=ModelF.OTHER_REV_INDXS,
        other_rev_indxs_mask_fname=ModelF.OTHER_REV_INDXS_MASK)
    un_word_padder = Padder(fname=ModelF.OTHER_REV_UWORDS,
                            new_mask_fname=ModelF.OTHER_REV_UWORDS_MASK,
                            pad_symbol=word_vocab[PAD].id,
                            padding_mode='right')

    numpy_formatter = NumpyFormatter(fnames=[
        ModelF.ROUGE_PROP, ModelF.RATING_PROP, ModelF.LEN_PROP, ModelF.POV_PROP
    ])

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=file_shuffler,
                               worker_processes_num=worker_num,
                               seed=seed,
                               output_buffer_size=50,
                               error_on_invalid_chunk=False,
                               timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(rouge_field_merger)
    pipeline.add_step(tokenizer)

    if max_len:
        pipeline.add_step(TextLenFilter(fname=ModelF.REV, max_len=max_len))

    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)

    pipeline.add_step(chunk_shuffler)

    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)

    # properties
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(chunk_accum)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    # adding steps for word count computation
    pipeline.add_step(un_word_cal)
    pipeline.add_step(un_word_padder)

    pipeline.add_step(numpy_formatter)

    return pipeline
Example #19
    def test_how_to_apply_run(self):

        data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

        # paths where vocabs will be saved and later loaded from
        words_vocab_file_path = os.path.join(self.tutorials_path,
                                             "data/vocabs/words.txt")
        labels_vocab_file_path = os.path.join(self.tutorials_path,
                                              'data/vocabs/labels.txt')

        # creating step objects
        twitter_tokenizer = TweetTokenizer()
        preprocessor = TwitterFilesPreprocessor(
            input_cols_number=3,
            tweets_indx=2,
            add_header=['ids', 'labels', 'tweets'])
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(
            fnames="tweets",
            tok_func=twitter_tokenizer.tokenize,
            tok_cleaning_func=twitter_text_cleaner,
            lowercase=True)

        # data pipeline for vocabularies creation
        vocab_data_pipeline = Pipeline(reader=csv_reader,
                                       preprocessor=preprocessor,
                                       worker_processes_num=0,
                                       name_prefix="vocabs")
        vocab_data_pipeline.add_step(fields_selector)
        vocab_data_pipeline.add_step(token_processor)

        # creating or loading vocabs
        words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
        words_vocab.load_or_create(words_vocab_file_path,
                                   data_source={"data_path": data_path},
                                   data_fnames="tweets")

        labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
        labels_vocab.load_or_create(labels_vocab_file_path,
                                    data_source={"data_path": data_path},
                                    data_fnames="labels")

        print(words_vocab)

        print(labels_vocab)

        print(vocab_data_pipeline)

        # extra steps for training and evaluation
        mapper = VocabMapper(field_names_to_vocabs={
            "tweets": words_vocab,
            "labels": labels_vocab
        })
        padder = Padder(fname="tweets",
                        new_mask_fname="tweets_mask",
                        pad_symbol=words_vocab[PAD].id)
        formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                            labels_field_name="labels",
                                            classes_number=len(labels_vocab))

        # building the actual pipeline
        dev_data_pipeline = Pipeline(reader=csv_reader,
                                     preprocessor=preprocessor,
                                     worker_processes_num=1,
                                     name_prefix="dev")
        dev_data_pipeline.add_step(fields_selector)
        dev_data_pipeline.add_step(token_processor)
        dev_data_pipeline.add_step(mapper)
        dev_data_pipeline.add_step(padder)
        dev_data_pipeline.add_step(formatter)

        print(dev_data_pipeline)

        epochs = 2

        i_model = ISentiLSTM(dev_data_pipeline)
        i_model.init_model(words_vocab_size=len(words_vocab),
                           input_dim=50,
                           lstm_hidden_dim=120,
                           number_of_classes=len(labels_vocab),
                           mask_symbol=words_vocab[PAD].id)
Example #20
def assemble_postproc_pipeline(text_prep_func,
                               min_revs_per_group=None,
                               seed=None,
                               max_revs_per_group=10,
                               rouge_kwargs=None,
                               workers=1):
    """Creates a data-pipeline that yields batches with computed ROUGE score.

    Args:
        min_revs_per_group: number of reviews a group should have in order not
            to be discarded.
        max_revs_per_group: self-explanatory.
        seed: used to use the same data subsamples/shuffles every epoch.
    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """

    reader = CsvReader(sep='\t',
                       engine='c',
                       chunk_size=None,
                       encoding='utf-8',
                       quoting=QUOTE_NONE,
                       use_lists=True,
                       timeout=None,
                       worker_threads_num=1)

    converter = TypeConverter(fname=InpDataF.RATING,
                              dtype_func=float,
                              remove_invalid=True)

    unit_sampler = UnitSampler(id_fname=InpDataF.GROUP_ID,
                               sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)

    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=InpDataF.GROUP_ID)

    field_dupl = FieldDuplicator({InpDataF.REV_TEXT: "dummy"})

    # the field below is needed as I'm detokenizing (with text_prep_func)
    # before computing ROUGE.
    func_appl = FunctionApplier({'dummy': text_prep_func})

    # props
    rouge_prop = RougeProp(rev_fname='dummy', rouge_kwargs=rouge_kwargs)
    rating_prop = RatingProp(rating_fname=InpDataF.RATING,
                             new_fname=InpDataF.RATING_DEV)

    field_dropper = FieldDropper('dummy')

    pipeline = Pipeline(reader=reader,
                        worker_processes_num=workers,
                        seed=seed,
                        output_buffer_size=40,
                        error_on_invalid_chunk=True,
                        timeout=None)
    pipeline.add_step(converter)
    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(field_dupl)
    pipeline.add_step(func_appl)
    # props
    pipeline.add_step(rouge_prop)
    pipeline.add_step(field_dropper)
    pipeline.add_step(rating_prop)

    return pipeline
    def _iter(self, data_path):
        # delegate reading to CsvReader and yield the underlying data objects
        for dc in CsvReader._iter(self, data_path=data_path):
            yield dc.data
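
A minimal sketch of how assemble_postproc_pipeline above might be driven; the detokenization function and data path are hypothetical placeholders:

# Hypothetical usage: augment tokenized review files with ROUGE and rating
# deviation fields.
def simple_detok(text):
    # naive detokenization for the sake of the sketch: just collapse spaces
    return ' '.join(text.split())

postproc_pipeline = assemble_postproc_pipeline(text_prep_func=simple_detok,
                                               max_revs_per_group=10,
                                               workers=2)
for dc in postproc_pipeline.iter(data_path='data/reviews/train'):
    pass  # e.g. write the augmented chunks back to disk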