def test_multi_threaded_reader_output(self):
    """
    Check if multi-threaded and single-threaded readers produce the
    correct output.
    """
    data_paths = [
        'mldp/tests/data/small_chunks/chunk1.csv',
        'mldp/tests/data/small_chunks/chunk2.csv',
        'mldp/tests/data/small_chunks/chunk3.csv'
    ]
    chunk_size = 2

    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=3, sep=',')
    expected_data = read_data_from_csv_file(data_paths)

    actual_data_chunks = DataChunk()
    for data_chunk in reader.iter(data_path=data_paths):
        for key in data_chunk.keys():
            if key not in actual_data_chunks:
                actual_data_chunks[key] = np.array([])
            actual_data_chunks[key] = np.concatenate(
                [actual_data_chunks[key], data_chunk[key]])

    self.compare_unsorted_data_chunks(dc1=expected_data,
                                      dc2=actual_data_chunks,
                                      sort_key='id')
def test_valid_paths(self):
    """
    Passes intentionally wrong input to the reader and expects it to raise
    an error.
    """
    data_paths = ["a", "b", 123123123, 123.12313]
    reader = CsvReader()
    with self.assertRaises(ValueError):
        itr = reader.iter(data_path=data_paths)
        next(iter(itr))
def test_invalid_steps(self):
    """Tests whether an error is raised if an invalid step is present."""
    data_path = 'mldp/tests/data/news.csv'
    data_source = {'data_path': data_path}

    inv_reader = InvalidCsvReader()
    val_reader = CsvReader()

    val_transf1 = FieldSelector("text")
    val_transf2 = TokenProcessor(fnames='text')
    inv_transf1 = InvalidTransformer()
    accum = ChunkAccumulator(new_size=3)
    formatter = PandasFormatter()

    # try only the invalid reader with valid steps
    dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
    for vs in [val_transf1, val_transf2, accum, formatter]:
        dp.add_step(vs)
    with self.assertRaises(DataChunkError):
        for _ in dp.iter(**data_source):
            pass

    # try the valid reader with invalid steps
    steps = [val_transf1, val_transf2, inv_transf1, accum]
    for st in permutations(steps):
        dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
        for s in st:
            dp.add_step(s)
        dp.add_step(formatter)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass
def test_readme_example(self):
    from mltoolkit.mldp.pipeline import Pipeline
    from mltoolkit.mldp.steps.readers import CsvReader
    from mltoolkit.mldp.steps.transformers.nlp import TokenProcessor, Padder
    from mltoolkit.mldp.steps.transformers.field import FieldSelector

    data_path = "mltoolkit/mldp/tests/data/tweets.csv"

    # creating steps
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(fnames="tweets",
                                     tok_func=lambda x: x.split(),
                                     lowercase=True)
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol="<PAD>")

    # creating the pipeline
    pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
    pipeline.add_step(fields_selector)
    pipeline.add_step(token_processor)
    pipeline.add_step(padder)

    # iterate over data chunks
    for data_chunk in pipeline.iter(data_path=data_path):
        pass

    # generate documentation and print it
    print(pipeline)
def test_simple_scenario(self):
    """
    Tries to run the pipeline; if iteration completes and yields non-empty
    chunks, the test is considered successful. Tries different numbers of
    worker processes.
    """
    data_path = 'mldp/tests/data/small_chunks'
    field_names = ['first_name', 'email']
    worker_processes_nums = [0, 1, 2, 3, 4]

    reader = CsvReader(sep=",")

    for wpn in worker_processes_nums:
        dev_data_pipeline = Pipeline(reader=reader, worker_processes_num=wpn)
        dev_data_pipeline.add_step(FieldSelector(field_names))
        dev_data_pipeline.add_step(ChunkAccumulator(new_size=3))
        dev_data_pipeline.add_step(PandasFormatter())

        flag = False
        for data_chunk in dev_data_pipeline.iter(data_path=data_path):
            flag = True
            self.assertTrue(len(data_chunk) > 0)
        self.assertTrue(flag)
def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses truecasing model on tokenized CSV files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE, sep='\t', engine='python',
                       encoding='utf-8')
    texts = []

    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")

    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
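# A minimal usage sketch (not part of the original code): trains and saves a
# truecasing model with the helper above. The file paths and the
# 'review_text' column name are hypothetical placeholders for tokenized CSV
# files.
def _example_train_truecaser():
    train_and_save_true_casing_model(
        input_fps=['data/train_part1.csv', 'data/train_part2.csv'],
        text_fname='review_text',
        output_fp='artifacts/truecaser.model')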
def test_output(self):
    """Checks whether read data-chunks are valid."""
    data_path = 'mltoolkit/mldp/tests/data/small_chunks/chunk2.csv'
    chunk_size = 2

    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1, sep=',',
                       encoding='utf-8', use_lists=False)
    data = read_data_from_csv_file(data_path, encoding='utf-8')
    expected_chunks = create_list_of_data_chunks(data, chunk_size=chunk_size)

    itr = reader.iter(data_path=data_path)

    i = 0
    for actual_chunk, expected_chunk in zip(itr, expected_chunks):
        self.assertTrue(actual_chunk == expected_chunk)
        i += 1
    self.assertTrue(i == len(expected_chunks) and len(expected_chunks) > 0)
def assemble_vocab_pipeline(text_fname, sep='\t', encoding='utf-8',
                            tok_func=None, lowercase=True):
    """Assembles the vocabulary pipeline based on a CSV reader."""
    reader = CsvReader(sep=sep, encoding=encoding, quoting=QUOTE_NONE)
    token_processor = TokenProcessor(fnames=text_fname,
                                     lowercase=lowercase,
                                     tok_func=tok_func)

    # creating the vocabulary pipeline
    vocab_pipeline = Pipeline(reader=reader)
    vocab_pipeline.add_step(token_processor)

    return vocab_pipeline
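# A minimal usage sketch (not part of the original code): builds a word
# vocabulary with the pipeline assembled above. The Vocabulary class and its
# load_or_create API are used as in test_how_to_apply_run further below; the
# vocab path, CSV path, and the 'text' column name are hypothetical
# placeholders.
def _example_build_word_vocab():
    vocab_pipeline = assemble_vocab_pipeline(text_fname='text', sep='\t')
    word_vocab = Vocabulary(vocab_pipeline, name_prefix="words")
    word_vocab.load_or_create('data/vocabs/words.txt',
                              data_source={"data_path": 'data/train.csv'},
                              data_fnames='text')
    return word_vocab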
def test_invalid_pipeline(self):
    """
    Tries to create an invalid data processing pipeline and expects to get
    an error.
    """
    reader = CsvReader()
    with self.assertRaises(ValueError):
        data_pipeline = Pipeline(reader)
        data_pipeline.add_step(FieldSelector(["dummy"]))
        data_pipeline.add_step(PandasFormatter())
        data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
def test_empty_chunks(self):
    """Tests that empty chunks do not reach the user."""
    data_path = 'mltoolkit/mldp/tests/data/small_chunks'
    field_names = ['first_name', 'email']

    reader = CsvReader(chunk_size=1, sep=",")
    empty_chunk_transformer = EmptyChunkTransformer(max_count=3)

    dev_data_pipeline = Pipeline(reader=reader)
    dev_data_pipeline.add_step(empty_chunk_transformer)
    dev_data_pipeline.add_step(FieldSelector(field_names))

    flag = False
    for dc in dev_data_pipeline.iter(data_path=data_path):
        flag = True
        self.assertFalse(equal_to_constant(dc, EMPTY_CHUNK))
    self.assertTrue(flag)
def test_vocabulary_mapper(self):
    """Tests whether the mapper allows mapping field values back and forth."""
    data_path = 'mldp/tests/data/mock_data.csv'
    target_fields = ["first_name", "last_name", "email", "gender"]

    reader = CsvReader(sep=',')
    vocab = Vocabulary(reader)

    for target_field in target_fields:
        vocab.create(data_source={"data_path": data_path},
                     data_fnames=target_field)

        data = read_data_from_csv_file(data_path)
        data_original = copy.deepcopy(data)

        mapper_to = VocabMapper({target_field: vocab}, "id")
        mapper_back = VocabMapper({target_field: vocab}, "token")

        data = mapper_to(data)
        data = mapper_back(data)

        self.assertTrue(
            (data[target_field] == data_original[target_field]).all())
def assemble_train_pipeline(word_vocab, max_groups_per_batch=1,
                            min_revs_per_group=None, max_revs_per_group=10,
                            seed=None, workers=1):
    """
    This pipeline is specific to the preprocessed Amazon and Yelp reviews.

    Creates a flow of transformation steps that modify the data until the
    final form is reached in terms of PyTorch tensors.

    :param word_vocab: vocabulary object with words/tokens.
    :param max_groups_per_batch: number of groups each batch should have.
    :param min_revs_per_group: number of reviews a group should have in order
        not to be discarded.
    :param max_revs_per_group: self-explanatory.
    :param seed: controls the reproducibility of subsamples/shuffles across
        epochs; relevant when multi-processing is used in combination with
        numpy.
    :param workers: number of worker processes used by the pipeline.
    """
    assert START in word_vocab
    assert END in word_vocab

    group_files_shuffler = GroupFileShuffler()

    reader = CsvReader(sep='\t', engine='python', chunk_size=None,
                       encoding='utf-8', quoting=QUOTE_NONE, timeout=None,
                       worker_threads_num=1)

    fname_renamer = FieldRenamer({InpDataF.REV_TEXT: ModelF.REV,
                                  InpDataF.GROUP_ID: ModelF.GROUP_ID})

    unit_sampler = UnitSampler(id_fname=ModelF.GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.GROUP_ID)

    # to avoid having the same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(ChunkShuffler(buffer_size=500))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})
    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    sorter = ChunkSorter(ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    summ_rev_indxs_creator = SummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID, category_fname=ModelF.CAT)
    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    pipeline = PyTorchPipeline(reader=reader,
                               preprocessor=group_files_shuffler,
                               worker_processes_num=workers, seed=seed,
                               error_on_invalid_chunk=False, timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)
    pipeline.add_step(chunk_accum)

    # entry transformations
    pipeline.add_step(tokenizer)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(sorter)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    return pipeline
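# A minimal usage sketch (not part of the original code): iterates over the
# batches produced by the assembled training pipeline. The data path is a
# hypothetical placeholder, and `word_vocab` is assumed to be built
# beforehand, e.g. with the vocabulary pipeline shown above.
def _example_iterate_train_batches(word_vocab):
    train_pipeline = assemble_train_pipeline(word_vocab,
                                             max_groups_per_batch=8,
                                             max_revs_per_group=10,
                                             workers=2)
    for batch in train_pipeline.iter(data_path='data/train/'):
        # each batch is a data-chunk whose fields (e.g. ModelF.REV) have been
        # converted to PyTorch tensors by PyTorchPipeline
        pass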
def assemble_tuning_pipeline(word_vocab, max_groups_per_batch=1, tok_func=None,
                             lowercase=False):
    """
    The pipeline yields tokenized reviews and summaries that can be used for
    training (fine-tuning) of the model.
    """
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       chunk_size=None, use_lists=True, quoting=QUOTE_NONE)

    chunk_accum = ChunkAccumulator(new_size=max_groups_per_batch)

    ama_spec_trans = AmazonTransformer(fnames_to_copy=[GoldDataF.PROD_ID,
                                                       GoldDataF.CAT])

    summ_mapper = SummMapper(fname=ModelF.SUMMS,
                             new_indx_fname=ModelF.SUMM_GROUP_INDX)

    token_processor = TokenProcessor(fnames=[ModelF.REV, ModelF.SUMM],
                                     tok_func=tok_func, lowercase=lowercase)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab,
                                ModelF.SUMM: word_vocab})

    fname_renamer = FieldRenamer({GoldDataF.PROD_ID: ModelF.GROUP_ID,
                                  GoldDataF.CAT: ModelF.CAT,
                                  ModelF.SUMMS: ModelF.SUMM})

    seq_wrapper = SeqWrapper(fname=[ModelF.REV, ModelF.SUMM],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)

    padder = Padder(fname=[ModelF.REV, ModelF.SUMM],
                    new_mask_fname=[ModelF.REV_MASK, ModelF.SUMM_MASK],
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()

    # rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
    #                        group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
    #                        rev_mask_fname=ModelF.REV_MASK)

    # props
    len_prop = SummLenProp(summ_fname=ModelF.SUMM, rev_fname=ModelF.REV,
                           group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                           new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.SUMM, new_fname=ModelF.POV_PROP)
    rouge_prop = SummRougeProp(summ_fname=ModelF.SUMM, rev_fname=ModelF.REV,
                               group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                               summ_group_indx_fname=ModelF.SUMM_GROUP_INDX,
                               new_fname=ModelF.ROUGE_PROP)
    rating_prop = DummyProp(fname=ModelF.SUMM, new_fname=ModelF.RATING_PROP,
                            fval=0.)

    np_formatter = NumpyFormatter([ModelF.LEN_PROP, ModelF.RATING_PROP,
                                   ModelF.POV_PROP, ModelF.ROUGE_PROP])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    # pipeline.add_step(shuffler)
    pipeline.add_step(chunk_accum)
    pipeline.add_step(ama_spec_trans)
    pipeline.add_step(summ_mapper)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(indxs_creator)

    # props
    pipeline.add_step(rating_prop)
    pipeline.add_step(rouge_prop)

    pipeline.add_step(token_processor)

    # the props below require tokenization
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)
    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab, max_groups_per_chunk=1, tok_func=None,
                           lowercase=False):
    """Assembles a data-pipeline for evaluation against gold summaries."""
    assert START in word_vocab and END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       chunk_size=max_groups_per_chunk, use_lists=True,
                       quoting=QUOTE_NONE)

    rouge_prop = SummEvalRougeKnob(hyp_fnames=[GoldDataF.SUMM1,
                                               GoldDataF.SUMM2,
                                               GoldDataF.SUMM3],
                                   ref_fnames=GoldDataF.REVS,
                                   new_fname=ModelF.ROUGE_PROP)

    field_dupl = FieldDuplicator({GoldDataF.SUMM1: TOK_SUMM1,
                                  GoldDataF.SUMM2: TOK_SUMM2,
                                  GoldDataF.SUMM3: TOK_SUMM3})
    tokenizer = TokenProcessor(fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3]
                                      + GoldDataF.REVS,
                               tok_func=tok_func, lowercase=lowercase)
    field_dropper = FieldDropper([TOK_SUMM1, TOK_SUMM2, TOK_SUMM3])

    rating_prop = DummyProp(fname=GoldDataF.PROD_ID,
                            new_fname=ModelF.RATING_PROP, fval=0.)
    len_prop = SummEvalLenProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               rev_fnames=GoldDataF.REVS,
                               new_fname=ModelF.LEN_PROP)
    pov_prop = SummEvalPOVProp(summ_fnames=[TOK_SUMM1, TOK_SUMM2, TOK_SUMM3],
                               new_fname=ModelF.POV_PROP)

    # summaries are not converted to token ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    dataset_spec_trans = AmazonTransformer([GoldDataF.PROD_ID, GoldDataF.CAT,
                                            ModelF.ROUGE_PROP,
                                            ModelF.LEN_PROP,
                                            ModelF.RATING_PROP,
                                            ModelF.POV_PROP])
    fname_renamer = FieldRenamer({GoldDataF.PROD_ID: ModelF.GROUP_ID,
                                  GoldDataF.CAT: ModelF.CAT})

    seq_wrapper = SeqWrapper(fname=[ModelF.REV],
                             start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    padder = Padder(fname=[ModelF.REV], new_mask_fname=[ModelF.REV_MASK],
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    indxs_creator = GoldSummRevIndxsCreator()
    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    np_formatter = NumpyFormatter([ModelF.ROUGE_PROP, ModelF.LEN_PROP,
                                   ModelF.RATING_PROP, ModelF.POV_PROP])

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rouge_prop)
    pipeline.add_step(rating_prop)

    # props that require tokenization
    pipeline.add_step(field_dupl)
    pipeline.add_step(tokenizer)
    pipeline.add_step(pov_prop)
    pipeline.add_step(len_prop)
    pipeline.add_step(field_dropper)

    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(padder)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)
    pipeline.add_step(np_formatter)

    return pipeline
def assemble_eval_pipeline(word_vocab, max_groups_per_chunk=1, dataset='yelp',
                           tokenization_func=lambda x: x.split()):
    """Assembles the pipeline for evaluation on the Yelp and Amazon eval sets."""
    assert dataset in ['yelp', 'amazon']

    if dataset == 'yelp':
        fields_obj = YelpEvalF
        fname_renamer = FieldRenamer({fields_obj.BUSINESS_ID: ModelF.GROUP_ID})
        dataset_spec_trans = YelpTransformer()
    else:
        fields_obj = AmazonEvalF
        fname_renamer = FieldRenamer({fields_obj.PROD_ID: ModelF.GROUP_ID,
                                      fields_obj.CAT: ModelF.CAT})
        dataset_spec_trans = AmazonTransformer()

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       quotechar='\'', chunk_size=max_groups_per_chunk)

    # notice that I do not tokenize summaries, I leave them as they are!
    token_processor = TokenProcessor(fnames=fields_obj.REVS,
                                     tokenization_func=tokenization_func)

    # notice that I don't convert summs tokens to ids
    vocab_mapper = VocabMapper({fn: word_vocab for fn in fields_obj.REVS})

    seq_wrapper = SeqWrapper(ModelF.REV, start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')
    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.REV_MASK,
                                         ModelF.CAT, ModelF.GROUP_ID])

    indxs_creator = GoldSummRevIndxsCreator(group_id_fname=ModelF.GROUP_ID)
    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    formatter = PyTorchFormatter()

    pipeline = Pipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(dataset_spec_trans)
    pipeline.add_step(fname_renamer)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(indxs_creator)
    pipeline.add_step(rev_mapper)
    pipeline.add_step(formatter)

    return pipeline
def setUp(self):
    self.reader = CsvReader(sep=',')
def assemble_infer_pipeline(word_vocab, max_groups_per_chunk=1, max_reviews=10,
                            tokenization_func=lambda x: x.split()):
    """Assembles a simple inference pipeline for summary generation.

    Assumes that tab-separated CSV files are read in which reviews have the
    column names 'rev1', 'rev2', ..., 'revN'.

    Args:
        word_vocab: word vocabulary to convert words to ids.
        max_groups_per_chunk: self-explanatory.
        max_reviews: the maximum number of reviews to load per group. Columns
            in the CSV file should be `rev1`, ..., `revN`.
        tokenization_func: self-explanatory.
    """
    rev_fnames = [f'{InfDataF.REV_PREFIX}{i}'
                  for i in range(1, max_reviews + 1)]

    assert START in word_vocab
    assert END in word_vocab

    reader = CsvReader(sep='\t', encoding='utf-8', engine='python',
                       quotechar='\'', chunk_size=max_groups_per_chunk)
    rev_flattener = ReviewFlattener(group_id_fname=InfDataF.GROUP_ID,
                                    rev_fnames=rev_fnames)
    token_processor = TokenProcessor(fnames=ModelF.REV,
                                     tokenization_func=tokenization_func)

    # notice that I don't convert summs tokens to ids
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})

    seq_wrapper = SeqWrapper(ModelF.REV, start_el=word_vocab[START].id,
                             end_el=word_vocab[END].id)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')
    sorter = ChunkSorter(field_name=ModelF.REV_LEN,
                         fields_to_sort=[ModelF.REV, ModelF.GROUP_ID])

    # re-using the step
    summ_rev_indx_creator = GoldSummRevIndxsCreator(
        group_id_fname=ModelF.GROUP_ID)

    pipeline = PyTorchPipeline(reader=reader, error_on_invalid_chunk=False)

    pipeline.add_step(rev_flattener)
    pipeline.add_step(token_processor)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)
    pipeline.add_step(padder)
    pipeline.add_step(sorter)
    pipeline.add_step(summ_rev_indx_creator)

    return pipeline
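# A minimal usage sketch (not part of the original code). It assumes a
# tab-separated CSV with a group id column plus `rev1` ... `revN` review
# columns; the file path is a hypothetical placeholder.
def _example_run_inference_pipeline(word_vocab):
    infer_pipeline = assemble_infer_pipeline(word_vocab,
                                             max_groups_per_chunk=4,
                                             max_reviews=10)
    for chunk in infer_pipeline.iter(data_path='data/infer_input.csv'):
        # each chunk holds padded, id-mapped reviews plus the group-to-review
        # index fields created by GoldSummRevIndxsCreator
        pass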
def assemble_unsup_pipeline(word_vocab, max_groups_per_batch=1,
                            reader_threads=5, min_revs_per_group=None,
                            max_revs_per_group=10, worker_num=1, seed=None,
                            tok_func=None, lowercase=True, max_len=None,
                            shuffler_buffer_size=250):
    """Creates a data-pipeline that yields batches for training the
    unsupervised model.

    Creates a flow of data transformation steps that modify the data until the
    final form is reached in terms of PyTorch tensors.

    Args:
        word_vocab: vocabulary object with words/tokens.
        max_groups_per_batch: number of groups each batch should have.
        min_revs_per_group: number of reviews a group should have in order
            not to be discarded.
        max_revs_per_group: self-explanatory.
        seed: used to get the same data subsamples/shuffles every epoch.
        max_len: if passed, filters out all reviews that are longer than the
            threshold.

    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """
    assert START in word_vocab and END in word_vocab

    file_shuffler = FileShuffler()

    # TODO: explain how grouping works here - each file has reviews of a group
    reader = CsvReader(sep='\t', engine='c', chunk_size=None, encoding='utf-8',
                       quoting=QUOTE_NONE, buffer_size=200, timeout=None,
                       worker_threads_num=reader_threads, use_lists=True)

    fname_renamer = FieldRenamer({InpDataF.REV_TEXT: ModelF.REV,
                                  InpDataF.GROUP_ID: ModelF.REV_GROUP_ID,
                                  InpDataF.RATING: ModelF.REV_RATING,
                                  InpDataF.RATING_DEV: ModelF.RATING_PROP,
                                  InpDataF.CAT: ModelF.REV_CAT})

    unit_sampler = UnitSampler(id_fname=ModelF.REV_GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=ModelF.REV_GROUP_ID)

    # property and related steps
    len_prop = LenProp(len_fname=ModelF.REV_LEN, new_fname=ModelF.LEN_PROP)
    pov_prop = POVProp(text_fname=ModelF.REV, new_fname=ModelF.POV_PROP)
    rouge_field_merger = FieldMerger(merge_fnames=[InpDataF.ROUGE1,
                                                   InpDataF.ROUGE2,
                                                   InpDataF.ROUGEL],
                                     new_fname=ModelF.ROUGE_PROP)

    # to avoid having the same product/business appearing in the same merged
    # data-chunk, buffer a small number of them, shuffle, and release
    chunk_shuffler = ChunkAccumulator(
        ChunkShuffler(buffer_size=shuffler_buffer_size))

    # accumulates a fixed number of group chunks, merges them
    # together, and passes along the pipeline
    chunk_coll = ChunkCollector(buffer_size=max_groups_per_batch, strict=True)
    chunk_accum = ChunkAccumulator(chunk_coll)

    # alteration of data entries
    tokenizer = TokenProcessor(fnames=ModelF.REV, tok_func=tok_func,
                               lowercase=lowercase)
    vocab_mapper = VocabMapper({ModelF.REV: word_vocab})
    seq_wrapper = SeqWrapper(fname=ModelF.REV,
                             start_el=word_vocab[START].token,
                             end_el=word_vocab[END].token)
    seq_len_computer = SeqLenComputer(ModelF.REV, ModelF.REV_LEN)
    padder = Padder(fname=ModelF.REV, new_mask_fname=ModelF.REV_MASK,
                    pad_symbol=word_vocab[PAD].id, padding_mode='right')

    summ_rev_indxs_creator = GroupRevIndxsCreator(
        rev_group_id_fname=ModelF.REV_GROUP_ID, rev_cat_fname=ModelF.REV_CAT)
    rev_mapper = RevMapper(group_rev_indxs_fname=ModelF.GROUP_REV_INDXS,
                           group_rev_mask_fname=ModelF.GROUP_REV_INDXS_MASK,
                           rev_mask_fname=ModelF.REV_MASK)

    # extra steps for the loss associated with probability mass
    un_word_cal = UniqueWordCalc(
        new_fname=ModelF.OTHER_REV_UWORDS, rev_fname=ModelF.REV,
        other_rev_indxs_fname=ModelF.OTHER_REV_INDXS,
        other_rev_indxs_mask_fname=ModelF.OTHER_REV_INDXS_MASK)
    un_word_padder = Padder(fname=ModelF.OTHER_REV_UWORDS,
                            new_mask_fname=ModelF.OTHER_REV_UWORDS_MASK,
                            pad_symbol=word_vocab[PAD].id,
                            padding_mode='right')

    numpy_formatter = NumpyFormatter(fnames=[ModelF.ROUGE_PROP,
                                             ModelF.RATING_PROP,
                                             ModelF.LEN_PROP,
                                             ModelF.POV_PROP])

    pipeline = PyTorchPipeline(reader=reader, preprocessor=file_shuffler,
                               worker_processes_num=worker_num, seed=seed,
                               output_buffer_size=50,
                               error_on_invalid_chunk=False, timeout=None)

    pipeline.add_step(fname_renamer)
    pipeline.add_step(rouge_field_merger)
    pipeline.add_step(tokenizer)
    if max_len:
        pipeline.add_step(TextLenFilter(fname=ModelF.REV, max_len=max_len))
    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(chunk_shuffler)
    pipeline.add_step(seq_wrapper)
    pipeline.add_step(seq_len_computer)

    # properties
    pipeline.add_step(len_prop)
    pipeline.add_step(pov_prop)

    pipeline.add_step(chunk_accum)
    pipeline.add_step(vocab_mapper)
    pipeline.add_step(padder)

    # adding additional fields for attention and summarization
    pipeline.add_step(summ_rev_indxs_creator)
    pipeline.add_step(rev_mapper)

    # adding steps for word count computation
    pipeline.add_step(un_word_cal)
    pipeline.add_step(un_word_padder)

    pipeline.add_step(numpy_formatter)

    return pipeline
def test_how_to_apply_run(self):
    data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

    # paths where vocabs will be saved and later loaded from
    words_vocab_file_path = os.path.join(self.tutorials_path,
                                         "data/vocabs/words.txt")
    labels_vocab_file_path = os.path.join(self.tutorials_path,
                                          'data/vocabs/labels.txt')

    # creating step objects
    twitter_tokenizer = TweetTokenizer()
    preprocessor = TwitterFilesPreprocessor(input_cols_number=3,
                                            tweets_indx=2,
                                            add_header=['ids', 'labels',
                                                        'tweets'])
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(fnames="tweets",
                                     tok_func=twitter_tokenizer.tokenize,
                                     tok_cleaning_func=twitter_text_cleaner,
                                     lowercase=True)

    # data pipeline for vocabularies creation
    vocab_data_pipeline = Pipeline(reader=csv_reader,
                                   preprocessor=preprocessor,
                                   worker_processes_num=0,
                                   name_prefix="vocabs")
    vocab_data_pipeline.add_step(fields_selector)
    vocab_data_pipeline.add_step(token_processor)

    # creating or loading vocabs
    words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
    words_vocab.load_or_create(words_vocab_file_path,
                               data_source={"data_path": data_path},
                               data_fnames="tweets")

    labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
    labels_vocab.load_or_create(labels_vocab_file_path,
                                data_source={"data_path": data_path},
                                data_fnames="labels")

    print(words_vocab)
    print(labels_vocab)
    print(vocab_data_pipeline)

    # extra steps for training and evaluation
    mapper = VocabMapper(field_names_to_vocabs={"tweets": words_vocab,
                                                "labels": labels_vocab})
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol=words_vocab[PAD].id)
    formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                        labels_field_name="labels",
                                        classes_number=len(labels_vocab))

    # building the actual pipeline
    dev_data_pipeline = Pipeline(reader=csv_reader, preprocessor=preprocessor,
                                 worker_processes_num=1, name_prefix="dev")
    dev_data_pipeline.add_step(fields_selector)
    dev_data_pipeline.add_step(token_processor)
    dev_data_pipeline.add_step(mapper)
    dev_data_pipeline.add_step(padder)
    dev_data_pipeline.add_step(formatter)

    print(dev_data_pipeline)

    epochs = 2

    i_model = ISentiLSTM(dev_data_pipeline)
    i_model.init_model(words_vocab_size=len(words_vocab), input_dim=50,
                       lstm_hidden_dim=120,
                       number_of_classes=len(labels_vocab),
                       mask_symbol=words_vocab[PAD].id)
def assemble_postproc_pipeline(text_prep_func, min_revs_per_group=None,
                               seed=None, max_revs_per_group=10,
                               rouge_kwargs=None, workers=1):
    """Creates a data-pipeline that yields batches with computed ROUGE scores.

    Args:
        min_revs_per_group: number of reviews a group should have in order
            not to be discarded.
        max_revs_per_group: self-explanatory.
        seed: used to get the same data subsamples/shuffles every epoch.

    Returns:
        DataPipeline object that allows iteration over batches/chunks.
    """
    reader = CsvReader(sep='\t', engine='c', chunk_size=None, encoding='utf-8',
                       quoting=QUOTE_NONE, use_lists=True, timeout=None,
                       worker_threads_num=1)
    converter = TypeConverter(fname=InpDataF.RATING, dtype_func=float,
                              remove_invalid=True)

    unit_sampler = UnitSampler(id_fname=InpDataF.GROUP_ID, sample_all=True,
                               min_units=min_revs_per_group,
                               max_units=max_revs_per_group)
    unit_sampler_accum = ChunkAccumulator(unit_sampler)

    # since we're splitting one group into multiple chunks, it's convenient
    # to postfix each group_id name, such that it would be possible to
    # associate summaries with different subsets of reviews
    postfixer = Postfixer(id_fname=InpDataF.GROUP_ID)

    field_dupl = FieldDuplicator({InpDataF.REV_TEXT: "dummy"})
    # the field below is needed as I'm detokenizing (with text_prep_func)
    # before computing ROUGE
    func_appl = FunctionApplier({'dummy': text_prep_func})

    # props
    rouge_prop = RougeProp(rev_fname='dummy', rouge_kwargs=rouge_kwargs)
    rating_prop = RatingProp(rating_fname=InpDataF.RATING,
                             new_fname=InpDataF.RATING_DEV)

    field_dropper = FieldDropper('dummy')

    pipeline = Pipeline(reader=reader, worker_processes_num=workers, seed=seed,
                        output_buffer_size=40, error_on_invalid_chunk=True,
                        timeout=None)

    pipeline.add_step(converter)
    pipeline.add_step(unit_sampler_accum)
    pipeline.add_step(postfixer)
    pipeline.add_step(field_dupl)
    pipeline.add_step(func_appl)

    # props
    pipeline.add_step(rouge_prop)
    pipeline.add_step(field_dropper)
    pipeline.add_step(rating_prop)

    return pipeline
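# A minimal usage sketch (not part of the original code): runs the
# post-processing pipeline that attaches ROUGE and rating-deviation fields.
# The identity text_prep_func and the data directory are hypothetical
# placeholders; the real code would pass an actual detokenization function.
def _example_run_postproc(workers=4):
    postproc = assemble_postproc_pipeline(text_prep_func=lambda text: text,
                                          max_revs_per_group=10,
                                          workers=workers)
    for dc in postproc.iter(data_path='data/grouped_reviews/'):
        # each data-chunk now carries the ROUGE fields and InpDataF.RATING_DEV
        pass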
def _iter(self, data_path):
    # yields the raw underlying data of each chunk instead of the DataChunk
    # object produced by the parent CsvReader
    for dc in CsvReader._iter(self, data_path=data_path):
        yield dc.data