def test_csv_multi_threaded_reader_output(self):
    """Checks whether the multi-threaded reader produces the correct output
    (i.e. matches the data read directly from the csv files).
    """
    data_paths = ['data/small_chunks/chunk1.csv',
                  'data/small_chunks/chunk2.csv',
                  'data/small_chunks/chunk3.csv']
    chunk_size = 2

    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=3)
    expected_data = read_data_from_csv_file(data_paths)

    # accumulate all produced chunks into one chunk for comparison
    actual_data_chunks = DataChunk()
    for data_chunk in reader.iter(data_path=data_paths):
        for key in data_chunk.keys():
            if key not in actual_data_chunks:
                actual_data_chunks[key] = np.array([])
            actual_data_chunks[key] = np.concatenate([actual_data_chunks[key],
                                                      data_chunk[key]])

    self.compare_unsorted_data_chunks(dc1=expected_data,
                                      dc2=actual_data_chunks,
                                      sort_key='id')
def calculate_text_length(input_fp, column_names, use_moses=True, sep='\t'):
    """Computes and prints the min, mean, and std of text lengths (in tokens)
    found in the given columns of a csv or json file. Tokenizes with the
    Moses tokenizer if use_moses is True, otherwise splits on whitespace.
    """
    use_moses = bool(use_moses)
    tokenize = moses_tokenizer.tokenize if use_moses else lambda x: x.split()

    _, ext = os.path.splitext(input_fp)
    assert ext in ['.csv', '.json']
    reader = CsvReader(sep=sep, engine='python') if ext == '.csv' \
        else JsonReader()

    lens = []
    for chunk in reader.iter(data_path=input_fp):
        for cname in column_names:
            for text in chunk[cname]:
                # TODO: change it, this solution seems too hacky!
                if isinstance(text, list):
                    text = " ".join(text)
                tokens = tokenize(text)
                lens.append(len(tokens))

    print("min: %f" % np.min(lens))
    print("mean: %f" % np.mean(lens))
    print("std: %f" % np.std(lens))
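# A minimal usage sketch (the path and column name below are illustrative
# assumptions, not files shipped with the repo). With use_moses=False the
# function falls back to whitespace splitting, so the Moses tokenizer does
# not have to be installed:
#
#   calculate_text_length('data/tweets.csv', column_names=['tweets'],
#                         use_moses=False, sep='\t')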
def test_csv_reader_valid_paths(self):
    """Passes intentionally wrong input paths to the reader and expects it to
    throw an error.
    """
    data_paths = ["a", "b", 123123123, 123.12313]
    reader = CsvReader()
    itr = reader.iter(data_path=data_paths)
    with self.assertRaises(ValueError):
        next(iter(itr))
def test_invalid_steps(self):
    """Testing whether an error is raised if an invalid step is present."""
    data_path = 'mldp/tests/data/news.csv'
    data_source = {'data_path': data_path}

    inv_reader = InvalidCsvReader()
    val_reader = CsvReader()

    val_transf1 = FieldSelector("text")
    val_transf2 = TokenProcessor(fnames='text')
    inv_transf1 = InvalidTransformer()
    accum = ChunkAccumulator(new_size=3)
    formatter = PandasFormatter()

    # try only the invalid reader and valid steps
    dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
    for vs in [val_transf1, val_transf2, accum, formatter]:
        dp.add_step(vs)
    with self.assertRaises(DataChunkError):
        for _ in dp.iter(**data_source):
            pass

    # try a valid reader and invalid steps
    steps = [val_transf1, val_transf2, inv_transf1, accum]
    for st in permutations(steps):
        dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
        for s in st:
            dp.add_step(s)
        dp.add_step(formatter)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass
def test_readme_example(self):
    from mldp.pipeline import Pipeline
    from mldp.steps.readers import CsvReader
    from mldp.steps.transformers.nlp import TokenProcessor, Padder
    from mldp.steps.transformers.field import FieldSelector

    data_path = "mldp/tests/data/tweets.csv"

    # creating steps
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(fnames="tweets",
                                     tokenization_func=lambda x: x.split(),
                                     lower_case=True)
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol="<PAD>")

    # creating the pipeline
    pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
    pipeline.add_step(fields_selector)
    pipeline.add_step(token_processor)
    pipeline.add_step(padder)

    # iterate over data chunks
    for data_chunk in pipeline.iter(data_path=data_path):
        pass

    # generate documentation and print it
    print(pipeline)
def test_csv_reader_output(self):
    """Checking if read data-chunks are valid."""
    data_path = 'data/small_chunks/chunk2.csv'
    chunk_size = 2

    reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1)
    data = read_data_from_csv_file(data_path)
    expected_chunks = create_list_of_data_chunks(data, chunk_size=chunk_size)

    itr = reader.iter(data_path=data_path)

    i = 0
    for actual_chunk, expected_chunk in zip(itr, expected_chunks):
        self.assertTrue(actual_chunk == expected_chunk)
        i += 1
    self.assertTrue(i == len(expected_chunks))
def test_invalid_pipeline(self):
    """Tries to create an invalid data processing pipeline and expects an
    error to be raised.
    """
    reader = CsvReader()
    with self.assertRaises(ValueError):
        data_pipeline = Pipeline(reader)
        data_pipeline.add_step(FieldSelector(["dummy"]))
        data_pipeline.add_step(PandasFormatter())
        data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
def test_empty_chunks(self):
    """Testing that empty chunks do not reach the user."""
    data_path = 'mldp/tests/data/small_chunks'
    field_names = ['first_name', 'email']

    reader = CsvReader(chunk_size=1, sep=",")
    empty_chunk_transformer = EmptyChunkTransformer(max_count=3)

    dev_data_pipeline = Pipeline(reader=reader)
    dev_data_pipeline.add_step(empty_chunk_transformer)
    dev_data_pipeline.add_step(FieldSelector(field_names))

    flag = False
    for dc in dev_data_pipeline.iter(data_path=data_path):
        flag = True
        self.assertFalse(equal_to_constant(dc, EMPTY_CHUNK))
    self.assertTrue(flag)
def test_vocabulary_mapper(self):
    """Testing whether the mapper allows mapping field values back and forth
    (tokens to ids and back).
    """
    data_path = 'mldp/tests/data/mock_data.csv'
    target_fields = ["first_name", "last_name", "email", "gender"]

    reader = CsvReader(sep=',')
    vocab = Vocabulary(reader)

    for target_field in target_fields:
        vocab.create(data_source={"data_path": data_path},
                     data_fnames=target_field)

        data = read_data_from_csv_file(data_path)
        data_original = copy.deepcopy(data)

        mapper_to = VocabMapper({target_field: vocab}, "id")
        mapper_back = VocabMapper({target_field: vocab}, "token")

        data = mapper_to(data)
        data = mapper_back(data)

        self.assertTrue(
            (data[target_field] == data_original[target_field]).all())
def test_simple_scenario(self):
    """Tries to run the pipeline with different numbers of worker processes;
    if it runs without errors, the test is considered successful.
    """
    data_path = 'mldp/tests/data/small_chunks'
    field_names = ['first_name', 'email']
    worker_processes_nums = [0, 1, 2, 3, 4]

    reader = CsvReader(sep=",")

    for wpn in worker_processes_nums:
        dev_data_pipeline = Pipeline(reader=reader, worker_processes_num=wpn)
        dev_data_pipeline.add_step(FieldSelector(field_names))
        dev_data_pipeline.add_step(ChunkAccumulator(new_size=3))
        dev_data_pipeline.add_step(PandasFormatter())

        flag = False
        for data_chunk in dev_data_pipeline.iter(data_path=data_path):
            flag = True
            self.assertTrue(len(data_chunk) > 0)
        self.assertTrue(flag)
def setUp(self):
    self.reader = CsvReader(sep=',')
def setUp(self):
    self.reader = CsvReader()
def test_how_to_apply_run(self):
    data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

    # paths where vocabs will be saved and later loaded from
    words_vocab_file_path = os.path.join(self.tutorials_path,
                                         "data/vocabs/words.txt")
    labels_vocab_file_path = os.path.join(self.tutorials_path,
                                          'data/vocabs/labels.txt')

    # creating step objects
    twitter_tokenizer = TweetTokenizer()
    preprocessor = TwitterFilesPreprocessor(
        input_cols_number=3, tweets_indx=2,
        add_header=['ids', 'labels', 'tweets'])
    csv_reader = CsvReader(sep='\t', chunk_size=30)
    fields_selector = FieldSelector(fnames=["tweets", "labels"])
    token_processor = TokenProcessor(
        fnames="tweets",
        tokenization_func=twitter_tokenizer.tokenize,
        token_cleaning_func=twitter_text_cleaner,
        lower_case=True)

    # data pipeline for vocabularies creation
    vocab_data_pipeline = Pipeline(reader=csv_reader,
                                   preprocessor=preprocessor,
                                   worker_processes_num=0,
                                   name_prefix="vocabs")
    vocab_data_pipeline.add_step(fields_selector)
    vocab_data_pipeline.add_step(token_processor)

    # creating or loading vocabs
    words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
    words_vocab.load_or_create(words_vocab_file_path,
                               data_source={"data_path": data_path},
                               data_fnames="tweets")

    labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
    labels_vocab.load_or_create(labels_vocab_file_path,
                                data_source={"data_path": data_path},
                                data_fnames="labels")

    print(words_vocab)
    print(labels_vocab)
    print(vocab_data_pipeline)

    # extra steps for training and evaluation
    mapper = VocabMapper(field_names_to_vocabs={"tweets": words_vocab,
                                                "labels": labels_vocab})
    padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                    pad_symbol=words_vocab[PAD].id)
    formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                        labels_field_name="labels",
                                        classes_number=len(labels_vocab))

    # building the actual pipeline
    dev_data_pipeline = Pipeline(reader=csv_reader,
                                 preprocessor=preprocessor,
                                 worker_processes_num=1,
                                 name_prefix="dev")
    dev_data_pipeline.add_step(fields_selector)
    dev_data_pipeline.add_step(token_processor)
    dev_data_pipeline.add_step(mapper)
    dev_data_pipeline.add_step(padder)
    dev_data_pipeline.add_step(formatter)

    print(dev_data_pipeline)

    epochs = 2

    i_model = ISentiLSTM(dev_data_pipeline)
    i_model.init_model(words_vocab_size=len(words_vocab), input_dim=50,
                       lstm_hidden_dim=120,
                       number_of_classes=len(labels_vocab),
                       mask_symbol=words_vocab[PAD].id)
def _iter(self, data_path):
    """Yields the raw underlying data of each chunk (instead of the DataChunk
    objects produced by the parent CsvReader).
    """
    for dc in CsvReader._iter(self, data_path=data_path):
        yield dc.data
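# A hypothetical usage sketch: assuming the method above is defined in a
# CsvReader subclass (called RawDataCsvReader here purely for illustration),
# iterating over it yields each chunk's raw data rather than DataChunk
# wrappers:
#
#   reader = RawDataCsvReader(sep=',', chunk_size=100)
#   for raw_chunk in reader.iter(data_path='data/small_chunks/chunk1.csv'):
#       print(raw_chunk)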