def test_csv_multi_threaded_reader_output(self):
        """
        Check whether multi-threaded and single-threaded readers produce the
        correct output.
        """
        data_paths = ['data/small_chunks/chunk1.csv',
                      'data/small_chunks/chunk2.csv',
                      'data/small_chunks/chunk3.csv']
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=3)

        expected_data = read_data_from_csv_file(data_paths)

        actual_data_chunks = DataChunk()
        for data_chunk in reader.iter(data_path=data_paths):
            for key in data_chunk.keys():
                if key not in actual_data_chunks:
                    actual_data_chunks[key] = np.array([])
                actual_data_chunks[key] = np.concatenate(
                    [actual_data_chunks[key], data_chunk[key]])
        self.compare_unsorted_data_chunks(dc1=expected_data,
                                          dc2=actual_data_chunks,
                                          sort_key='id')
Example #2
def calculate_text_length(input_fp, column_names, use_moses=True, sep='\t'):
    """
    Calculates length mean and std of text in a csv file. Uses moses tokenizer.
    """
    use_moses = bool(use_moses)
    tokenize = moses_tokenizer.tokenize if use_moses else lambda x: x.split()
    _, ext = os.path.splitext(input_fp)
    assert ext in ['.csv', '.json']
    reader = CsvReader(sep=sep,
                       engine='python') if ext == '.csv' else JsonReader()
    lens = []
    for chunk in reader.iter(data_path=input_fp):
        for cname in column_names:
            for text in chunk[cname]:

                # TODO: change it, this solution seems too hacky!
                if isinstance(text, list):
                    text = " ".join(text)

                tokens = tokenize(text)
                lens.append(len(tokens))

    print("min: %f" % np.min(lens))
    print("mean: %f" % np.mean(lens))
    print("std: %f" % np.std(lens))
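A minimal usage sketch for this helper, assuming a hypothetical comma-separated file reviews.csv with a text column, and a moses_tokenizer instance already constructed at module level:

# Hypothetical invocation; 'reviews.csv' and its 'text' column are assumptions.
calculate_text_length('reviews.csv', column_names=['text'], use_moses=True, sep=',')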
Example #3
    def test_csv_reader_valid_paths(self):
        """
        Passing an intentionally wrong input to the reader and expecting it to
        throw an error.
        """
        data_paths = ["a", "b", 123123123, 123.12313]
        reader = CsvReader()
        itr = reader.iter(data_path=data_paths)

        with self.assertRaises(ValueError):
            chunk = next(iter(itr))
Example #4
    def test_invalid_steps(self):
        """Testing whether an error is raised if an invalid step is present."""
        data_path = 'mldp/tests/data/news.csv'
        data_source = {'data_path': data_path}

        inv_reader = InvalidCsvReader()
        val_reader = CsvReader()

        val_transf1 = FieldSelector("text")
        val_transf2 = TokenProcessor(fnames='text')
        inv_transf1 = InvalidTransformer()
        accum = ChunkAccumulator(new_size=3)
        formatter = PandasFormatter()

        # try only the invalid reader and valid steps
        dp = Pipeline(reader=inv_reader, error_on_invalid_chunk='error')
        for vs in [val_transf1, val_transf2, accum, formatter]:
            dp.add_step(vs)
        with self.assertRaises(DataChunkError):
            for _ in dp.iter(**data_source):
                pass

        # try valid reader and invalid steps
        steps = [val_transf1, val_transf2, inv_transf1, accum]
        for st in permutations(steps):
            dp = Pipeline(reader=val_reader, error_on_invalid_chunk='error')
            for s in st:
                dp.add_step(s)
            dp.add_step(formatter)
            with self.assertRaises(DataChunkError):
                for _ in dp.iter(**data_source):
                    pass
Example #5
    def test_readme_example(self):
        from mldp.pipeline import Pipeline
        from mldp.steps.readers import CsvReader
        from mldp.steps.transformers.nlp import TokenProcessor, Padder
        from mldp.steps.transformers.field import FieldSelector

        data_path = "mldp/tests/data/tweets.csv"

        # creating steps
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(fnames="tweets",
                                         tokenization_func=lambda x: x.split(),
                                         lower_case=True)
        padder = Padder(fname="tweets", new_mask_fname="tweets_mask",
                        pad_symbol="<PAD>")

        # creating the pipeline
        pipeline = Pipeline(reader=csv_reader, worker_processes_num=1)
        pipeline.add_step(fields_selector)
        pipeline.add_step(token_processor)
        pipeline.add_step(padder)

        # iterate over data chunks
        for data_chunk in pipeline.iter(data_path=data_path):
            pass

        # generate documentation and print it
        print(pipeline)
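For orientation, inside the loop each data_chunk exposes the selected and padded fields; a minimal sketch of inspecting them, assuming dict-style field access as in the reader tests above:

for data_chunk in pipeline.iter(data_path=data_path):
    tweets = data_chunk["tweets"]        # tokenized, lower-cased, padded tweets
    mask = data_chunk["tweets_mask"]     # mask field produced by Padder
    labels = data_chunk["labels"]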
Example #6
    def test_csv_reader_output(self):
        """Checking if read data-chunks are valid."""
        data_path = 'data/small_chunks/chunk2.csv'
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1)

        data = read_data_from_csv_file(data_path)
        expected_chunks = create_list_of_data_chunks(data,
                                                     chunk_size=chunk_size)

        itr = reader.iter(data_path=data_path)
        i = 0
        for (actual_chunk, expected_chunk) in zip(itr, expected_chunks):
            self.assertTrue(actual_chunk == expected_chunk)
            i += 1

        self.assertTrue(i == len(expected_chunks))
Example #7
    def test_invalid_pipeline(self):
        """
        Tries to create an invalid data processing pipeline, and expects to
        get an error.
        """
        reader = CsvReader()
        with self.assertRaises(ValueError):
            data_pipeline = Pipeline(reader)
            data_pipeline.add_step(FieldSelector(["dummy"]))
            data_pipeline.add_step(PandasFormatter())
            data_pipeline.add_step(FunctionApplier({"dummy": lambda x: x}))
Example #8
    def test_empty_chunks(self):
        """Testing whether empty chunks do not reach the user."""
        data_path = 'mldp/tests/data/small_chunks'
        field_names = ['first_name', 'email']
        reader = CsvReader(chunk_size=1, sep=",")
        empty_chunk_transformer = EmptyChunkTransformer(max_count=3)

        dev_data_pipeline = Pipeline(reader=reader)
        dev_data_pipeline.add_step(empty_chunk_transformer)
        dev_data_pipeline.add_step(FieldSelector(field_names))

        flag = False
        for dc in dev_data_pipeline.iter(data_path=data_path):
            flag = True
            self.assertFalse(equal_to_constant(dc, EMPTY_CHUNK))

        self.assertTrue(flag)
Example #9
    def test_vocabulary_mapper(self):
        """Testing whether the mapper allows mapping field values back and
        forth.
        """
        data_path = 'mldp/tests/data/mock_data.csv'
        target_fields = ["first_name", "last_name", "email", "gender"]

        reader = CsvReader(sep=',')
        vocab = Vocabulary(reader)

        for target_field in target_fields:
            vocab.create(data_source={"data_path": data_path},
                         data_fnames=target_field)

            data = read_data_from_csv_file(data_path)
            data_original = copy.deepcopy(data)

            mapper_to = VocabMapper({target_field: vocab}, "id")
            mapper_back = VocabMapper({target_field: vocab}, "token")

            data = mapper_to(data)
            data = mapper_back(data)

            self.assertTrue(
                (data[target_field] == data_original[target_field]).all())
Example #10
    def test_simple_scenario(self):
        """
        Runs the pipeline; if it completes without errors, the test is
        considered successful. Tries different numbers of workers.
        """
        data_path = 'mldp/tests/data/small_chunks'
        field_names = ['first_name', 'email']
        worker_processes_nums = [0, 1, 2, 3, 4]

        reader = CsvReader(sep=",")

        for wpn in worker_processes_nums:

            dev_data_pipeline = Pipeline(reader=reader, worker_processes_num=wpn)
            dev_data_pipeline.add_step(FieldSelector(field_names))
            dev_data_pipeline.add_step(ChunkAccumulator(new_size=3))
            dev_data_pipeline.add_step(PandasFormatter())

            flag = False
            for data_chunk in dev_data_pipeline.iter(data_path=data_path):
                flag = True
                self.assertTrue(len(data_chunk) > 0)

            self.assertTrue(flag)
Example #11
    def setUp(self):
        self.reader = CsvReader(sep=',')
Example #12
    def setUp(self):
        self.reader = CsvReader()
Example #13
    def test_how_to_apply_run(self):

        data_path = os.path.join(self.tutorials_path, "data/tweets.csv")

        # paths where vocabs will be saved and later loaded from
        words_vocab_file_path = os.path.join(self.tutorials_path,
                                             "data/vocabs/words.txt")
        labels_vocab_file_path = os.path.join(self.tutorials_path,
                                              'data/vocabs/labels.txt')

        # creating step objects
        twitter_tokenizer = TweetTokenizer()
        preprocessor = TwitterFilesPreprocessor(
            input_cols_number=3,
            tweets_indx=2,
            add_header=['ids', 'labels', 'tweets'])
        csv_reader = CsvReader(sep='\t', chunk_size=30)
        fields_selector = FieldSelector(fnames=["tweets", "labels"])
        token_processor = TokenProcessor(
            fnames="tweets",
            tokenization_func=twitter_tokenizer.tokenize,
            token_cleaning_func=twitter_text_cleaner,
            lower_case=True)

        # data pipeline for vocabularies creation
        vocab_data_pipeline = Pipeline(reader=csv_reader,
                                       preprocessor=preprocessor,
                                       worker_processes_num=0,
                                       name_prefix="vocabs")
        vocab_data_pipeline.add_step(fields_selector)
        vocab_data_pipeline.add_step(token_processor)

        # creating or loading vocabs
        words_vocab = Vocabulary(vocab_data_pipeline, name_prefix="words")
        words_vocab.load_or_create(words_vocab_file_path,
                                   data_source={"data_path": data_path},
                                   data_fnames="tweets")

        labels_vocab = Vocabulary(vocab_data_pipeline, name_prefix="labels")
        labels_vocab.load_or_create(labels_vocab_file_path,
                                    data_source={"data_path": data_path},
                                    data_fnames="labels")

        print(words_vocab)

        print(labels_vocab)

        print(vocab_data_pipeline)

        # extra steps for training and evaluation
        mapper = VocabMapper(field_names_to_vocabs={
            "tweets": words_vocab,
            "labels": labels_vocab
        })
        padder = Padder(fname="tweets",
                        new_mask_fname="tweets_mask",
                        pad_symbol=words_vocab[PAD].id)
        formatter = FeaturesLabelsFormatter(features_field_name="tweets",
                                            labels_field_name="labels",
                                            classes_number=len(labels_vocab))

        # building the actual pipeline
        dev_data_pipeline = Pipeline(reader=csv_reader,
                                     preprocessor=preprocessor,
                                     worker_processes_num=1,
                                     name_prefix="dev")
        dev_data_pipeline.add_step(fields_selector)
        dev_data_pipeline.add_step(token_processor)
        dev_data_pipeline.add_step(mapper)
        dev_data_pipeline.add_step(padder)
        dev_data_pipeline.add_step(formatter)

        print(dev_data_pipeline)

        epochs = 2

        i_model = ISentiLSTM(dev_data_pipeline)
        i_model.init_model(words_vocab_size=len(words_vocab),
                           input_dim=50,
                           lstm_hidden_dim=120,
                           number_of_classes=len(labels_vocab),
                           mask_symbol=words_vocab[PAD].id)
Example #14
    def _iter(self, data_path):
        for dc in CsvReader._iter(self, data_path=data_path):
            yield dc.data
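This override is only meaningful inside a CsvReader subclass; a minimal sketch of how it might be wrapped (the class name RawDataCsvReader is hypothetical):

class RawDataCsvReader(CsvReader):
    """Hypothetical reader that yields the raw data of each chunk instead of
    DataChunk objects."""

    def _iter(self, data_path):
        # Delegate to the parent reader and unwrap each DataChunk.
        for dc in CsvReader._iter(self, data_path=data_path):
            yield dc.data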