def test_csv_multi_threaded_reader_output(self):
        """
        Verify the output of a reader running with several worker threads.

        Three csv files are read with 3 worker threads and a chunk size of 2.
        All yielded chunks are merged field by field into one data-chunk,
        which is then compared, via the unsorted comparison helper keyed on
        'id', against the data loaded by `read_data_from_csv_file`.
        """
        chunk_size = 2
        data_paths = [
            'data/small_chunks/chunk1.csv',
            'data/small_chunks/chunk2.csv',
            'data/small_chunks/chunk3.csv',
        ]

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=3)
        expected_data = read_data_from_csv_file(data_paths)

        # Accumulate every field of every chunk into a single data-chunk.
        actual_data_chunks = DataChunk()
        for piece in reader.iter(data_path=data_paths):
            for field in piece.keys():
                # Seed a field with an empty array the first time it is seen.
                if field not in actual_data_chunks:
                    actual_data_chunks[field] = np.array([])
                gathered = actual_data_chunks[field]
                actual_data_chunks[field] = np.concatenate(
                    [gathered, piece[field]]
                )

        self.compare_unsorted_data_chunks(
            dc1=expected_data, dc2=actual_data_chunks, sort_key='id'
        )
# Example #2
0
def calculate_text_length(input_fp, column_names, use_moses=True, sep='\t'):
    """
    Prints the min, mean and std of the token counts of text in a data file.

    Every value of every requested column is tokenized and its token count
    is collected; the statistics are computed over all collected counts.

    :param input_fp: path to the input file; its extension must be '.csv'
                     or '.json' and selects CsvReader or JsonReader.
    :param column_names: names of the chunk columns that contain text.
    :param use_moses: if truthy, tokenize with `moses_tokenizer.tokenize`,
                      otherwise split on whitespace.
    :param sep: separator passed to CsvReader (used for '.csv' input only).
    :raises AssertionError: if the file extension is not supported.
    """
    use_moses = bool(use_moses)
    tokenize = moses_tokenizer.tokenize if use_moses else lambda x: x.split()
    _, ext = os.path.splitext(input_fp)
    assert ext in ['.csv', '.json'], \
        "Unsupported file extension: '%s'" % ext
    reader = CsvReader(sep=sep,
                       engine='python') if ext == '.csv' else JsonReader()
    lens = []
    for chunk in reader.iter(data_path=input_fp):
        for cname in column_names:
            for text in chunk[cname]:

                # TODO: change it, this solution seems too hacky!
                # Values that are already lists are joined back into a
                # single string so that they can be tokenized uniformly.
                if isinstance(text, list):
                    text = " ".join(text)

                tokens = tokenize(text)
                lens.append(len(tokens))

    # The call form of print with a single argument produces the same output
    # on Python 2 and Python 3.
    print("min: %f" % np.min(lens))
    print("mean: %f" % np.mean(lens))
    print("std: %f" % np.std(lens))
    def test_csv_reader_valid_paths(self):
        """
        Feed the reader a deliberately invalid list of paths and expect a
        ValueError when the first chunk is requested.
        """
        bad_paths = ["a", "b", 123123123, 123.12313]
        chunk_iterable = CsvReader().iter(data_path=bad_paths)

        # Only pulling the first chunk is expected to raise; the iterable
        # itself is created outside of the assertion context.
        with self.assertRaises(ValueError):
            next(iter(chunk_iterable))
    def test_csv_reader_output(self):
        """
        Checking if read data-chunks are valid.

        Reads one csv file with a single worker thread and a chunk size of 2,
        then verifies that the reader yields exactly as many chunks as
        expected and that each one equals its expected counterpart.
        """
        data_path = 'data/small_chunks/chunk2.csv'
        chunk_size = 2

        reader = CsvReader(chunk_size=chunk_size, worker_threads_num=1)

        data = read_data_from_csv_file(data_path)
        expected_chunks = create_list_of_data_chunks(data,
                                                     chunk_size=chunk_size)

        # Materialise the reader output before comparing: a lazy pairwise zip
        # stops at the shorter input, so surplus chunks produced by the
        # reader would otherwise go unnoticed.
        actual_chunks = list(reader.iter(data_path=data_path))
        self.assertEqual(len(actual_chunks), len(expected_chunks))

        for actual_chunk, expected_chunk in zip(actual_chunks,
                                                expected_chunks):
            self.assertTrue(actual_chunk == expected_chunk)