Exemplo n.º 1
0
    def test_fast_csv_reader_column_vals_full(self):
        """Expect fast_csv_reader to report the values buffer as full.

        The single column 'f1' carries 1000-character strings in the second
        half of its rows, while ``column_offsets`` reserves only
        ``10 * chunk_row_size`` bytes of ``column_vals`` for it.  The test
        asserts that the values-full flag is set, the indices-full flag is
        not, and the reported full column index is 0.
        """
        def _make_column_val_full_data(chunk_row_size):
            # First half of the rows is 'a'; the remainder is 1000 x 'b'.
            short_rows = chunk_row_size // 2
            long_rows = chunk_row_size - short_rows
            frame = pd.DataFrame(
                {'f1': ['a'] * short_rows + ['b' * 1000] * long_rows})

            # Value-buffer boundaries of the only column: [0, 10 * rows].
            offsets = chunk_row_size * np.array([0, 10], dtype=np.int64)

            buffer = StringIO()
            frame.to_csv(buffer, index=False)
            return buffer, frame, offsets

        rows_per_chunk = 10
        csv_buffer, _, column_offsets = _make_column_val_full_data(
            rows_per_chunk)

        # The reader consumes the CSV text as a flat uint8 array.
        raw_bytes = csv_buffer.getvalue().encode()
        content = np.frombuffer(raw_bytes, dtype=np.uint8)

        _, n_columns, n_rows, _ = get_file_stat(csv_buffer, rows_per_chunk)
        column_inds = np.zeros((n_columns, n_rows + 1), dtype=np.int64)
        column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)

        outcome = fast_csv_reader(
            content, 0, column_inds, column_vals, column_offsets, True,
            ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
        _, _, is_indices_full, is_values_full, val_full_col_idx = outcome

        self.assertFalse(is_indices_full)
        self.assertTrue(is_values_full)
        self.assertEqual(val_full_col_idx, 0)
Exemplo n.º 2
0
    def test_csv_fast_correctness(self):
        """Check the indices and value bytes fast_csv_reader writes.

        Data comes from ``_make_test_data(TEST_SCHEMA, 3, 100)``.  After the
        read, three rows must be reported, the per-row end indices of the
        first two columns must match, and the stored bytes must equal the
        expected ASCII codes.
        """
        file_lines = 3
        chunk_row_size = 100

        csv_buffer, _, _, _, column_offsets = _make_test_data(
            TEST_SCHEMA, file_lines, chunk_row_size)
        raw_bytes = csv_buffer.getvalue().encode()
        content = np.frombuffer(raw_bytes, dtype=np.uint8)

        _, n_columns, n_rows, _ = get_file_stat(csv_buffer, chunk_row_size)
        column_inds = np.zeros((n_columns, n_rows + 1), dtype=np.int64)
        column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)

        outcome = fast_csv_reader(
            content, 0, column_inds, column_vals, column_offsets, True,
            ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
        _, written_row_count, _, _, _ = outcome

        self.assertEqual(written_row_count, 3)

        # Index rows have one more entry than rows written (leading 0).
        index_stop = written_row_count + 1
        first_col_inds = column_inds[0][:index_stop]
        second_col_inds = column_inds[1][:index_stop]
        self.assertListEqual(list(first_col_inds), [0, 3, 5, 9])
        self.assertListEqual(list(second_col_inds), [0, 1, 2, 3])

        # Column 0 bytes: 'ccc' + 'bb' + 'dddd' as ASCII codes.
        first_start = column_offsets[0]
        first_col_vals = column_vals[first_start:first_start + 9]
        self.assertListEqual(
            list(first_col_vals),
            [99, 99, 99, 98, 98, 100, 100, 100, 100])

        # Column 1 bytes: '1', '0', '1' as ASCII codes.
        second_start = column_offsets[1]
        second_col_vals = column_vals[second_start:second_start + 3]
        self.assertListEqual(list(second_col_vals), [49, 48, 49])
Exemplo n.º 3
0
    def test_escape_bad_formed_csv_2(self):
        """Expect 'invalid double quote' for text after a closing quote.

        The data row ``1,"abc"de`` has characters following the closing
        quote of a quoted field; the test asserts that fast_csv_reader
        raises an exception whose message is exactly 'invalid double quote'.
        """
        chunk_row_size = 10
        open_csv = StringIO('id,f1\n1,"abc"de\n')
        raw_bytes = open_csv.getvalue().encode()
        content = np.frombuffer(raw_bytes, dtype=np.uint8)

        _, n_columns, n_rows, _ = get_file_stat(open_csv, chunk_row_size)
        column_offsets = chunk_row_size * np.array([0, 1, 11], dtype=np.int64)

        # One extra index slot per column for the initial index 0.
        inds_shape = (n_columns, n_rows + 1)
        column_inds = np.zeros(inds_shape, dtype=np.int64)
        column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)

        with self.assertRaises(Exception) as context:
            fast_csv_reader(
                content, 0, column_inds, column_vals, column_offsets, True,
                ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE,
                WHITE_SPACE_VALUE)

        message = str(context.exception)
        self.assertEqual(message, 'invalid double quote')
Exemplo n.º 4
0
    def test_escape_well_formed_csv(self):
        """Check end indices for correctly quoted and escaped fields.

        The single data row holds three quoted fields, two of which contain
        doubled quotes.  The test asserts that one row is written and that
        the first-row end index of columns 1-3 is 3, 5 and 4 respectively.
        """
        chunk_row_size = 10
        open_csv = StringIO('id,f1,f2,f3\n1,"abc","a""b""c","""ab"""\n')
        raw_bytes = open_csv.getvalue().encode()
        content = np.frombuffer(raw_bytes, dtype=np.uint8)

        _, n_columns, n_rows, _ = get_file_stat(open_csv, chunk_row_size)
        boundaries = np.array([0, 1, 11, 21, 31], dtype=np.int64)
        column_offsets = boundaries * chunk_row_size

        # One extra index slot per column for the initial index 0.
        inds_shape = (n_columns, n_rows + 1)
        column_inds = np.zeros(inds_shape, dtype=np.int64)
        column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)

        outcome = fast_csv_reader(
            content, 0, column_inds, column_vals, column_offsets, True,
            ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
        _, written_row_count, _, _, _ = outcome
        self.assertEqual(written_row_count, 1)

        expected_first_row_ends = (
            (1, 3),  # abc
            (2, 5),  # a"b"c
            (3, 4),  # "ab"
        )
        for column, end_index in expected_first_row_ends:
            self.assertEqual(column_inds[column, 1], end_index)
Exemplo n.º 5
0
    def test_fast_csv_reader_column_inds_full(self):
        """Expect fast_csv_reader to report the index buffer as full.

        Three columns each hold 'abcd' in the first row and empty strings
        afterwards, with ``10 * chunk_row_size`` bytes of ``column_vals``
        reserved per column.  The test asserts that the indices-full flag is
        set while the values-full flag is not.
        """
        def _make_column_inds_full_data(chunk_row_size):
            n_fields = 3
            # Every column: 'abcd' in row 0, then empty strings.
            frame = pd.DataFrame({
                'f' + str(i): ['abcd'] + [''] * (chunk_row_size - 1)
                for i in range(n_fields)
            })

            # Evenly spaced boundaries: 0, 10*rows, 20*rows, 30*rows.
            stride = 10 * chunk_row_size
            offsets = np.arange(n_fields + 1, dtype=np.int64) * stride

            buffer = StringIO()
            frame.to_csv(buffer, index=False)
            return buffer, frame, offsets

        rows_per_chunk = 10
        csv_buffer, _, column_offsets = _make_column_inds_full_data(
            rows_per_chunk)

        raw_bytes = csv_buffer.getvalue().encode()
        content = np.frombuffer(raw_bytes, dtype=np.uint8)

        _, n_columns, n_rows, _ = get_file_stat(csv_buffer, rows_per_chunk)
        column_inds = np.zeros((n_columns, n_rows + 1), dtype=np.int64)
        column_vals = np.zeros(column_offsets[-1], dtype=np.uint8)

        outcome = fast_csv_reader(
            content, 0, column_inds, column_vals, column_offsets, True,
            ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE, WHITE_SPACE_VALUE)
        _, _, is_indices_full, is_values_full, _ = outcome

        self.assertTrue(is_indices_full)
        self.assertFalse(is_values_full)