Пример #1
0
    def test_utf_decoding_errors(self):
        # Latin-1 bytes that are not valid UTF-8 must round-trip cleanly when
        # decoded as latin-1, but must raise a decode error under utf-8.
        input_table = [[
            'hello',
            u'\x80\x81\xffThis unicode string encoded as latin-1 is not a valid utf-8\xaa\xbb\xcc'
        ], ['hello', 'world']]
        separator = ','
        split_policy = 'simple'
        source_encoding = 'latin-1'
        csv_text = table_to_csv_string_random(input_table, separator,
                                              split_policy)

        # Parsing with the matching encoding succeeds.
        byte_stream = io.BytesIO(csv_text.encode('latin-1'))
        iterator = csv_utils.CSVRecordIterator(byte_stream, True,
                                               source_encoding, separator,
                                               split_policy)
        self.assertEqual(input_table, iterator._get_all_records())

        # Writer round-trip succeeds as well.
        self.assertEqual(
            input_table,
            write_and_parse_back(input_table, source_encoding, separator,
                                 split_policy))

        # Forcing utf-8 onto the same latin-1 bytes must fail.
        byte_stream = io.BytesIO(csv_text.encode('latin-1'))
        iterator = csv_utils.CSVRecordIterator(byte_stream,
                                               True,
                                               'utf-8',
                                               delim=separator,
                                               policy=split_policy)
        with self.assertRaises(Exception) as cm:
            iterator._get_all_records()
        self.assertTrue(
            str(cm.exception).find('Unable to decode input table as UTF-8') !=
            -1)
Пример #2
0
 def test_bom_warning(self):
     # The first cell starts with a UTF-8 BOM sequence; the iterator should
     # strip it from the parsed data and emit exactly one warning about it.
     table = [
         [u'\xef\xbb\xbfcde', '1234'],
         ['abc', '1234'],
         ['abc', '1234'],
         ['efg', '100'],
         ['abc', '100'],
         ['cde', '12999'],
         ['aaa', '2000'],
         ['abc', '100'],
     ]
     delim = ','
     policy = 'simple'
     encoding = 'latin-1'
     csv_data = table_to_csv_string_random(table, delim, policy)
     stream = io.BytesIO(csv_data.encode('latin-1'))
     record_iterator = csv_utils.CSVRecordIterator(stream, True, encoding,
                                                   delim, policy)
     parsed_table = record_iterator._get_all_records()
     self.assertEqual(
         ['UTF-8 Byte Order Mark (BOM) was found and skipped in input table'],
         record_iterator.get_warnings())
     # The parsed table matches the input except for the stripped BOM.
     expected_table = copy.deepcopy(table)
     expected_table[0][0] = 'cde'
     self.assertEqual(expected_table, parsed_table)
Пример #3
0
def write_and_parse_back(table, encoding, delim, policy):
    """Serialize *table* with CSVWriter, parse it back, and return the result.

    A text stream is used when *encoding* is None, a byte stream otherwise.
    The line separator is chosen at random from the module-level list.
    """
    out_stream = io.StringIO() if encoding is None else io.BytesIO()
    eol = random.choice(line_separators)
    writer = csv_utils.CSVWriter(out_stream, False, encoding, delim, policy,
                                 eol)
    writer._write_all(table)
    # Writing a previously parsed/generated table must never warn.
    assert not writer.get_warnings()
    out_stream.seek(0)
    reader = csv_utils.CSVRecordIterator(out_stream,
                                         True,
                                         encoding,
                                         delim=delim,
                                         policy=policy)
    return reader._get_all_records()
Пример #4
0
 def test_split_lines_custom(self):
     # (source string, expected lines) pairs covering \n, \r and \r\n
     # separators, including a trailing newline and an empty input.
     test_cases = [
         ('', []),
         ('hello', ['hello']),
         ('hello\nworld', ['hello', 'world']),
         ('hello\rworld\n', ['hello', 'world']),
         ('hello\r\nworld\rhello world\nhello\n',
          ['hello', 'world', 'hello world', 'hello']),
     ]
     for src, expected_res in test_cases:
         stream, encoding = string_to_randomly_encoded_stream(src)
         # chunk_size=6 forces separators to straddle chunk boundaries.
         line_iterator = csv_utils.CSVRecordIterator(stream,
                                                     True,
                                                     encoding,
                                                     delim=None,
                                                     policy=None,
                                                     chunk_size=6)
         self.assertEqual(expected_res, line_iterator._get_all_rows())
Пример #5
0
 def test_split_chunk_sizes(self):
     # Fuzz the chunked line splitter against str.splitlines using random
     # token sequences and random chunk sizes.
     source_tokens = ['', 'defghIJKLMN', 'a', 'bc'] + ['\n', '\r\n', '\r']
     for _ in xrange6(1000):
         num_tokens = random.randint(0, 12)
         # Bias towards tiny chunks half the time to stress boundary logic.
         if random.randint(0, 1):
             chunk_size = random.randint(1, 5)
         else:
             chunk_size = random.randint(1, 100)
         src = ''.join(
             random.choice(source_tokens) for _ in xrange6(num_tokens))
         stream, encoding = string_to_randomly_encoded_stream(src)
         line_iterator = csv_utils.CSVRecordIterator(stream,
                                                     True,
                                                     encoding,
                                                     delim=None,
                                                     policy=None,
                                                     chunk_size=chunk_size)
         self.assertEqual(src.splitlines(), line_iterator._get_all_rows())
Пример #6
0
    def test_iterator(self):
        # Randomized round-trip: table -> csv text -> parsed table, both via
        # a raw in-memory stream and via the writer helper.
        for _ in xrange6(100):
            table = generate_random_decoded_binary_table(10, 10)
            delim = random.choice(['\t', ',', ';', '|'])
            # Tables containing the delimiter require quoting to round-trip.
            if find_in_table(table, delim):
                policy = 'quoted'
            else:
                policy = random.choice(['quoted', 'simple'])
            csv_data = table_to_csv_string_random(table, delim, policy)
            stream, encoding = string_to_randomly_encoded_stream(csv_data)

            record_iterator = csv_utils.CSVRecordIterator(stream,
                                                          True,
                                                          encoding,
                                                          delim=delim,
                                                          policy=policy)
            self.assertEqual(table, record_iterator._get_all_records())

            self.assertEqual(
                table, write_and_parse_back(table, encoding, delim, policy))
Пример #7
0
    def test_whitespace_separated_parsing(self):
        # Under the 'whitespace' policy runs of spaces act as one separator
        # and leading/trailing whitespace produces no empty fields.
        csv_data = '\n'.join([
            'hello world',
            '   hello   world  ',
            'hello   world  ',
            '  hello   ',
            '  hello   world',
        ])
        expected_table = [['hello', 'world'], ['hello', 'world'],
                          ['hello', 'world'], ['hello'], ['hello', 'world']]
        delim = ' '
        policy = 'whitespace'
        encoding = None
        record_iterator = csv_utils.CSVRecordIterator(io.StringIO(csv_data),
                                                      True, encoding, delim,
                                                      policy)
        self.assertEqual(expected_table, record_iterator._get_all_records())

        self.assertEqual(
            expected_table,
            write_and_parse_back(expected_table, encoding, delim, policy))
Пример #8
0
    def test_monocolumn_separated_parsing(self):
        # Monocolumn mode: every line is a single cell, no delimiter at all.
        for _ in xrange6(10):
            self.maxDiff = None
            num_rows = 20
            table = []
            for irow in xrange6(num_rows):
                # Only the final row is required to be non-empty.
                min_len = 1 if irow + 1 == num_rows else 0
                cell = make_random_decoded_binary_csv_entry(
                    min_len, 20, restricted_chars=['\r', '\n'])
                table.append([cell])
            csv_data = table_to_csv_string_random(table, None, 'monocolumn')
            delim = None
            policy = 'monocolumn'
            encoding = None
            record_iterator = csv_utils.CSVRecordIterator(
                io.StringIO(csv_data), True, encoding, delim, policy)
            self.assertEqual(table, record_iterator._get_all_records())

            self.assertEqual(
                table, write_and_parse_back(table, encoding, delim, policy))