def test_utf_decoding_errors(self):
    table = [
        ['hello', u'\x80\x81\xffThis unicode string encoded as latin-1 is not a valid utf-8\xaa\xbb\xcc'],
        ['hello', 'world'],
    ]
    delim = ','
    policy = 'simple'
    encoding = 'latin-1'
    csv_data = table_to_csv_string_random(table, delim, policy)

    # The latin-1 bytes decode cleanly when read back as latin-1.
    stream = io.BytesIO(csv_data.encode('latin-1'))
    record_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim, policy)
    parsed_table = record_iterator._get_all_records()
    self.assertEqual(table, parsed_table)

    parsed_table = write_and_parse_back(table, encoding, delim, policy)
    self.assertEqual(table, parsed_table)

    # The same bytes are not valid UTF-8, so decoding must fail with a clear error.
    stream = io.BytesIO(csv_data.encode('latin-1'))
    record_iterator = csv_utils.CSVRecordIterator(stream, True, 'utf-8', delim=delim, policy=policy)
    with self.assertRaises(Exception) as cm:
        parsed_table = record_iterator._get_all_records()
    self.assertIn('Unable to decode input table as UTF-8', str(cm.exception))
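
# A minimal sketch of the behavior the test above exercises, assuming strict
# decoding inside the reader; decode_chunk_sketch() is a hypothetical helper,
# not the csv_utils implementation. Bytes such as b'\x80\x81\xff' decode
# cleanly as latin-1 (every byte value is mapped) but are invalid UTF-8.
def decode_chunk_sketch(chunk_bytes, encoding):
    try:
        return chunk_bytes.decode(encoding)
    except UnicodeDecodeError:
        # Surface the user-facing message that the test asserts on.
        raise Exception('Unable to decode input table as UTF-8')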
def test_bom_warning(self):
    table = [
        [u'\xef\xbb\xbfcde', '1234'],  # leading UTF-8 BOM bytes, encoded below as latin-1
        ['abc', '1234'],
        ['abc', '1234'],
        ['efg', '100'],
        ['abc', '100'],
        ['cde', '12999'],
        ['aaa', '2000'],
        ['abc', '100'],
    ]
    delim = ','
    policy = 'simple'
    encoding = 'latin-1'
    csv_data = table_to_csv_string_random(table, delim, policy)
    stream = io.BytesIO(csv_data.encode('latin-1'))
    record_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim, policy)
    parsed_table = record_iterator._get_all_records()
    expected_warnings = ['UTF-8 Byte Order Mark (BOM) was found and skipped in input table']
    actual_warnings = record_iterator.get_warnings()
    self.assertEqual(expected_warnings, actual_warnings)
    # The parsed table must match the original except for the stripped BOM.
    expected_table = copy.deepcopy(table)
    expected_table[0][0] = 'cde'
    self.assertEqual(expected_table, parsed_table)
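
# A minimal sketch, assuming the standard codecs.BOM_UTF8 constant, of how a
# reader can strip a leading BOM from the first chunk and report the warning
# asserted above; strip_bom_sketch() is a hypothetical name, not the
# csv_utils code path.
import codecs

def strip_bom_sketch(first_chunk_bytes):
    if first_chunk_bytes.startswith(codecs.BOM_UTF8):
        warning = 'UTF-8 Byte Order Mark (BOM) was found and skipped in input table'
        return first_chunk_bytes[len(codecs.BOM_UTF8):], [warning]
    return first_chunk_bytes, []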
def write_and_parse_back(table, encoding, delim, policy):
    # Round-trip helper: write `table` with CSVWriter, then parse the result
    # back with CSVRecordIterator using the same dialect settings.
    writer_stream = io.BytesIO() if encoding is not None else io.StringIO()
    line_separator = random.choice(line_separators)
    writer = csv_utils.CSVWriter(writer_stream, False, encoding, delim, policy, line_separator)
    writer._write_all(table)
    assert not len(writer.get_warnings())
    writer_stream.seek(0)
    record_iterator = csv_utils.CSVRecordIterator(writer_stream, True, encoding, delim=delim, policy=policy)
    parsed_table = record_iterator._get_all_records()
    return parsed_table
def test_split_lines_custom(self):
    test_cases = [
        ('', []),
        ('hello', ['hello']),
        ('hello\nworld', ['hello', 'world']),
        ('hello\rworld\n', ['hello', 'world']),
        ('hello\r\nworld\rhello world\nhello\n', ['hello', 'world', 'hello world', 'hello']),
    ]
    for src, expected_res in test_cases:
        stream, encoding = string_to_randomly_encoded_stream(src)
        # A small chunk_size forces line breaks to straddle chunk boundaries.
        line_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim=None, policy=None, chunk_size=6)
        test_res = line_iterator._get_all_rows()
        self.assertEqual(expected_res, test_res)
def test_split_chunk_sizes(self):
    source_tokens = ['', 'defghIJKLMN', 'a', 'bc'] + ['\n', '\r\n', '\r']
    for _test_case in xrange6(1000):
        num_tokens = random.randint(0, 12)
        # Bias towards tiny chunks so '\r\n' pairs often straddle chunk boundaries.
        chunk_size = random.randint(1, 5) if random.randint(0, 1) else random.randint(1, 100)
        src = ''
        for _tnum in xrange6(num_tokens):
            src += random.choice(source_tokens)
        stream, encoding = string_to_randomly_encoded_stream(src)
        line_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim=None, policy=None, chunk_size=chunk_size)
        test_res = line_iterator._get_all_rows()
        expected_res = src.splitlines()
        self.assertEqual(expected_res, test_res)
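
# A minimal sketch of the property these chunk sizes probe: splitting into
# lines must give the same result as str.splitlines() even when a '\r\n' pair
# straddles a chunk boundary. This hypothetical splitter keeps a one-character
# carriage-return state across chunks and handles only '\n', '\r' and '\r\n',
# which is all these tests generate; it is not the CSVRecordIterator internals.
def split_lines_chunked_sketch(text, chunk_size):
    lines = []
    current = []
    pending_cr = False  # True if the previous character was '\r'
    for start in range(0, len(text), chunk_size):
        for ch in text[start:start + chunk_size]:
            if ch == '\n':
                if pending_cr:
                    pending_cr = False  # '\n' completes a '\r\n' pair; line already emitted
                else:
                    lines.append(''.join(current))
                    current = []
            elif ch == '\r':
                lines.append(''.join(current))
                current = []
                pending_cr = True
            else:
                pending_cr = False
                current.append(ch)
    if current:
        lines.append(''.join(current))
    return lines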
def test_iterator(self):
    for _test_num in xrange6(100):
        table = generate_random_decoded_binary_table(10, 10)
        delims = ['\t', ',', ';', '|']
        delim = random.choice(delims)
        # The 'simple' policy cannot represent fields containing the delimiter,
        # so fall back to 'quoted' whenever the table contains it.
        table_has_delim = find_in_table(table, delim)
        policy = 'quoted' if table_has_delim else random.choice(['quoted', 'simple'])
        csv_data = table_to_csv_string_random(table, delim, policy)
        stream, encoding = string_to_randomly_encoded_stream(csv_data)
        record_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
        parsed_table = record_iterator._get_all_records()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
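
# The 'quoted' fallback above exists because the 'simple' policy cannot
# represent fields that contain the delimiter. A sketch of the same quoting
# round trip using the standard csv module (an analogy, not the csv_utils
# writer/reader pair):
import csv

def quoted_round_trip_sketch(table, delim):
    buf = io.StringIO()
    csv.writer(buf, delimiter=delim).writerows(table)
    buf.seek(0)
    return [row for row in csv.reader(buf, delimiter=delim)]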
def test_whitespace_separated_parsing(self):
    data_lines = [
        'hello world',
        ' hello world ',
        'hello world ',
        ' hello ',
        ' hello world',
    ]
    expected_table = [
        ['hello', 'world'],
        ['hello', 'world'],
        ['hello', 'world'],
        ['hello'],
        ['hello', 'world'],
    ]
    csv_data = '\n'.join(data_lines)
    stream = io.StringIO(csv_data)
    delim = ' '
    policy = 'whitespace'
    encoding = None
    record_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim, policy)
    parsed_table = record_iterator._get_all_records()
    self.assertEqual(expected_table, parsed_table)
    parsed_table = write_and_parse_back(expected_table, encoding, delim, policy)
    self.assertEqual(expected_table, parsed_table)
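
# Under the 'whitespace' policy leading and trailing spaces are ignored and
# runs of spaces act as a single separator, so for these space-only inputs
# str.split() models the expected fields; a sketch, not the csv_utils parser:
def split_whitespace_sketch(line):
    return line.split()

assert split_whitespace_sketch(' hello world ') == ['hello', 'world']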
def test_monocolumn_separated_parsing(self):
    self.maxDiff = None
    for _i in xrange6(10):
        table = []
        num_rows = 20
        for irow in xrange6(num_rows):
            # Keep the last row non-empty, so that a trailing empty line is not
            # ambiguous with the final line terminator.
            min_len = 0 if irow + 1 < num_rows else 1
            table.append([make_random_decoded_binary_csv_entry(min_len, 20, restricted_chars=['\r', '\n'])])
        csv_data = table_to_csv_string_random(table, None, 'monocolumn')
        stream = io.StringIO(csv_data)
        delim = None
        policy = 'monocolumn'
        encoding = None
        record_iterator = csv_utils.CSVRecordIterator(stream, True, encoding, delim, policy)
        parsed_table = record_iterator._get_all_records()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
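
# Under the 'monocolumn' policy there is no delimiter at all: every physical
# line is a single-field record, which is why the entries above exclude '\r'
# and '\n'. A sketch of that mapping (hypothetical helper, not csv_utils):
def parse_monocolumn_sketch(lines):
    return [[line] for line in lines]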