def test_can_read_delimited_rows(self):
    # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
    # Read the customers CSV using the data format declared by the
    # customers CID and compare the first two rows (title row, then the
    # first data row) against their expected contents.
    customers_cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
    row_source = rowio.delimited_rows(dev_test.CUSTOMERS_CSV_PATH, customers_cid.data_format)
    expected_leading_rows = (
        ['customer_id', 'surname', 'first_name', 'born', 'gender'],
        ['1', 'Beck', 'Tyler', '1995-11-15', 'male'],
    )
    # Fetch and check one row at a time so a mismatch in the title row is
    # reported before the first data row is even read.
    for expected_row in expected_leading_rows:
        self.assertEqual(next(row_source), expected_row)
def test_fails_on_delimited_with_unterminated_quote(self):
    # Reading a delimited file that contains an unterminated quote must
    # raise a DataFormatError whose message mentions that the delimited
    # file cannot be parsed.
    customer_cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
    broken_delimited_path = dev_test.path_to_test_data('broken_customers_with_unterminated_quote.csv')
    try:
        # list() consumes all rows; if delimited_rows() is lazy, the error
        # only surfaces while iterating.
        list(rowio.delimited_rows(broken_delimited_path, customer_cid.data_format))
    except errors.DataFormatError as error:
        error_message = '%s' % error
        self.assertTrue(
            'cannot parse delimited file' in error_message,
            'error_message=%r' % error_message)
    else:
        # Without this branch the test would pass silently when no error
        # is raised at all.
        self.fail(
            'reading "%s" must raise errors.DataFormatError' % broken_delimited_path)
def test_can_read_delimited_rows(self):
    # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
    # Read the valid customers CSV with the data format taken from the
    # customers CID and check the contents of its very first row.
    cid_path = dev_test.path_to_test_cid("icd_customers.xls")
    customers_cid = interface.Cid(cid_path)
    csv_path = dev_test.path_to_test_data("valid_customers.csv")
    row_source = rowio.delimited_rows(csv_path, customers_cid._data_format)
    expected_first_row = ['38000', '23', 'John', 'Doe', 'male', '08.03.1957']
    self.assertEqual(next(row_source), expected_first_row)
def test_can_read_delimited_rows(self):
    # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
    # Expected contents of the first two rows of the customers CSV.
    expected_title_row = ['customer_id', 'surname', 'first_name', 'born', 'gender']
    expected_first_data_row = ['1', 'Beck', 'Tyler', '1995-11-15', 'male']

    # The data format used for reading comes from the customers CID.
    cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
    rows = rowio.delimited_rows(dev_test.CUSTOMERS_CSV_PATH, cid.data_format)

    actual_title_row = next(rows)
    self.assertEqual(actual_title_row, expected_title_row)

    actual_first_data_row = next(rows)
    self.assertEqual(actual_first_data_row, expected_first_data_row)
def _convert_to_rst(cid_path, data_path, target_rst_path, target_encoding='utf-8'):
    """
    Read the delimited data in ``data_path`` using the data format of the
    CID in ``cid_path`` and write the rows to ``target_rst_path`` (encoded
    with ``target_encoding``), one written row per data row with a
    separator line before the first row and after each row.

    Each column is sized to its longest item. If the data format declares
    a header of 1, the separator after the first row uses "=" instead of
    "-".

    Raises ``NotImplementedError`` if the data format is not delimited or
    declares a header of 2 or more, and ``ValueError`` if the data contain
    no columns or a column is empty in every row.
    """
    assert cid_path is not None
    assert data_path is not None

    _log.info('read CID from "%s"', cid_path)
    cid = cutplace.Cid(cid_path)
    data_format = cid.data_format
    if data_format.format != data.FORMAT_DELIMITED:
        raise NotImplementedError('format=%s' % data_format.format)
    if cid.data_format.header >= 2:
        raise NotImplementedError('cid.data_format.header=%s' % cid.data_format.header)
    first_row_is_heading = cid.data_format.header == 1

    _log.info('read data from "%s"', data_path)
    # All rows are kept in memory because they are traversed twice: once
    # to measure the columns and once to write them.
    rows = list(rowio.delimited_rows(data_path, data_format))

    # Find out the length of each column.
    lengths = []
    for row in rows:
        for column_index, item in enumerate(row):
            item_length = len(item)
            if column_index == len(lengths):
                # First time a column at this index shows up; this covers
                # every item of the first row as well.
                lengths.append(item_length)
            elif lengths[column_index] < item_length:
                lengths[column_index] = item_length
    if not lengths:
        raise ValueError('file must contain columns: "%s"' % data_path)
    for column_index, length in enumerate(lengths):
        if length == 0:
            raise ValueError(
                'column %d in file "%s" must not always be empty' % (column_index + 1, data_path))

    _log.info('write RST to "%s"', target_rst_path)
    with io.open(target_rst_path, mode='w', encoding=target_encoding) as rst_target_file:
        _write_rst_separator_line(rst_target_file, lengths, "-")
        for row_number, row in enumerate(rows):
            _write_rst_row(rst_target_file, lengths, row)
            # Only a heading row is closed with "="; all other rows use "-".
            is_heading_row = first_row_is_heading and row_number == 0
            line_separator = "=" if is_heading_row else "-"
            _write_rst_separator_line(rst_target_file, lengths, line_separator)
def _raw_rows(self):
    """
    Return the result of the ``rowio`` reader that matches the format of
    ``self.cid.data_format``, applied to the source data stream or path
    (presumably an iterable of rows; confirm against ``rowio``).

    Raises ``AssertionError`` if the data format is none of the known
    Excel, delimited, fixed or ODS formats.
    """
    data_format = self.cid.data_format
    # Named format_name rather than "format" to avoid shadowing the builtin.
    format_name = data_format.format
    if format_name == data.FORMAT_EXCEL:
        return rowio.excel_rows(self._source_data_stream_or_path, data_format.sheet)
    if format_name == data.FORMAT_DELIMITED:
        return rowio.delimited_rows(self._source_data_stream_or_path, data_format)
    if format_name == data.FORMAT_FIXED:
        return rowio.fixed_rows(
            self._source_data_stream_or_path,
            data_format.encoding,
            interface.field_names_and_lengths(self.cid),
            data_format.line_delimiter)
    if format_name == data.FORMAT_ODS:
        return rowio.ods_rows(self._source_data_stream_or_path, data_format.sheet)
    # Raised explicitly instead of "assert False" so that an unknown format
    # is still reported when Python runs with -O, which strips assert
    # statements and would otherwise make this method return None.
    raise AssertionError('format=%r' % format_name)
def _convert_to_rst(cid_path, data_path, target_rst_path, target_encoding='utf-8'):
    """
    Read the delimited data in ``data_path`` using the data format of the
    CID in ``cid_path`` and write the rows to ``target_rst_path`` (encoded
    with ``target_encoding``), one written row per data row with a
    separator line before the first row and after each row.

    Each column is sized to its longest item. If the data format declares
    a header of 1, the separator after the first row uses "=" instead of
    "-".

    Raises ``NotImplementedError`` if the data format is not delimited or
    declares a header of 2 or more, and ``ValueError`` if the data contain
    no columns or a column is empty in every row.
    """
    assert cid_path is not None
    assert data_path is not None

    _log.info('read CID from "%s"', cid_path)
    cid = cutplace.Cid(cid_path)
    data_format = cid.data_format
    if data_format.format != data.FORMAT_DELIMITED:
        raise NotImplementedError('format=%s' % data_format.format)
    if cid.data_format.header >= 2:
        raise NotImplementedError('cid.data_format.header=%s' % cid.data_format.header)
    first_row_is_heading = cid.data_format.header == 1

    _log.info('read data from "%s"', data_path)
    # All rows are kept in memory because they are traversed twice: once
    # to measure the columns and once to write them.
    rows = list(rowio.delimited_rows(data_path, data_format))

    # Find out the length of each column.
    lengths = []
    for row in rows:
        for column_index, item in enumerate(row):
            item_length = len(item)
            if column_index == len(lengths):
                # First time a column at this index shows up; this covers
                # every item of the first row as well.
                lengths.append(item_length)
            elif lengths[column_index] < item_length:
                lengths[column_index] = item_length
    if not lengths:
        raise ValueError('file must contain columns: "%s"' % data_path)
    for column_index, length in enumerate(lengths):
        if length == 0:
            raise ValueError('column %d in file "%s" must not always be empty' % (column_index + 1, data_path))

    _log.info('write RST to "%s"', target_rst_path)
    with io.open(target_rst_path, mode='w', encoding=target_encoding) as rst_target_file:
        _write_rst_separator_line(rst_target_file, lengths, "-")
        for row_number, row in enumerate(rows):
            _write_rst_row(rst_target_file, lengths, row)
            # Only a heading row is closed with "="; all other rows use "-".
            is_heading_row = first_row_is_heading and row_number == 0
            line_separator = "=" if is_heading_row else "-"
            _write_rst_separator_line(rst_target_file, lengths, line_separator)
def test_can_read_delimited_non_ascii(self):
    # Rows read from an in-memory text stream must keep non ASCII
    # characters intact when using a validated, otherwise unmodified
    # delimited data format.
    delimited_format = data.DataFormat(data.FORMAT_DELIMITED)
    delimited_format.validate()
    expected_rows = [['eggs'], ['sp\u00c4m']]
    with io.StringIO('eggs\nsp\u00c4m') as source_stream:
        row_source = rowio.delimited_rows(source_stream, delimited_format)
        # Consume the rows while the stream is still open.
        rows_read = list(row_source)
    self.assertEqual(expected_rows, rows_read)