def test_fixed_streaming(self):
    """Streaming mode: fixed2csv writes into a caller-supplied output buffer."""
    buffer = six.StringIO()
    with open('examples/testfixed', 'r') as data, \
            open('examples/testfixed_schema.csv', 'r') as schema:
        fixed.fixed2csv(data, schema, output=buffer)
    with open('examples/testfixed_converted.csv', 'r') as expected:
        self.assertEqual(expected.read(), buffer.getvalue())
def convert(f, format, schema=None, key=None, **kwargs):
    """
    Convert a file of a specified format to CSV.

    :param f: An open file object containing the data to convert.
    :param format: One of ``'fixed'``, ``'xls'``, ``'xlsx'``, ``'json'``,
        ``'ndjson'``, ``'geojson'``, ``'csv'`` or ``'dbf'``.
    :param schema: An open schema file; required when *format* is ``'fixed'``.
    :param key: Top-level key containing the row data; used for ``'json'``.
    :returns: The converted CSV data, as returned by the format-specific
        ``*2csv`` converter.
    :raises ValueError: If *f* or *format* is falsy, *schema* is missing for
        fixed-width input, ``'dbf'`` is requested under Python 3, or the
        format is unsupported.
    """
    if not f:
        raise ValueError('f must not be None')

    if not format:
        raise ValueError('format must not be None')

    if format == 'fixed':
        if not schema:
            raise ValueError('schema must not be null when format is "fixed"')

        return fixed2csv(f, schema, **kwargs)
    elif format == 'xls':
        return xls2csv(f, **kwargs)
    elif format == 'xlsx':
        return xlsx2csv(f, **kwargs)
    elif format == 'json':
        return json2csv(f, key, **kwargs)
    elif format == 'ndjson':
        return ndjson2csv(f, **kwargs)
    elif format == 'geojson':
        return geojson2csv(f, **kwargs)
    elif format == 'csv':
        return csv2csv(f, **kwargs)
    elif format == 'dbf':
        if six.PY3:
            # Bug fix: message previously read "forthis" (missing space).
            raise ValueError('format "dbf" is not supported for this version of Python.')

        return dbf2csv(f, **kwargs)
    else:
        raise ValueError('format "%s" is not supported' % format)
def test_fixed(self):
    """Non-streaming mode: fixed2csv returns the converted CSV as a string."""
    with open('examples/testfixed', 'r') as data, \
            open('examples/testfixed_schema.csv', 'r') as schema:
        result = fixed.fixed2csv(data, schema)
    with open('examples/testfixed_converted.csv', 'r') as expected:
        self.assertEqual(expected.read(), result)
def test_fixed_skip_lines(self):
    """skip_lines=3 drops leading junk lines before fixed-width parsing."""
    with open('examples/testfixed_skip_lines', 'r') as data, \
            open('examples/testfixed_schema.csv', 'r') as schema:
        result = fixed.fixed2csv(data, schema, skip_lines=3)
    with open('examples/testfixed_converted.csv', 'r') as expected:
        self.assertEqual(expected.read(), result)
def convert(f, format, schema=None, key=None, **kwargs):
    """
    Convert a file of a specified format to CSV.

    :param f: An open file object containing the data to convert.
    :param format: One of ``'fixed'``, ``'xls'``, ``'xlsx'``, ``'json'``,
        ``'ndjson'``, ``'geojson'``, ``'csv'`` or ``'dbf'``.
    :param schema: An open schema file; required when *format* is ``'fixed'``.
    :param key: Top-level key containing the row data; used for ``'json'``.
    :returns: The converted CSV data, as returned by the format-specific
        ``*2csv`` converter.
    :raises ValueError: If *f* or *format* is falsy, *schema* is missing for
        fixed-width input, ``'dbf'`` is requested under Python 3, or the
        format is unsupported.
    """
    if not f:
        raise ValueError('f must not be None')

    if not format:
        raise ValueError('format must not be None')

    if format == 'fixed':
        if not schema:
            raise ValueError('schema must not be null when format is "fixed"')

        return fixed2csv(f, schema, **kwargs)
    elif format == 'xls':
        return xls2csv(f, **kwargs)
    elif format == 'xlsx':
        return xlsx2csv(f, **kwargs)
    elif format == 'json':
        return json2csv(f, key, **kwargs)
    elif format == 'ndjson':
        return ndjson2csv(f, **kwargs)
    elif format == 'geojson':
        return geojson2csv(f, **kwargs)
    elif format == 'csv':
        return csv2csv(f, **kwargs)
    elif format == 'dbf':
        if six.PY3:
            # Bug fix: message previously read "forthis" (missing space).
            raise ValueError(
                'format "dbf" is not supported for this version of Python.')

        return dbf2csv(f, **kwargs)
    else:
        raise ValueError('format "%s" is not supported' % format)
def convert(f, format, schema=None, key=None, output=None, **kwargs):
    """
    Convert a file of a specified format to CSV, writing the result to *output*.

    Fixed-width and GeoJSON inputs are converted by dedicated helpers; the
    remaining formats are loaded into an agate.Table and serialized.

    :param f: An open file object containing the data to convert.
    :param format: One of ``'fixed'``, ``'geojson'``, ``'csv'``, ``'dbf'``,
        ``'json'``, ``'ndjson'``, ``'xls'`` or ``'xlsx'``.
    :param schema: An open schema file; required when *format* is ``'fixed'``.
    :param key: Top-level key containing the row data (JSON/NDJSON).
    :param output: A writable file-like object receiving the CSV.
    :raises ValueError: If *schema* is missing for fixed-width input, or the
        format is unsupported.
    """
    if format == 'fixed':
        if not schema:
            raise ValueError('schema must not be null when format is "fixed"')
        output.write(fixed2csv(f, schema, output=output, **kwargs))
        return

    if format == 'geojson':
        output.write(geojson2csv(f, **kwargs))
        return

    if format not in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        raise ValueError('format "%s" is not supported' % format)

    # Table-based formats: build an agate.Table, then serialize it.
    if format == 'csv':
        table = agate.Table.from_csv(f, **kwargs)
    elif format == 'json':
        table = agate.Table.from_json(f, key=key, **kwargs)
    elif format == 'ndjson':
        table = agate.Table.from_json(f, key=key, newline=True, **kwargs)
    elif format == 'xls':
        table = agate.Table.from_xls(f, sheet=kwargs.get('sheet', None))
    elif format == 'xlsx':
        table = agate.Table.from_xlsx(f, sheet=kwargs.get('sheet', None))
    elif format == 'dbf':
        with dbf.Table(f.name) as db:
            column_names = db.field_names
            table = agate.Table(db, column_names)

    table.to_csv(output)
def _make_csv(self, fixed_width_data):
    """Convert a fixed-width data file to a CSV.

    Args:
        fixed_width_data: A binary file-like object containing fixed-width
            records laid out as described by self._schema.

    Returns:
        A rewound temporary text file containing the CSV data.
    """
    # Rewind both inputs so repeated conversions start from the top.
    self._schema.seek(0)
    fixed_width_data.seek(0)

    # Decode the raw bytes as latin-1 text for the converter.
    text_stream = TextIOWrapper(fixed_width_data, encoding='latin-1')
    out_file = TemporaryFile(mode='w+')
    fixed2csv(text_stream, self._schema, output=out_file)
    text_stream.close()
    out_file.seek(0)

    self.logger.debug('Converted fixed-width data to CSV')
    return out_file
def main(self):
    """
    Convert the input file to CSV and write it to self.output_file.

    The input format is taken from --format/--schema/--key or guessed from
    the file extension; Excel inputs additionally support sheet listing
    (-n/--names) and per-sheet CSV export (--write-sheets).
    """
    path = self.args.input_path

    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
    elif self.args.schema:
        # A schema implies fixed-width input.
        filetype = 'fixed'
    elif self.args.key:
        # A top-level key implies JSON input.
        filetype = 'json'
    else:
        if not path or path == '-':
            self.argparser.error(
                'You must specify a format when providing input as piped data via STDIN.'
            )
        filetype = convert.guess_format(path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying '
                'a format with --format.')

    # -n/--names: print sheet names instead of converting (Excel only).
    if self.args.names_only:
        if filetype in ('xls', 'xlsx'):
            sheets = self.sheet_names(path, filetype)
            for sheet in sheets:
                self.output_file.write('%s\n' % sheet)
        else:
            self.argparser.error(
                'You cannot use the -n or --names options with non-Excel files.'
            )
        return

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = self.open_excel_input_file(path)
    else:
        self.input_file = self._open_input_file(path)

    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        kwargs['sniff_limit'] = self.args.sniff_limit
    if filetype in ('xls', 'xlsx'):
        kwargs['header'] = not self.args.no_header_row
    if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):  # csv, fixed, xls, xlsx
        kwargs['skip_lines'] = self.args.skip_lines
    if filetype != 'dbf':
        kwargs['column_types'] = self.get_column_types()

    # Convert the file.
    # Fast path: with no inference, header handling, skipping or sniffing
    # requested, CSV can be streamed row-by-row without building a table.
    if (filetype == 'csv' and self.args.no_inference and not self.args.no_header_row
            and not self.args.skip_lines and self.args.sniff_limit == 0):
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(
            fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based formats: load into an agate.Table, then serialize.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key,
                                          newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(
                self.input_file, sheet=self.args.sheet,
                encoding_override=self.args.encoding_xls, **kwargs)
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
        elif filetype == 'dbf':
            # The DBF reader needs a real filesystem path, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError(
                    'DBF files can not be converted from stdin. You must pass a filename.'
                )
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file, **self.writer_kwargs)

    if self.args.write_sheets:
        # Close and re-open the file, as the file object has been mutated or closed.
        self.input_file.close()
        self.input_file = self.open_excel_input_file(path)
        if self.args.write_sheets == '-':
            # '-' means all sheets.
            sheets = self.sheet_names(path, filetype)
        else:
            sheets = [
                int(sheet) if sheet.isdigit() else sheet
                for sheet in self.args.write_sheets.split(',')
            ]
        if filetype == 'xls':
            tables = agate.Table.from_xls(
                self.input_file, sheet=sheets,
                encoding_override=self.args.encoding_xls, **kwargs)
        elif filetype == 'xlsx':
            tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)
        # Write each sheet to "<input basename>_<index>.csv".
        base = splitext(self.input_file.name)[0]
        for i, table in enumerate(tables.values()):
            with open('%s_%d.csv' % (base, i), 'w') as f:
                table.to_csv(f, **self.writer_kwargs)

    self.input_file.close()
    if self.args.schema:
        schema.close()
def main(self):
    """
    Convert the input file to CSV and write it to self.output_file.

    The input format is taken from --format/--schema/--key or guessed from
    the file extension.
    """
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        # A schema implies fixed-width input.
        filetype = 'fixed'
    elif self.args.key:
        # A top-level key implies JSON input.
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )

    # Buffer stdin when streaming is not possible (CSV or type inference).
    self.buffers_input = filetype == 'csv' or not self.args.no_inference

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        # Streaming CSV mustn't set sniff_limit, but non-streaming should.
        if not self.args.no_inference:
            kwargs['sniff_limit'] = self.args.sniff_limit
    if self.args.no_header_row:
        kwargs['header'] = False
    elif self.args.no_inference:
        # Streaming CSV mustn't set column_types, but other formats should.
        kwargs['column_types'] = agate.TypeTester(limit=0)

    # Convert the file.
    # Fast path: stream CSV row-by-row when no type inference is requested.
    if filetype == 'csv' and self.args.no_inference:
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(
            fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based formats: load into an agate.Table, then serialize.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key,
                                          newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet'))
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet'))
        elif filetype == 'dbf':
            # The DBF reader needs a real filesystem path, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError(
                    'DBF files can not be converted from stdin. You must pass a filename.'
                )
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file)

    self.input_file.close()
    if self.args.schema:
        schema.close()
def main(self):
    """
    Convert the input file to CSV and write it to self.output_file.

    The input format is taken from --format/--schema/--key or guessed from
    the file extension.
    """
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        # A schema implies fixed-width input.
        filetype = 'fixed'
    elif self.args.key:
        # A top-level key implies JSON input.
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        # Streaming CSV mustn't set sniff_limit, but non-streaming should.
        if not self.args.no_inference:
            kwargs['sniff_limit'] = self.args.sniff_limit
    if self.args.no_header_row:
        kwargs['header'] = False
    elif self.args.no_inference:
        # Streaming CSV mustn't set column_types, but other formats should.
        kwargs['column_types'] = agate.TypeTester(limit=0)

    # Convert the file.
    # Fast path: stream CSV row-by-row when no type inference is requested.
    # NOTE(review): agate.reader/agate.writer is a legacy API spelling —
    # presumably the agate version this code targets exposes it; confirm.
    if filetype == 'csv' and self.args.no_inference:
        reader = agate.reader(self.input_file, **self.reader_kwargs)
        writer = agate.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based formats: load into an agate.Table, then serialize.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'dbf':
            # The DBF reader needs a real filesystem path, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file)
def main(self):
    """
    Convert the input file to CSV and write it to self.output_file.

    The input format is taken from --format/--schema/--key or guessed from
    the file extension; Excel inputs additionally support sheet listing
    (-n/--names) and per-sheet CSV export (--write-sheets).
    """
    path = self.args.input_path

    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        # A schema implies fixed-width input.
        filetype = 'fixed'
    elif self.args.key:
        # A top-level key implies JSON input.
        filetype = 'json'
    else:
        if not path or path == '-':
            self.argparser.error('You must specify a format when providing input as piped data via STDIN.')
        filetype = convert.guess_format(path)
        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # -n/--names: print sheet names instead of converting (Excel only).
    if self.args.names_only:
        sheets = self.sheet_names(path, filetype)
        if sheets:
            for sheet in sheets:
                self.output_file.write('%s\n' % sheet)
        else:
            self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
        return

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = self.open_excel_input_file(path)
    else:
        self.input_file = self._open_input_file(path)

    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        kwargs['sniff_limit'] = self.args.sniff_limit
    if filetype in ('xls', 'xlsx'):
        kwargs['header'] = not self.args.no_header_row
    if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):  # csv, fixed, xls, xlsx
        kwargs['skip_lines'] = self.args.skip_lines
    if filetype != 'dbf':
        kwargs['column_types'] = self.get_column_types()

    # Convert the file.
    # Fast path: with no inference, header handling, skipping or sniffing
    # requested, CSV can be streamed row-by-row without building a table.
    if filetype == 'csv' and self.args.no_inference and not self.args.no_header_row and not self.args.skip_lines and self.args.sniff_limit == 0:
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based formats: load into an agate.Table, then serialize.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=self.args.sheet,
                                         encoding_override=self.args.encoding_xls, **kwargs)
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
        elif filetype == 'dbf':
            # The DBF reader needs a real filesystem path, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file, **self.writer_kwargs)

    if self.args.write_sheets:
        # Close and re-open the file, as the file object has been mutated or closed.
        self.input_file.close()
        self.input_file = self.open_excel_input_file(path)
        if self.args.write_sheets == '-':
            # '-' means all sheets.
            sheets = self.sheet_names(path, filetype)
        else:
            sheets = [int(sheet) if sheet.isdigit() else sheet
                      for sheet in self.args.write_sheets.split(',')]
        if filetype == 'xls':
            tables = agate.Table.from_xls(self.input_file, sheet=sheets,
                                          encoding_override=self.args.encoding_xls, **kwargs)
        elif filetype == 'xlsx':
            tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)
        # Write each sheet to "<input basename>_<index>.csv".
        base = splitext(self.input_file.name)[0]
        for i, table in enumerate(tables.values()):
            with open('%s_%d.csv' % (base, i), 'w') as f:
                table.to_csv(f, **self.writer_kwargs)

    self.input_file.close()
    if self.args.schema:
        schema.close()
def main(self):
    """
    Convert the input file to CSV and write it to self.output_file.

    The input format is taken from --format/--schema/--key or guessed from
    the file extension; Excel inputs additionally support sheet listing
    (-n/--names).
    """
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        # A schema implies fixed-width input.
        filetype = 'fixed'
    elif self.args.key:
        # A top-level key implies JSON input.
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )

    # Buffer standard input if the input file is in CSV format or if performing type inference.
    self.buffers_input = filetype == 'csv' or not self.args.no_inference

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # -n/--names: print sheet names instead of converting (Excel only).
    if self.args.names_only:
        sheet_names = None
        if filetype == 'xls':
            sheet_names = xlrd.open_workbook(
                file_contents=self.input_file.read()).sheet_names()
        elif filetype == 'xlsx':
            sheet_names = openpyxl.load_workbook(self.input_file, read_only=True,
                                                 data_only=True).sheetnames
        if sheet_names:
            for name in sheet_names:
                self.output_file.write('%s\n' % name)
        else:
            self.argparser.error(
                'You cannot use the -n or --names options with non-Excel files.'
            )
        self.input_file.close()
        return

    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        kwargs['sniff_limit'] = self.args.sniff_limit
    if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):
        kwargs['skip_lines'] = self.args.skip_lines
    if filetype != 'dbf':
        kwargs['column_types'] = self.get_column_types()

    # Convert the file.
    # Fast path: stream CSV row-by-row when no inference or skipping is needed.
    if filetype == 'csv' and self.args.no_inference and not self.args.skip_lines:
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(
            fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based formats: load into an agate.Table, then serialize.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key,
                                          newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, **kwargs)
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, **kwargs)
        elif filetype == 'dbf':
            # The DBF reader needs a real filesystem path, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError(
                    'DBF files can not be converted from stdin. You must pass a filename.'
                )
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file)

    self.input_file.close()
    if self.args.schema:
        schema.close()