def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)

        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # Guard against None entries (padded short rows) before joining.
            if ''.join(c or '' for c in out_row) == '':
                continue

        output.writerow(out_row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filter_reader:
        output.writerow(row)
def main(self):
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if not self.input_files:
        self.argparser.error('You must specify at least one file to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = agate.writer(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = agate.reader(f, **self.reader_kwargs)

        # If we have header rows, use them
        if not self.args.no_header_row:
            headers = next(rows, [])

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)
        # If we don't, generate simple column names based on the first row
        else:
            row = next(rows, [])

            headers = make_default_headers(len(row))

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        f.close()
def main(self):
    rows = agate.reader(self.input_file, **self.reader_kwargs)

    # Make a default header row if none exists
    if self.args.no_header_row:
        row = next(rows)

        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_names = list(column_names)

    # prepend 'line_number' column with line numbers if --linenumbers option
    if self.args.line_numbers:
        column_names.insert(0, 'line_number')
        rows = [list(itertools.chain([str(i + 1)], row)) for i, row in enumerate(rows)]

    # Convert to normal list of rows
    rows = list(rows)

    # Insert the column names at the top
    rows.insert(0, column_names)

    widths = []

    for row in rows:
        for i, v in enumerate(row):
            try:
                if len(v) > widths[i]:
                    widths[i] = len(v)
            except IndexError:
                widths.append(len(v))

    # Dashes span each width with '+' character at intersection of
    # horizontal and vertical dividers.
    divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

    self.output_file.write('%s\n' % divider)

    for i, row in enumerate(rows):
        output = []

        for j, d in enumerate(row):
            if d is None:
                d = ''

            output.append(' %s ' % six.text_type(d).ljust(widths[j]))

        self.output_file.write('| %s |\n' % ('|'.join(output)))

        if (i == 0 or i == len(rows) - 1):
            self.output_file.write('%s\n' % divider)
def main(self):
    reader = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.input_file.name)

        with open('%s_out.csv' % base, 'w') as f:
            clean_writer = agate.writer(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = '%s_err.csv' % base

            with open(error_filename, 'w') as f:
                error_writer = agate.writer(f, **self.writer_kwargs)

                error_header = ['line_number', 'msg']
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
def test_no_match(self):
    args = ['-c', '1', '-m', 'NO MATCH', 'examples/dummy.csv']
    output_file = six.StringIO()
    utility = CSVGrep(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['a', 'b', 'c'])
def __init__(self, schema):
    self.fields = []  # A list of FixedWidthFields

    schema_reader = agate.reader(schema)
    schema_decoder = SchemaDecoder(next(schema_reader))

    for i, row in enumerate(schema_reader):
        try:
            self.fields.append(schema_decoder(row))
        except Exception as e:
            raise ValueError("Error reading schema at line %i: %s" % (i + 2, e))
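# Illustrative sketch only (not taken from the project): the constructor above
# expects `schema` to be a file-like object containing a CSV description of a
# fixed-width layout. The header names used below ('column', 'start', 'length')
# follow csvkit's usual fixed-width schema convention, but treat them as an
# assumption here; SchemaDecoder is the authority on which headers it accepts.
import six

example_schema = six.StringIO(
    'column,start,length\n'
    'name,0,10\n'
    'age,10,3\n'
)
# Passing a file-like object like `example_schema` to the constructor would
# populate self.fields with one decoded field per data row; schema errors are
# reported 1-based and offset past the header row, hence the `i + 2` above.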
def main(self):
    reader = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
    else:
        if self.input_file == sys.stdin:
            base = 'stdin'  # "<stdin>_out.csv" is invalid on Windows
        else:
            base = splitext(self.input_file.name)[0]

        with open('%s_out.csv' % base, 'w') as f:
            clean_writer = agate.writer(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = '%s_err.csv' % base

            with open(error_filename, 'w') as f:
                error_writer = agate.writer(f, **self.writer_kwargs)

                error_header = ['line_number', 'msg']
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
def test_exclude(self):
    args = ['-C', '1,3', 'examples/dummy.csv']
    output_file = six.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['b'])
    self.assertEqual(next(reader), ['2'])
def test_no_header_row(self):
    args = ['-c', '2', '--no-header-row', 'examples/no_header_row.csv']
    output_file = six.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['column2'])
    self.assertEqual(next(reader), ['2'])
def test_display_column_names(self):
    args = ['-n', 'examples/realdata/FY09_EDU_Recipients_by_State.csv']
    output_file = six.StringIO()
    utility = CSVGrep(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    # print_column_names() uses '%3i: %s', so the index is padded to width 3.
    self.assertEqual(next(reader), ['  1: State Name'])
    self.assertEqual(next(reader), ['  2: State Abbreviate'])
def test_with_bzip2(self):
    args = ['-c', '1,3', 'examples/dummy.csv.bz2']
    output_file = six.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['a', 'c'])
    self.assertEqual(next(reader), ['1', '3'])
def test_string_match(self):
    args = ['-c', '1', '-m', 'ILLINOIS', 'examples/realdata/FY09_EDU_Recipients_by_State.csv']
    output_file = six.StringIO()
    utility = CSVGrep(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), [
        'State Name', 'State Abbreviate', 'Code',
        'Montgomery GI Bill-Active Duty', 'Montgomery GI Bill- Selective Reserve',
        'Dependents\' Educational Assistance', 'Reserve Educational Assistance Program',
        'Post-Vietnam Era Veteran\'s Educational Assistance Program', 'TOTAL', ''])
    self.assertEqual(next(reader), ['ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19', '21,964', ''])
def test_unicode(self):
    args = ['-c', '1,3', 'examples/test_utf8.csv']
    output_file = six.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['a', 'c'])
    self.assertEqual(next(reader), ['1', '3'])
    self.assertEqual(next(reader), ['4', u'ʤ'])
def test_no_inference(self):
    args = ['--no-inference', '-c', '1', 'examples/test_literal_order.csv']
    output_file = six.StringIO()
    utility = CSVSort(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    test_order = [u'a', u'192', u'27', u'3']
    new_order = [six.text_type(r[0]) for r in reader]

    self.assertEqual(test_order, new_order)
def test_no_header_row(self):
    args = ['--no-header-row', '-c', '1', '-r', 'examples/no_header_row3.csv']
    output_file = six.StringIO()
    utility = CSVSort(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    test_order = ['column1', '4', '1']
    new_order = [six.text_type(r[0]) for r in reader]

    self.assertEqual(test_order, new_order)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)

        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filter_reader:
        output.writerow(row)
def get_rows_and_column_names_and_column_ids(self):
    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        # Peek at a row to get the number of columns.
        row = next(rows)
        rows = itertools.chain([row], rows)
        column_names = make_default_headers(len(row))
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, getattr(self.args, 'not_columns', None))

    return rows, column_names, column_ids
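# Sketch of how a utility typically consumes this helper, mirroring the
# csvcut-style main() earlier in this section. Illustrative only, not part of
# the public API:
#
#     rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids()
#
#     output = agate.writer(self.output_file, **self.writer_kwargs)
#     output.writerow([column_names[c] for c in column_ids])
#
#     for row in rows:
#         output.writerow([row[c] if c < len(row) else None for c in column_ids])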
def test_sort_date(self):
    args = ['-c', '2', 'examples/testxls_converted.csv']
    output_file = six.StringIO()
    utility = CSVSort(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    test_order = [u'text', u'This row has blanks', u'Unicode! Σ', u'Chicago Tribune', u'Chicago Sun-Times', u'Chicago Reader']
    new_order = [six.text_type(r[0]) for r in reader]

    self.assertEqual(test_order, new_order)
def test_sort_ints_and_nulls(self):
    args = ['-c', '2', 'examples/sort_ints_nulls.csv']
    output_file = six.StringIO()
    utility = CSVSort(args, output_file)
    utility.main()

    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    test_order = ['b', '', '1', '2']
    new_order = [six.text_type(r[1]) for r in reader]

    self.assertEqual(test_order, new_order)
def test_filenames_grouping(self):
    # stack two CSV files
    args = ['--filenames', '-n', 'path', 'examples/dummy.csv', 'examples/dummy2.csv']
    output_file = six.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['path', 'a', 'b', 'c'])
    self.assertEqual(next(reader)[0], 'dummy.csv')
    self.assertEqual(next(reader)[0], 'dummy2.csv')
def test_no_grouping(self):
    # stack two CSV files
    args = ['examples/dummy.csv', 'examples/dummy2.csv']
    output_file = six.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['a', 'b', 'c'])
    self.assertEqual(next(reader)[0], '1')
    self.assertEqual(next(reader)[0], '1')
def test_single_file_stack(self):
    # stacking a single file works fine
    args = ['examples/dummy.csv']
    output_file = six.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['a', 'b', 'c'])
    self.assertEqual(next(reader)[0], '1')
def test_multiple_file_stack(self):
    # stacking multiple files works fine
    args = ['examples/dummy.csv', 'examples/dummy2.csv']
    output_file = six.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader), ['a', 'b', 'c'])
    self.assertEqual(next(reader)[0], '1')
    self.assertEqual(next(reader)[0], '1')
def test_no_header_row(self):
    # stack two CSV files
    args = ['--no-header-row', 'examples/no_header_row.csv', 'examples/no_header_row2.csv']
    output_file = six.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = six.StringIO(output_file.getvalue())
    reader = agate.reader(input_file)

    self.assertEqual(next(reader)[0], 'column1')
    self.assertEqual(next(reader)[0], '1')
    self.assertEqual(next(reader)[0], '4')
def get_rows_and_column_names_and_column_ids(self, **kwargs):
    rows = agate.reader(self.input_file, **kwargs)

    if self.args.no_header_row:
        # Peek at a row to get the number of columns.
        row = next(rows)
        rows = itertools.chain([row], rows)
        column_names = make_default_headers(len(row))
    else:
        column_names = next(rows)

    column_offset = self.get_column_offset()

    if self.args.line_numbers:
        column_offset -= 1

    column_ids = parse_column_identifiers(self.args.columns, column_names, column_offset, getattr(self.args, 'not_columns', None))

    return rows, column_names, column_ids
def print_column_names(self):
    """
    Pretty-prints the names and indices of all columns to a file-like object
    (usually sys.stdout).
    """
    if getattr(self.args, 'no_header_row', None):
        raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.')

    f = self.input_file
    output = self.output_file

    try:
        zero_based = self.args.zero_based
    except AttributeError:
        # Not every utility defines --zero-based; default to 1-based indices.
        zero_based = False

    rows = agate.reader(f, **self.reader_kwargs)
    column_names = next(rows)

    for i, c in enumerate(column_names):
        if not zero_based:
            i += 1

        output.write('%3i: %s\n' % (i, c))
def main(self):
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to join.')

    if self.args.columns:
        join_column_names = self._parse_join_column_names(self.args.columns)

        if len(join_column_names) == 1:
            join_column_names = join_column_names * len(self.input_files)

        if len(join_column_names) != len(self.input_files):
            self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

    if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
        self.argparser.error('You must provide join column names when performing an outer join.')

    if self.args.left_join and self.args.right_join:
        self.argparser.error('It is not valid to specify both a left and a right join.')

    tables = []

    for f in self.input_files:
        tables.append(list(agate.reader(f, **self.reader_kwargs)))
        f.close()

    join_column_ids = []

    if self.args.columns:
        for i, t in enumerate(tables):
            join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

    jointab = []

    if self.args.left_join:
        # Left outer join
        jointab = tables[0]

        for i, t in enumerate(tables[1:]):
            jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
    elif self.args.right_join:
        # Right outer join
        jointab = tables[-1]

        remaining_tables = tables[:-1]
        remaining_tables.reverse()

        for i, t in enumerate(remaining_tables):
            jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
    elif self.args.outer_join:
        # Full outer join
        jointab = tables[0]

        for i, t in enumerate(tables[1:]):
            jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
    else:
        if self.args.columns:
            # Inner join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        else:
            jointab = tables[0]

            # Sequential join
            for t in tables[1:]:
                jointab = join.sequential_join(jointab, t)

    output = agate.writer(self.output_file, **self.writer_kwargs)

    for row in jointab:
        output.writerow(row)
def main(self):
    operations = [op for op in OPERATIONS if getattr(self.args, op + '_only')]

    if len(operations) > 1:
        self.argparser.error('Only one statistic argument may be specified (mean, median, etc).')

    if operations and self.args.count_only:
        self.argparser.error('You may not specify --count and a statistical argument at the same time.')

    if self.args.count_only:
        count = len(list(agate.reader(self.input_file)))

        if not self.args.no_header_row:
            count -= 1

        self.output_file.write('Row count: %i\n' % count)

        return

    tab = table.Table.from_csv(
        self.input_file,
        snifflimit=self.args.snifflimit,
        column_ids=self.args.columns,
        zero_based=self.args.zero_based,
        no_header_row=self.args.no_header_row,
        **self.reader_kwargs
    )

    for c in tab:
        values = sorted(filter(lambda i: i is not None, c))

        stats = {}

        # Output a single stat
        if len(operations) == 1:
            op = operations[0]
            stat = getattr(self, 'get_%s' % op)(c, values, {})

            # Formatting
            if op == 'unique':
                stat = len(stat)
            elif op == 'freq':
                stat = ', '.join([('"%s": %s' % (six.text_type(k), count)) for k, count in stat])
                stat = '{ %s }' % stat

            if len(tab) == 1:
                self.output_file.write(six.text_type(stat))
            else:
                self.output_file.write('%3i. %s: %s\n' % (c.order + 1, c.name, stat))
        # Output all stats
        else:
            for op in OPERATIONS:
                stats[op] = getattr(self, 'get_%s' % op)(c, values, stats)

            self.output_file.write(('%3i. %s\n' % (c.order + 1, c.name)))

            if c.type is None:
                self.output_file.write('\tEmpty column\n')
                continue

            self.output_file.write('\t%s\n' % c.type)
            self.output_file.write('\tNulls: %s\n' % stats['nulls'])

            if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool:
                uniques = [six.text_type(u) for u in list(stats['unique'])]
                data = u'\tValues: %s\n' % ', '.join(uniques)
                self.output_file.write(data)
            else:
                if c.type not in [six.text_type, bool]:
                    self.output_file.write('\tMin: %s\n' % stats['min'])
                    self.output_file.write('\tMax: %s\n' % stats['max'])

                    if c.type in [int, float]:
                        self.output_file.write('\tSum: %s\n' % stats['sum'])
                        self.output_file.write('\tMean: %s\n' % stats['mean'])
                        self.output_file.write('\tMedian: %s\n' % stats['median'])
                        self.output_file.write('\tStandard Deviation: %s\n' % stats['stdev'])

                self.output_file.write('\tUnique values: %i\n' % len(stats['unique']))

                if len(stats['unique']) != len(values):
                    self.output_file.write('\t%i most frequent values:\n' % MAX_FREQ)

                    for value, count in stats['freq']:
                        self.output_file.write(('\t\t%s:\t%s\n' % (six.text_type(value), count)))

            if c.type == six.text_type:
                self.output_file.write('\tMax length: %i\n' % stats['len'])

    if not operations:
        self.output_file.write('\n')
        self.output_file.write('Row count: %s\n' % tab.count_rows())
def main(self):
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype

        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')

        filetype = convert.guess_format(self.args.input_path)

        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # Set the reader's arguments.
    kwargs = {}

    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')

    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet

    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)

        # Streaming CSV mustn't set sniff_limit, but non-streaming should.
        if not self.args.no_inference:
            kwargs['sniff_limit'] = self.args.sniff_limit

        if self.args.no_header_row:
            kwargs['header'] = False
    elif self.args.no_inference:
        # Streaming CSV mustn't set column_types, but other formats should.
        kwargs['column_types'] = agate.TypeTester(limit=0)

    # Convert the file.
    if filetype == 'csv' and self.args.no_inference:
        reader = agate.reader(self.input_file, **self.reader_kwargs)
        writer = agate.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'dbf':
            if not hasattr(self.input_file, 'name'):
                raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')

            table = agate.Table.from_dbf(self.input_file.name, **kwargs)

        table.to_csv(self.output_file)
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8 will
    still be "order" 7, even if it's the third column in the resulting Table).
    """
    # This bit of nonsense is to deal with "files" from stdin,
    # which are not seekable and thus must be buffered
    contents = f.read()

    # snifflimit == 0 means do not sniff
    if snifflimit is None:
        kwargs['dialect'] = sniff_dialect(contents)
    elif snifflimit > 0:
        kwargs['dialect'] = sniff_dialect(contents[:snifflimit])

    f = six.StringIO(contents)
    rows = agate.reader(f, **kwargs)

    try:
        if no_header_row:
            # Peek at a row to infer column names from, and put it back on top
            row = next(rows)
            rows = itertools.chain([row], rows)
            headers = make_default_headers(len(row))
        else:
            headers = next(rows)
    except StopIteration:
        # The file is `/dev/null`.
        headers = []

    if no_header_row or column_ids:
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for c in headers]
    width = len(data_columns)

    for i, row in enumerate(rows):
        j = 0

        for j, d in enumerate(row):
            try:
                data_columns[j].append(row[column_ids[j]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

        j += 1

        # Populate remaining columns with None
        while j < width:
            data_columns[j].append(None)
            j += 1

    columns = []

    for i, c in enumerate(data_columns):
        columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

    return Table(columns, name=name)
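# Minimal usage sketch for from_csv(), modeled on the csvstat call sites in
# this section. 'data.csv' is a placeholder path, not a file shipped with the
# project:
#
#     with open('data.csv') as f:
#         tab = Table.from_csv(f, snifflimit=None, no_header_row=False)
#
#     for c in tab:
#         print(c.order, c.name, c.type)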
def get_output_as_reader(self, args):
    return agate.reader(self.get_output_as_io(args))
def main(self):
    reader = agate.reader(self.input_file, **self.reader_kwargs)
    writer = agate.writer(self.output_file, **self.writer_kwargs)
    writer.writerows(reader)
def main(self):
    """
    Convert CSV to JSON.
    """
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        json_kwargs['encoding'] = 'utf-8'

    def dump_json(data, newline=False):
        json.dump(data, stream, **json_kwargs)

        if newline:
            stream.write("\n")

    if self.args.lat and not self.args.lon:
        self.argparser.error('--lon is required whenever --lat is specified.')

    if self.args.lon and not self.args.lat:
        self.argparser.error('--lat is required whenever --lon is specified.')

    if self.args.crs and not self.args.lat:
        self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.')

    # GeoJSON
    if self.args.lat and self.args.lon:
        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    if min_lat is None or lat < min_lat:
                        min_lat = lat

                    if max_lat is None or lat > max_lat:
                        max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if min_lon is None or lon < min_lon:
                        min_lon = lon

                    if max_lon is None or lon > max_lon:
                        max_lon = lon
                elif id_column is not None and i == id_column:
                    geoid = c
                else:
                    properties[column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = OrderedDict([
                ('type', 'Point'),
                ('coordinates', [lon, lat])
            ])
            feature['properties'] = properties

            features.append(feature)

        output = OrderedDict([
            ('type', 'FeatureCollection'),
            ('bbox', [min_lon, min_lat, max_lon, max_lat]),
            ('features', features)
        ])

        if self.args.crs:
            output['crs'] = OrderedDict([
                ('type', 'name'),
                ('properties', {
                    'name': self.args.crs
                })
            ])

        dump_json(output)
    elif self.args.streamOutput and self.args.no_inference:
        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        for row in rows:
            data = OrderedDict()

            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    data[column] = None

            dump_json(data, newline=True)
    else:
        table = agate.Table.from_csv(self.input_file, sniff_limit=self.args.sniff_limit, column_types=self.get_column_types())
        table.to_json(stream, key=self.args.key, newline=self.args.streamOutput, indent=self.args.indent)
def main(self):
    """
    Convert CSV to JSON.
    """
    if six.PY2:
        stream = codecs.getwriter('utf-8')(self.output_file)
    else:
        stream = self.output_file

    json_kwargs = {
        'ensure_ascii': False,
        'indent': self.args.indent,
    }

    if six.PY2:
        json_kwargs['encoding'] = 'utf-8'

    def dump_json(data, newline=False):
        json.dump(data, stream, **json_kwargs)

        if newline:
            stream.write("\n")

    if self.args.lat and not self.args.lon:
        self.argparser.error('--lon is required whenever --lat is specified.')

    if self.args.lon and not self.args.lat:
        self.argparser.error('--lat is required whenever --lon is specified.')

    if self.args.crs and not self.args.lat:
        self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

    if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
        self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    # GeoJSON
    if self.args.lat and self.args.lon:
        features = []
        min_lon = None
        min_lat = None
        max_lon = None
        max_lat = None

        lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
        lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

        if self.args.key:
            id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
        else:
            id_column = None

        for row in rows:
            feature = OrderedDict()
            feature['type'] = 'Feature'
            properties = OrderedDict()
            geoid = None
            lat = None
            lon = None

            for i, c in enumerate(row):
                if i == lat_column:
                    try:
                        lat = float(c)
                    except ValueError:
                        lat = None

                    if min_lat is None or lat < min_lat:
                        min_lat = lat

                    if max_lat is None or lat > max_lat:
                        max_lat = lat
                elif i == lon_column:
                    try:
                        lon = float(c)
                    except ValueError:
                        lon = None

                    if min_lon is None or lon < min_lon:
                        min_lon = lon

                    if max_lon is None or lon > max_lon:
                        max_lon = lon
                elif id_column is not None and i == id_column:
                    geoid = c
                else:
                    properties[column_names[i]] = c

            if id_column is not None:
                feature['id'] = geoid

            feature['geometry'] = OrderedDict([
                ('type', 'Point'),
                ('coordinates', [lon, lat])
            ])
            feature['properties'] = properties

            features.append(feature)

        output = OrderedDict([
            ('type', 'FeatureCollection'),
            ('bbox', [min_lon, min_lat, max_lon, max_lat]),
            ('features', features)
        ])

        if self.args.crs:
            output['crs'] = OrderedDict([
                ('type', 'name'),
                ('properties', {
                    'name': self.args.crs
                })
            ])

        dump_json(output)
    # Keyed JSON
    elif self.args.key:
        output = OrderedDict()

        for row in rows:
            data = OrderedDict()

            for i, column in enumerate(column_names):
                data[column] = row[i]

            k = data[self.args.key]

            if k in output:
                raise NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k))

            output[k] = data

        dump_json(output)
    # Boring JSON
    else:
        output = []

        for row in rows:
            data = OrderedDict()

            for i, column in enumerate(column_names):
                try:
                    data[column] = row[i]
                except IndexError:
                    data[column] = None

            if self.args.streamOutput:
                dump_json(data, newline=True)
            else:
                output.append(data)

        if not self.args.streamOutput:
            dump_json(output)
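# For orientation, the shape of the keyed-JSON branch above, traced by hand
# from the code rather than taken from project docs: given a two-column input
#
#     a,b
#     1,2
#     3,4
#
# running with --key a would produce
#
#     {"1": {"a": "1", "b": "2"}, "3": {"a": "3", "b": "4"}}
#
# and a repeated value in column "a" would raise NonUniqueKeyColumnException.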
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8 will
    still be "order" 7, even if it's the third column in the resulting Table).
    """
    # This bit of nonsense is to deal with "files" from stdin,
    # which are not seekable and thus must be buffered
    contents = f.read()

    # snifflimit == 0 means do not sniff
    if snifflimit is None:
        kwargs['dialect'] = sniffer.sniff_dialect(contents)
    elif snifflimit > 0:
        kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

    f = six.StringIO(contents)
    rows = agate.reader(f, **kwargs)

    try:
        if no_header_row:
            # Peek at a row to infer column names from, and put it back on top
            row = next(rows)
            rows = itertools.chain([row], rows)
            headers = make_default_headers(len(row))
        else:
            headers = next(rows)
    except StopIteration:
        # The file is `/dev/null`.
        headers = []

    if no_header_row or column_ids:
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for c in headers]
    width = len(data_columns)

    for i, row in enumerate(rows):
        j = 0

        for j, d in enumerate(row):
            try:
                data_columns[j].append(row[column_ids[j]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

        j += 1

        # Populate remaining columns with None
        while j < width:
            data_columns[j].append(None)
            j += 1

    columns = []

    for i, c in enumerate(data_columns):
        columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

    return Table(columns, name=name)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    operations = [op for op in OPERATIONS if getattr(self.args, op + '_only')]

    if len(operations) > 1:
        self.argparser.error('Only one statistic argument may be specified (mean, median, etc).')

    if operations and self.args.count_only:
        self.argparser.error('You may not specify --count and a statistical argument at the same time.')

    if self.args.count_only:
        count = len(list(agate.reader(self.input_file)))

        if not self.args.no_header_row:
            count -= 1

        self.output_file.write('Row count: %i\n' % count)

        return

    tab = table.Table.from_csv(
        self.input_file,
        sniff_limit=self.args.sniff_limit,
        column_ids=self.args.columns,
        zero_based=self.args.zero_based,
        no_header_row=self.args.no_header_row,
        **self.reader_kwargs
    )

    for c in tab:
        values = sorted(filter(lambda i: i is not None, c))

        stats = {}

        # Output a single stat
        if len(operations) == 1:
            op = operations[0]
            stat = getattr(self, 'get_%s' % op)(c, values, {})

            # Formatting
            if op == 'unique':
                stat = len(stat)
            elif op == 'freq':
                stat = ', '.join([('"%s": %s' % (six.text_type(k), count)) for k, count in stat])
                stat = '{ %s }' % stat

            if len(tab) == 1:
                self.output_file.write(six.text_type(stat))
            else:
                self.output_file.write('%3i. %s: %s\n' % (c.order + 1, c.name, stat))
        # Output all stats
        else:
            for op in OPERATIONS:
                stats[op] = getattr(self, 'get_%s' % op)(c, values, stats)

            self.output_file.write(('%3i. %s\n' % (c.order + 1, c.name)))

            if c.type is None:
                self.output_file.write('\tEmpty column\n')
                continue

            self.output_file.write('\t%s\n' % c.type)
            self.output_file.write('\tNulls: %s\n' % stats['nulls'])

            if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool:
                uniques = [six.text_type(u) for u in list(stats['unique'])]
                data = u'\tValues: %s\n' % ', '.join(uniques)
                self.output_file.write(data)
            else:
                if c.type not in [six.text_type, bool]:
                    self.output_file.write('\tMin: %s\n' % stats['min'])
                    self.output_file.write('\tMax: %s\n' % stats['max'])

                    if c.type in [int, float]:
                        self.output_file.write('\tSum: %s\n' % stats['sum'])
                        self.output_file.write('\tMean: %s\n' % stats['mean'])
                        self.output_file.write('\tMedian: %s\n' % stats['median'])
                        self.output_file.write('\tStandard Deviation: %s\n' % stats['stdev'])

                self.output_file.write('\tUnique values: %i\n' % len(stats['unique']))

                if len(stats['unique']) != len(values):
                    self.output_file.write('\t%i most frequent values:\n' % MAX_FREQ)

                    for value, count in stats['freq']:
                        self.output_file.write(('\t\t%s:\t%s\n' % (six.text_type(value), count)))

            if c.type == six.text_type:
                self.output_file.write('\tMax length: %i\n' % stats['len'])

    if not operations:
        self.output_file.write('\n')
        self.output_file.write('Row count: %s\n' % tab.count_rows())