def test_string_match(self):
    args = [
        '-c', '1', '-m', 'ILLINOIS',
        'examples/realdata/FY09_EDU_Recipients_by_State.csv'
    ]
    output_file = StringIO.StringIO()
    utility = CSVGrep(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), [
        'State Name', 'State Abbreviate', 'Code',
        'Montgomery GI Bill-Active Duty',
        'Montgomery GI Bill- Selective Reserve',
        'Dependents\' Educational Assistance',
        'Reserve Educational Assistance Program',
        'Post-Vietnam Era Veteran\'s Educational Assistance Program',
        'TOTAL', ''
    ])
    self.assertEqual(reader.next(), [
        'ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19',
        '21,964', ''
    ])
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = rows.next()
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for i, row in enumerate(rows):
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            if ''.join(out_row) == '':
                continue

        output.writerow(out_row)
def main(self):
    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = rows.next()
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = rows.next()

    column_names = self.args.columns.split(',')

    part_count = 0
    output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
    output.writerow(column_names)

    count = 0

    for row in rows:
        if (self.args.lines > 0) and (count == self.args.lines):
            part_count += 1
            count = 0

            # couldn't find a better way to close the file
            del output

            output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
            output.writerow(column_names)

        output.writerow(row)
        count += 1
def main(self):
    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    # Make a default header row if none exists
    if self.args.no_header_row:
        row = rows.next()
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = rows.next()

    column_names = list(column_names)

    # prepend 'line_number' column with line numbers if --linenumbers option
    if self.args.line_numbers:
        column_names.insert(0, 'line_number')
        rows = [list(itertools.chain([str(i + 1)], row)) for i, row in enumerate(rows)]

    # Convert to normal list of rows
    rows = list(rows)

    # Insert the column names at the top
    rows.insert(0, column_names)

    widths = []

    for row in rows:
        for i, v in enumerate(row):
            try:
                if len(v) > widths[i]:
                    widths[i] = len(v)
            except IndexError:
                widths.append(len(v))

    # Dashes span each width with '+' character at intersection of
    # horizontal and vertical dividers.
    divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

    self.output_file.write('%s\n' % divider)

    for i, row in enumerate(rows):
        output = []

        for j, d in enumerate(row):
            if d is None:
                d = ''
            output.append(' %s ' % unicode(d).ljust(widths[j]))

        self.output_file.write(('| %s |\n' % ('|'.join(output))).encode('utf-8'))

        if (i == 0 or i == len(rows) - 1):
            self.output_file.write('%s\n' % divider)
def sample_data(path, dialect_parameters, sample_size, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            reader.next()  # skip headers
            samples = []

            for row in islice(reader, sample_size):
                samples.append(row)
        except UnicodeDecodeError:
            raise DataSamplingError(_('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.') % (encoding))

    return samples
def test_no_grouping(self):
    # stack two CSV files
    args = ['examples/dummy.csv', 'examples/dummy2.csv']
    output_file = StringIO.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['a', 'b', 'c'])
    self.assertEqual(reader.next()[0], '1')
    self.assertEqual(reader.next()[0], '1')
def test_no_header_row(self):
    # stack two CSV files
    args = ['--no-header-row', 'examples/no_header_row.csv', 'examples/no_header_row2.csv']
    output_file = StringIO.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next()[0], 'column1')
    self.assertEqual(reader.next()[0], '1')
    self.assertEqual(reader.next()[0], '4')
def test_explicit_grouping(self):
    # stack two CSV files
    args = ['--groups', 'asd,sdf', '-n', 'foo', 'examples/dummy.csv', 'examples/dummy2.csv']
    output_file = StringIO.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['foo', 'a', 'b', 'c'])
    self.assertEqual(reader.next()[0], 'asd')
    self.assertEqual(reader.next()[0], 'sdf')
def from_csv(cls, f, name='from_csv_table', **kwargs): """ Creates a new Table from a file-like object containing CSV data. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() sample = contents dialect = sniffer.sniff_dialect(sample, **kwargs) f = StringIO(contents) reader = CSVKitReader(f, dialect=dialect, **kwargs) headers = reader.next() data_columns = [[] for c in headers] for row in reader: for i, d in enumerate(row): try: data_columns[i].append(d.strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(i, headers[i], c)) return Table(columns, name=name)
def main(self): """ Convert CSV to JSON. """ rows = CSVKitReader(self.args.file, **self.reader_kwargs) column_names = rows.next() stream = codecs.getwriter('utf-8')(self.output_file) if self.args.key: output = {} for row in rows: row_dict = dict(zip(column_names, row)) k = row_dict[self.args.key] if k in output: raise NonUniqueKeyColumnException( 'Value %s is not unique in the key column.' % unicode(k)) output[k] = row_dict else: output = [dict(zip(column_names, row)) for row in rows] json.dump(output, stream, ensure_ascii=False, indent=self.args.indent, encoding='utf-8')
def main(self): if self.args.names_only: self.print_column_names() return if not self.args.regex and not self.args.pattern and not self.args.matchfile: self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.") rows = CSVKitReader(self.args.file, **self.reader_kwargs) column_names = rows.next() column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based) if self.args.regex: pattern = re.compile(self.args.regex) elif self.args.matchfile: lines = set(line.rstrip() for line in self.args.matchfile) pattern = lambda x: x in lines else: pattern = self.args.pattern patterns = dict((c, pattern) for c in column_ids) output = CSVKitWriter(self.output_file, **self.writer_kwargs) output.writerow(column_names) filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse) for i, row in enumerate(filter_reader): output.writerow(row)
def main(self): if self.args.names_only: self.print_column_names() return if not self.args.regex and not self.args.pattern and not self.args.matchfile: self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.") rows = CSVKitReader(self.args.file, **self.reader_kwargs) column_names = rows.next() column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based) if self.args.regex: pattern = re.compile(self.args.regex) elif self.args.matchfile: lines = [line.rstrip() for line in self.args.matchfile] pattern = lambda x: x in lines else: pattern = self.args.pattern patterns = dict((c, pattern) for c in column_ids) output = CSVKitWriter(self.output_file, **self.writer_kwargs) output.writerow(column_names) filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse) for i, row in enumerate(filter_reader): output.writerow(row)
def main(self):
    if len(self.args.files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.args.files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.args.files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.args.files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        headers = rows.next()

        if i == 0:
            if groups:
                headers.insert(0, group_name)

            output.writerow(headers)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)
def sample_data(path, dialect_parameters, sample_size, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            reader.next()  # skip headers
            samples = []

            for row in islice(reader, sample_size):
                samples.append(row)
        except UnicodeDecodeError:
            raise DataSamplingError('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.' % (encoding))

    return samples
def from_csv(cls, f, name='from_csv_table', **kwargs): """ Creates a new Table from a file-like object containing CSV data. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() sample = contents dialect = sniffer.sniff_dialect(sample) f = StringIO(contents) reader = CSVKitReader(f, dialect=dialect, **kwargs) headers = reader.next() data_columns = [[] for c in headers] for row in reader: for i, d in enumerate(row): try: data_columns[i].append(d.strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(i, headers[i], c)) return Table(columns, name=name)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    # Read in header and rows
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = reader.next()

    if self.args.columns is None:
        grouped_columns_ids = []
    else:
        grouped_columns_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    aggregations = []

    try:
        for (fun, cols) in map(lambda (f, cols): (f, parse_column_identifiers(cols, column_names, self.args.zero_based)), self.args.aggregations):
            for col in cols:
                aggregations.append(aggregate_functions[fun](col))
    except KeyError:
        self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))

    # Determine columns to group by, default to all columns

    # Write the output
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in group_rows(column_names, reader, grouped_columns_ids, aggregations):
        output.writerow(row)
def main(self):
    if len(self.args.files) < 2:
        sys.exit('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.args.files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.args.files):
            sys.exit('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.args.files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        headers = rows.next()

        if i == 0:
            if groups:
                headers.insert(0, group_name)

            output.writerow(headers)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a matching
    identifier to be parsed, type inferred, etc. However, their order/index
    property will reflect the original data (e.g. column 8 will still be
    "order" 7, even if it's the third column in the resulting Table).
    """
    # This bit of nonsense is to deal with "files" from stdin,
    # which are not seekable and thus must be buffered
    contents = f.read()

    if snifflimit:
        sample = contents[:snifflimit]
    else:
        sample = contents

    dialect = sniffer.sniff_dialect(sample)

    f = StringIO(contents)
    reader = CSVKitReader(f, dialect=dialect, **kwargs)

    headers = reader.next()

    if column_ids:
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for c in headers]

    for row in reader:
        for i, d in enumerate(row):
            try:
                data_columns[i].append(row[column_ids[i]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

    columns = []

    for i, c in enumerate(data_columns):
        columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls))

    return Table(columns, name=name)
def load(self):
    '''
    Loads the cleaned up csv files into the database
    Checks record count against csv line count
    '''
    ## get a list of tables in the database
    c = connection.cursor()
    c.execute('SHOW TABLES')
    table_list = [t[0] for t in c.fetchall()]

    ### build a dictionary of tables and the paths to the csvs for loading
    table_dict = {}
    for name in os.listdir(self.csv_dir):
        csv_path = os.path.join(self.csv_dir, name)

        for table in table_list:
            if table == name.replace('.csv', '').upper():
                table_dict[name] = {'table_name': table, 'csv_path': csv_path}

    ## load up the data
    for csv_name, query_dict in table_dict.items():
        #print 'working on %s' % csv_name
        table_name = query_dict['table_name']
        csv_path = query_dict['csv_path']

        c.execute('DELETE FROM %s' % table_name)
        #print 'deleted records from %s' % table_name

        bulk_sql_load_part_1 = '''
            LOAD DATA LOCAL INFILE '%s'
            INTO TABLE %s
            FIELDS TERMINATED BY ','
            OPTIONALLY ENCLOSED BY '"'
            IGNORE 1 LINES
            (
        ''' % (csv_path, table_name)

        infile = open(csv_path)
        csv_reader = CSVKitReader(infile)
        headers = csv_reader.next()
        infile.close()

        infile = open(csv_path)
        csv_record_cnt = len(infile.readlines()) - 1
        infile.close()

        sql_fields = ['`%s`' % h for h in headers]
        bulk_sql_load = bulk_sql_load_part_1 + ','.join(sql_fields) + ')'

        cnt = c.execute(bulk_sql_load)
        transaction.commit_unless_managed()

        # check load, make sure record count matches
        if cnt == csv_record_cnt:
            print "record counts match\t\t\t\t%s" % csv_name
        else:
            print 'table_cnt: %s\tcsv_lines: %s\t\t%s' % (cnt, csv_record_cnt, csv_name)
def __init__(self, schema):
    self.fields = []  # A list of FixedWidthFields

    schema_reader = CSVKitReader(schema)
    schema_decoder = SchemaDecoder(schema_reader.next())

    for row in schema_reader:
        self.fields.append(schema_decoder(row))
def infer_types(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)

    return zip(headers, [t.__name__ for t in normal_types])
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = rows.next() if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def test_no_header_row(self):
    # stack two CSV files
    args = [
        '--no-header-row', 'examples/no_header_row.csv',
        'examples/no_header_row2.csv'
    ]
    output_file = StringIO.StringIO()
    utility = CSVStack(args, output_file)
    utility.main()

    # verify the stacked file's contents
    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next()[0], 'column1')
    self.assertEqual(reader.next()[0], '1')
    self.assertEqual(reader.next()[0], '4')
def print_column_names(f, output, **reader_kwargs):
    """
    Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
    """
    rows = CSVKitReader(f, **reader_kwargs)
    column_names = rows.next()

    for i, c in enumerate(column_names):
        output.write('%3i: %s\n' % (i + 1, c))
def main(self):
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)
    cnames = reader.next()
    cids = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
    mods = {idx: self.args.expr for idx in cids}
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    reader = sed.CsvFilter(reader, mods, header=False)
    output.writerow(cnames)

    for row in reader:
        output.writerow(row)
def extract_column_names(path, dialect_parameters, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            headers = reader.next()
        except UnicodeDecodeError:
            raise DataSamplingError(_('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.') % encoding)

    return headers
def load_data(input_file):
    for file in glob.glob(input_file):
        print file

    open_file = open(input_file)
    grasp = CSVKitReader(open_file, encoding='utf-8', delimiter='\t')
    #grasp = csv.reader(open_file, delimiter="\t")
    grasp.next()  # skip header
    # bad_rows = []
    for row in grasp:
        #assert len(row) == VALID_COLUMN_NO
        try:
            one_snp_json = _map_line_to_json(row)
            #if one_snp_json:
            yield one_snp_json
        except:
            diff_rows = enumerate(row)
            wrong = [(i, row) for (i, row) in diff_rows]
            print wrong[-1]
    open_file.close()
def test_no_match(self):
    args = ['-c', '1', '-m', 'NO MATCH', 'examples/dummy.csv']
    output_file = StringIO.StringIO()
    utility = CSVGrep(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['a', 'b', 'c'])
def __init__(self, schema):
    self.fields = []  # A list of FixedWidthFields

    schema_reader = CSVKitReader(schema)
    schema_decoder = SchemaDecoder(schema_reader.next())

    for i, row in enumerate(schema_reader):
        try:
            self.fields.append(schema_decoder(row))
        except Exception, e:
            raise ValueError("Error reading schema at line %i: %s" % (i + 2, e))
def test_include_and_exclude(self):
    args = ['-c', '1,3', '-C', '3', 'examples/dummy.csv']
    output_file = StringIO.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['a'])
    self.assertEqual(reader.next(), ['1'])
def test_with_bzip2(self):
    args = ['-c', '1,3', 'examples/dummy.csv.bz2']
    output_file = StringIO.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['a', 'c'])
    self.assertEqual(reader.next(), ['1', '3'])
def test_no_header_row(self):
    args = ['-c', '2', '--no-header-row', 'examples/no_header_row.csv']
    output_file = StringIO.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['column2'])
    self.assertEqual(reader.next(), ['2'])
def test_string_match(self):
    args = ['-c', '1', '-m', 'ILLINOIS', 'examples/realdata/FY09_EDU_Recipients_by_State.csv']
    output_file = StringIO.StringIO()
    utility = CSVGrep(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['State Name', 'State Abbreviate', 'Code', 'Montgomery GI Bill-Active Duty', 'Montgomery GI Bill- Selective Reserve', 'Dependents\' Educational Assistance', 'Reserve Educational Assistance Program', 'Post-Vietnam Era Veteran\'s Educational Assistance Program', 'TOTAL', ''])
    self.assertEqual(reader.next(), ['ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19', '21,964', ''])
def test_include_and_exclude(self): args = ["-c", "1,3", "-C", "3", "examples/dummy.csv"] output_file = StringIO.StringIO() utility = CSVCut(args, output_file) utility.main() input_file = StringIO.StringIO(output_file.getvalue()) reader = CSVKitReader(input_file) self.assertEqual(reader.next(), ["a"]) self.assertEqual(reader.next(), ["1"])
def test_with_bzip2(self): args = ["-c", "1,3", "examples/dummy.csv.bz2"] output_file = StringIO.StringIO() utility = CSVCut(args, output_file) utility.main() input_file = StringIO.StringIO(output_file.getvalue()) reader = CSVKitReader(input_file) self.assertEqual(reader.next(), ["a", "c"]) self.assertEqual(reader.next(), ["1", "3"])
def test_simple(self):
    args = ['-c', '1,3', 'examples/dummy.csv']
    output_file = StringIO.StringIO()
    utility = CSVCut(args, output_file)
    utility.main()

    input_file = StringIO.StringIO(output_file.getvalue())
    reader = CSVKitReader(input_file)

    self.assertEqual(reader.next(), ['a', 'c'])
    self.assertEqual(reader.next(), ['1', '3'])
def test_no_header_row(self): args = ["-c", "2", "--no-header-row", "examples/no_header_row.csv"] output_file = StringIO.StringIO() utility = CSVCut(args, output_file) utility.main() input_file = StringIO.StringIO(output_file.getvalue()) reader = CSVKitReader(input_file) self.assertEqual(reader.next(), ["column2"]) self.assertEqual(reader.next(), ["2"])
def extract_column_names(path, dialect_parameters, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            headers = reader.next()
        except UnicodeDecodeError:
            raise DataSamplingError('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.' % encoding)

    return headers
def sample_data(f, sample_size=5):
    reader = CSVKitReader(f)
    headers = reader.next()

    samples = []

    for i, row in enumerate(islice(reader, sample_size), start=1):
        samples.append({
            'row': i,
            'data': row,
        })

    return samples
def infer_schema(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)
    type_names = [t.__name__ for t in normal_types]

    return [{
        'column': h,
        'simple_type': t,
        'meta_type': None,
        'indexed': False
    } for h, t in zip(headers, type_names)]
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a matching
    identifier to be parsed, type inferred, etc. However, their order/index
    property will reflect the original data (e.g. column 8 will still be
    "order" 7, even if it's the third column in the resulting Table).
    """
    # This bit of nonsense is to deal with "files" from stdin,
    # which are not seekable and thus must be buffered
    contents = f.read()

    if snifflimit:
        sample = contents[:snifflimit]
    else:
        sample = contents

    dialect = sniffer.sniff_dialect(sample)
    normal_type = kwargs.pop("normal_type", InvalidType)

    f = StringIO(contents)
    reader = CSVKitReader(f, dialect=dialect, **kwargs)

    headers = reader.next()

    if column_ids:
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for c in headers]

    for row in reader:
        for i, d in enumerate(row):
            try:
                data_columns[i].append(row[column_ids[i]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

    columns = []

    for i, c in enumerate(data_columns):
        columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, type_inference=type_inference, normal_type=normal_type))

    return Table(columns, name=name)
def print_column_names(self): """ Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout). """ f = self.args.file output = self.output_file try: zero_based = self.args.zero_based except: zero_based = False rows = CSVKitReader(f, **self.reader_kwargs) column_names = rows.next() for i, c in enumerate(column_names): if not zero_based: i += 1 output.write("%3i: %s\n" % (i, c))
def print_column_names(self): """ Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout). """ f = self.args.file output = self.output_file try: zero_based = self.args.zero_based except: zero_based = False rows = CSVKitReader(f, **self.reader_kwargs) column_names = rows.next() for i, c in enumerate(column_names): if not zero_based: i += 1 output.write('%3i: %s\n' % (i, c))
def guess_column_types(path, dialect, sample_size, encoding='utf-8'): """ Guess column types based on a sample of data. """ with open(path, 'r') as f: reader = CSVKitReader(f, encoding=encoding, **dialect) headers = reader.next() sample = islice(reader, sample_size) normal_types, normal_values = normalize_table(sample) type_names = [] for t in normal_types: # csvkit recognizes dates and times separately, but we lump them together if t in [datetime.date, datetime.time]: type_names.append('datetime') else: type_names.append(t.__name__) return type_names
def print_column_names(self): """ Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout). """ if self.args.no_header_row: raise RequiredHeaderError, 'You cannot use --no-header-row with the -n or --names options.' f = self.args.file output = self.output_file try: zero_based=self.args.zero_based except: zero_based=False rows = CSVKitReader(f, **self.reader_kwargs) column_names = rows.next() for i, c in enumerate(column_names): if not zero_based: i += 1 output.write('%3i: %s\n' % (i, c))
def main(self):
    if self.args.names_only:
        print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for i, row in enumerate(rows):
        self.input_line_number = i + 1

        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            if ''.join(out_row) == '':
                continue

        output.writerow(out_row)
def guess_column_types(path, dialect, sample_size, encoding='utf-8'): """ Guess column types based on a sample of data. """ with open(path, 'r') as f: reader = CSVKitReader(f, encoding=encoding, **dialect) headers = reader.next() sample = islice(reader, sample_size) normal_types, normal_values = normalize_table(sample) type_names = [] for t in normal_types: if t is NoneType: type_names.append(None) else: type_names.append(t.__name__) # If a final column had no values csvkit will have dropped it while len(type_names) < len(headers): type_names.append(None) return type_names
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = range(len(row)) data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = rows.next() if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append( Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    task_status.begin('Preparing to import')

    line_count = self._count_lines(upload.get_path())

    if self.is_aborted():
        task_status.abort('Aborted during preparation')
        log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
        return

    f = open(upload.get_path(), 'r')

    reader = CSVKitReader(f, encoding=upload.encoding, **upload.dialect_as_parameters())
    reader.next()

    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    i = 0

    while True:
        # The row number which is about to be read, for error handling and indexing
        i += 1

        try:
            row = reader.next()
        except StopIteration:
            i -= 1
            break
        except UnicodeDecodeError:
            raise DataImportError('This CSV file contains characters that are not %s encoded in or after row %i. You need to re-upload this file and input the correct encoding in order to import data from this file.' % (upload.encoding, i))

        external_id = None

        if external_id_field_index is not None:
            external_id = row[external_id_field_index]

        data = utils.solr.make_data_row(dataset, row, data_upload=upload, external_id=external_id)
        data = data_typer(data, row)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update('%.0f%% complete (estimated)' % floor(float(i) / float(line_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after importing %.0f%% (estimated)' % floor(float(i) / float(line_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    f.close()

    task_status.update('100% complete')

    # Refresh dataset from the database so there is no chance of clobbering changes made since the task started
    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.column_schema = data_typer.schema

    dataset.save()

    # Refresh the upload and mark it as imported
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)

    return data_typer
    'description': 'The crowdsourced jobs list that powers http://www.newsnerdjobs.com/.'
}

response = panda_put(PANDA_DATASET_URL, json.dumps(dataset), params={'columns': ','.join(COLUMNS)})

# Open connection to Google
response = requests.get('https://docs.google.com/spreadsheet/pub?key=%s&single=true&gid=4&output=csv' % SPREADSHEET_ID)
csv = StringIO(response.content)

reader = CSVKitReader(csv)
reader.next()

put_data = {'objects': []}

# Delete existing data in panda
response = panda_delete(PANDA_DATA_URL)

for i, row in enumerate(reader):
    put_data['objects'].append({'data': row})

    if i and i % PANDA_BULK_UPDATE_SIZE == 0:
        print 'Updating %i rows...' % PANDA_BULK_UPDATE_SIZE
        panda_put(PANDA_DATA_URL, json.dumps(put_data))
        put_data['objects'] = []
def main(self): """ Convert CSV to JSON. """ if self.args.lat and not self.args.lon: self.argparser.error( '--lon is required whenever --lat is specified.') if self.args.lon and not self.args.lat: self.argparser.error( '--lat is required whenever --lon is specified.') if self.args.crs and not self.args.lat: self.argparser.error( '--crs is only allowed when --lat and --lon are also specified.' ) rows = CSVKitReader(self.args.file, **self.reader_kwargs) column_names = rows.next() stream = codecs.getwriter('utf-8')(self.output_file) # GeoJSON if self.args.lat and self.args.lon: features = [] min_lon = None min_lat = None max_lon = None max_lat = None lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based) lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based) if self.args.key: id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based) else: id_column = None for row in rows: feature = {'type': 'Feature'} properties = {} geoid = None lat = None lon = None for i, c in enumerate(row): if i == lat_column: lat = float(c) if min_lat is None or lat < min_lat: min_lat = lat if max_lat is None or lat > max_lat: max_lat = lat elif i == lon_column: lon = float(c) if min_lon is None or lon < min_lon: min_lon = lon if max_lon is None or lon > max_lon: max_lon = lon elif id_column is not None and i == id_column: geoid = c else: properties[column_names[i]] = c if id_column is not None: feature['id'] = geoid feature['geometry'] = { 'type': 'Point', 'coordinates': [lon, lat] } feature['properties'] = properties features.append(feature) output = { 'type': 'FeatureCollection', 'bbox': [min_lon, min_lat, max_lon, max_lat], 'features': features } if self.args.crs: output['crs'] = { 'type': 'name', 'properties': { 'name': self.args.crs } } # Keyed JSON elif self.args.key: output = {} for row in rows: row_dict = dict(zip(column_names, row)) k = row_dict[self.args.key] if k in output: raise NonUniqueKeyColumnException( 'Value %s is not unique in the key column.' % unicode(k)) output[k] = row_dict # Boring JSON else: output = [dict(zip(column_names, row)) for row in rows] json.dump(output, stream, ensure_ascii=False, indent=self.args.indent, encoding='utf-8')