def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Writes one csv file per worksheet, named "<input-basename>_<sheetname>.csv".
    If *output* is a file-like object, rows are streamed to it instead and no
    per-sheet files are created.

    Note: Unlike other convertor's, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    book = load_workbook(f, use_iterators=True, data_only=True)
    sheets = book.get_sheet_names()
    fname = os.path.splitext(f.name)[0]

    for name in sheets:
        sheet = book.get_sheet_by_name(name)

        if not streaming:
            output = six.StringIO()

        writer = CSVKitWriter(output)

        for i, row in enumerate(sheet.iter_rows()):
            if i == 0:
                # First row is the header; emit cell values as-is.
                writer.writerow([c.value for c in row])
                continue

            out_row = []

            for c in row:
                value = c.value

                if value.__class__ is datetime.datetime:
                    # Handle default XLSX date as 00:00 time
                    if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                        value = value.time()
                        value = normalize_datetime(value)
                    elif value.time() == NULL_TIME:
                        value = value.date()
                    else:
                        value = normalize_datetime(value)
                elif value.__class__ is float:
                    # Collapse whole-number floats to ints.
                    if value % 1 == 0:
                        value = int(value)

                # Serialize any remaining date/time as ISO-8601 text.
                if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                    value = value.isoformat()

                out_row.append(value)

            writer.writerow(out_row)

        if not streaming:
            # BUG FIX: the original opened the per-sheet file before this
            # check, so in streaming mode it leaked a handle and left empty
            # files behind.  Open it only when there is data to flush, and
            # let the context manager close it.
            outputfname = fname + "_" + name.replace(" ", "") + ".csv"
            with open(outputfname, 'w') as of:
                of.write(output.getvalue())

    # Return empty string when streaming
    return ''
def main(self):
    """
    Apply a Python expression (--map-expr) to the selected columns of every
    row; all columns are written through, with mapped values substituted.
    """
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    # Every column is emitted; only those selected by -c are transformed.
    all_column_ids = parse_column_identifiers(None, column_names, self.args.zero_based, self.args.not_columns)
    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in all_column_ids])

    d = {}  # namespace dict for map_expr

    # SECURITY NOTE: the user-supplied expression is exec'd verbatim
    # (Python 2 exec statement); never feed it untrusted input.
    exec "def f(x): return %s" % (self.args.map_expr) in d

    for row in rows:
        out_row = []

        for c in all_column_ids:
            if c in column_ids:
                # Mapped column; short rows are padded with None.
                out_row.append(d['f'](row[c]) if c < len(row) else None)
            else:
                out_row.append(row[c] if c < len(row) else None)

        output.writerow(out_row)
def log_errors(self, rows):
    """
    Log any errors to a csv file
    """
    # Make sure the log directory exists
    os.path.exists(self.log_dir) or os.makedirs(self.log_dir)

    # The error log sits in the log directory, named after the source file
    error_file_name = self.file_name.lower().replace("tsv", "errors.csv")
    log_path = os.path.join(self.log_dir, error_file_name)

    log_file = open(log_path, 'w')
    log_writer = CSVKitWriter(log_file, quoting=csv.QUOTE_ALL)

    # Header row first ...
    header = ['Line number', 'Headers len', 'Fields len', 'Line value']
    log_writer.writerow(header)

    # ... then every bad row we were handed
    log_writer.writerows(rows)

    # Shut it down
    log_file.close()
def main(self):
    """
    Execute a SQL query against the given connection string and write the
    result set to the output as CSV.
    """
    try:
        engine, metadata = sql.get_connection(self.args.connection_string)
    except ImportError:
        raise ImportError(
            'You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n'
        )

    conn = engine.connect()

    # The query comes from --query, or is read whole from the input file.
    if self.args.query:
        query = self.args.query.strip()
    else:
        query = ""

        for line in self.args.file:
            query += line

    rows = conn.execute(query)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    if not self.args.no_header_row:
        # NOTE(review): relies on SQLAlchemy's private ResultProxy internals
        # (_metadata.keys) for the header row — fragile across versions.
        output.writerow(rows._metadata.keys)

    for row in rows:
        output.writerow(row)

    conn.close()
def main(self):
    """Sort the input CSV on the chosen columns and write the result."""
    if self.args.names_only:
        self.print_column_names()
        return

    # Derive a table name from the file name when reading a real file
    if self.input_file.name != '<stdin>':
        base = os.path.split(self.input_file.name)[1]
        table_name = os.path.splitext(base)[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(
        self.input_file,
        name=table_name,
        snifflimit=self.args.snifflimit,
        no_header_row=self.args.no_header_row,
        infer_types=(not self.args.no_inference),
        **self.reader_kwargs
    )

    column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)
    rows = tab.to_rows(serialize_dates=True)

    def sort_key(r):
        # Substitute '' for None so missing values are orderable
        return [r[c] if r[c] is not None else '' for c in column_ids]

    rows.sort(key=sort_key, reverse=self.args.reverse)

    # Header goes back on top before writing
    rows.insert(0, tab.headers())

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def main(self):
    """
    Strip leading and trailing whitespace from the selected columns of
    every row, writing the selected columns to the output.
    """
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        # str.strip() removes exactly the leading/trailing whitespace the
        # original pair of re.sub('^\\s+'/'\\s+$') calls did, without the
        # per-cell regex overhead (and without the non-raw '\s' escape).
        out_row = [row[c].strip() if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def handle(self, *args, **options):
    """
    Dump a de-duplicated list of candidates, joined with their office and
    filer records, to ./candidates.csv.
    """
    self.cursor = connection.cursor()

    # Table names are interpolated from Django model metadata (not user
    # input), so the %-formatting here is not an injection vector.
    sql = """
        SELECT DISTINCT o.name, o.seat, f.filer_id_raw, f.xref_filer_id, f.name, f.party
        FROM %(candidate)s as c
        INNER JOIN %(office)s as o
        ON c.office_id = o.id
        INNER JOIN %(filer)s as f
        ON c.filer_id = f.id
    """ % dict(
        candidate=models.Candidate._meta.db_table,
        office=models.Office._meta.db_table,
        filer=models.Filer._meta.db_table,
    )
    self.cursor.execute(sql)

    # NOTE(review): the file handle passed to the writer is never closed
    # explicitly — relies on interpreter cleanup.
    writer = CSVKitWriter(open("./candidates.csv", 'wb'))

    # Header row matching the SELECT column order
    writer.writerow([
        'office_name', 'office_seat', 'filer_id', 'xref_filer_id', 'name', 'party'
    ])
    writer.writerows(self.cursor.fetchall())
def main(self):
    """
    Write the selected columns of every row to the output, optionally
    skipping rows whose selected fields are all empty (--delete-empty).
    """
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        # Short rows are padded with None for the missing columns.
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # BUG FIX: ''.join(out_row) raised TypeError whenever a short
            # row produced a None pad; any() treats '' and None alike.
            if not any(out_row):
                continue

        output.writerow(out_row)
def main(self):
    """
    Group rows on the -c columns and apply the requested aggregate
    functions, writing one output row per group.
    """
    if self.args.names_only:
        self.print_column_names()
        return

    #Read in header and rows
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = reader.next()

    if self.args.columns is None:
        grouped_columns_ids = []
    else:
        grouped_columns_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    aggregations = []

    try:
        # Python 2 tuple-unpacking lambda: each aggregation is a
        # (function-name, column-spec) pair; resolve the spec into column
        # ids, then instantiate one aggregator per column.
        for (fun, cols) in map(lambda (f, cols): (f, parse_column_identifiers(cols, column_names, self.args.zero_based)), self.args.aggregations):
            for col in cols:
                aggregations.append(aggregate_functions[fun](col))
    except KeyError:
        # Unknown aggregator name
        self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))

    #Determine columns to group by, default to all columns

    #Write the output
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in group_rows(column_names, reader, grouped_columns_ids, aggregations):
        output.writerow(row)
def main(self):
    """
    Stack two or more CSV files that share a header, optionally prefixing
    each row with a grouping value (filename or user-supplied).
    """
    if len(self.args.files) < 2:
        # Consistent with the other subcommands: report usage errors via
        # argparse instead of a bare sys.exit().
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.args.files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.args.files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.args.files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        headers = rows.next()

        # Only the first file's header is written out.
        if i == 0:
            if groups:
                headers.insert(0, group_name)

            output.writerow(headers)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)
def main(self):
    """
    Write the selected columns of every row to the output, optionally
    skipping rows whose selected fields are all empty (--delete-empty).
    """
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        # Short rows are padded with None for the missing columns.
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # BUG FIX: ''.join(out_row) raised TypeError whenever a short
            # row produced a None pad; any() treats '' and None alike.
            if not any(out_row):
                continue

        output.writerow(out_row)
def main(self):
    """
    Execute a SQL query against the given connection string and write the
    result set to the output as CSV.
    """
    try:
        engine, metadata = sql.get_connection(self.args.connection_string)
    except ImportError:
        raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

    conn = engine.connect()

    # The query comes from --query, or is read whole from the input file.
    if self.args.query:
        query = self.args.query.strip()
    else:
        query = ""

        for line in self.args.file:
            query += line

    rows = conn.execute(query)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    if not self.args.no_header_row:
        # NOTE(review): relies on SQLAlchemy's private ResultProxy internals
        # (_metadata.keys) for the header row — fragile across versions.
        output.writerow(rows._metadata.keys)

    for row in rows:
        output.writerow(row)

    conn.close()
def main(self):
    """
    Sort the input CSV on the selected columns and write the sorted rows
    (header first) to the output.
    """
    if self.args.names_only:
        self.print_column_names()
        return

    if self.input_file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(
            os.path.split(self.input_file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(self.input_file,
                               name=table_name,
                               snifflimit=self.args.snifflimit,
                               no_header_row=self.args.no_header_row,
                               infer_types=(not self.args.no_inference),
                               **self.reader_kwargs)
    column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)
    rows = tab.to_rows(serialize_dates=True)

    # Substitute '' for None in the key so missing values are orderable.
    sorter = lambda r: [
        r[c] if r[c] is not None else '' for c in column_ids
    ]
    rows.sort(key=sorter, reverse=self.args.reverse)

    # Put the header back on top before writing.
    rows.insert(0, tab.headers())

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def main(self):
    """
    Write only the rows whose selected columns match the given regex (-r),
    literal pattern (-m), or any line of a match file (-f).
    """
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.regex and not self.args.pattern and not self.args.matchfile:
        self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        # PERF FIX: use a set for O(1) membership (the callable runs once
        # per tested cell); the original list made every test a linear
        # scan.  Matches the sibling implementation of this command.
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    # Same matcher for every selected column.
    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for i, row in enumerate(filter_reader):
        output.writerow(row)
def main(self):
    """
    Drop duplicate rows, keyed on the selected uniqueness column(s); the
    first occurrence of each key wins.
    """
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    uniq_column_id = parse_column_identifiers(self.args.uniq_column, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    seen = set()  # cache of key tuples already written

    for row in rows:
        # Tuples are hashable, so the uniqueness key can live in a set;
        # compute it once per row (the original computed it twice).
        key = tuple(row[i] for i in uniq_column_id)

        if key in seen:
            continue

        # set.add() replaces the original's roundabout set.update([key]).
        seen.add(key)

        out_row = [row[c] if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def log_errors(self, rows): """ Log any errors to a csv file """ # Make sure the log directory exists os.path.exists(self.log_dir) or os.makedirs(self.log_dir) # Log writer log_path = os.path.join( self.log_dir, self.file_name.lower().replace("tsv", "errors.csv") ) log_file = open(log_path, 'w') log_writer = CSVKitWriter(log_file, quoting=csv.QUOTE_ALL) # Add the headers log_writer.writerow([ 'Line number', 'Headers len', 'Fields len', 'Line value' ]) # Log out the rows log_writer.writerows(rows) # Shut it down log_file.close()
def main(self):
    """
    Write only the rows whose selected columns match the given regex (-r),
    literal pattern (-m), or any line of a match file (-f).
    """
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.regex and not self.args.pattern and not self.args.matchfile:
        self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        # A set gives O(1) membership per tested cell.
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    # Same matcher for every selected column.
    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for i, row in enumerate(filter_reader):
        output.writerow(row)
def main(self):
    """Write only the rows whose chosen columns match the requested pattern."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    reader = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = next(reader)
    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    # Build the matcher: compiled regex, membership test, or literal string
    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        match_values = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda value: value in match_values
    else:
        pattern = self.args.pattern

    # Every selected column is tested with the same matcher
    patterns = {column_id: pattern for column_id in column_ids}

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filtered = FilteringCSVReader(reader, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filtered:
        output.writerow(row)
def main(self):
    """
    Sort the input CSV on the selected columns and write the sorted rows
    (header first) to the output.
    """
    if self.args.names_only:
        print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
        return

    if self.args.file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(
            os.path.split(self.args.file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(self.args.file,
                               name=table_name,
                               snifflimit=self.args.snifflimit,
                               **self.reader_kwargs)
    column_ids = parse_column_identifiers(self.args.columns, tab.headers())
    rows = tab.to_rows(serialize_dates=True)

    # ROBUSTNESS FIX: substitute '' for None in the sort key — None is not
    # orderable against strings, and the other variants of this command
    # already do the same substitution.
    rows.sort(key=lambda r: [r[c] if r[c] is not None else '' for c in column_ids],
              reverse=self.args.reverse)

    rows.insert(0, tab.headers())

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    # When an output stream is supplied, write to it directly and return '';
    # otherwise buffer into a StringIO and return the csv text.
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    # Convert the named sheet if given, else the workbook's active sheet.
    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            # Header row: emit raw cell values.
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(
                        1904, 1, 1) and not has_date_elements(c):
                    # Epoch date with no date formatting: a bare time value.
                    value = value.time()

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    # Midnight with no time part: a bare date value.
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                # Collapse whole-number floats to ints.
                if value % 1 == 0:
                    value = int(value)

            # Serialize any remaining date/time as ISO-8601 text.
            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
def main(self):
    """
    Stack two or more CSV files that share a header, optionally prefixing
    each row with a grouping value (filename or user-supplied).
    """
    if len(self.args.files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.args.files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.args.files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.args.files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        headers = rows.next()

        # Only the first file's header is written out.
        if i == 0:
            if groups:
                headers.insert(0, group_name)

            output.writerow(headers)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)
def geojson2csv(f, key=None, **kwargs):
    """
    Convert a GeoJSON document into CSV format.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    # Validate that this really is a GeoJSON FeatureCollection
    if not isinstance(js, dict):
        raise TypeError('JSON document is not valid GeoJSON: Root element is not an object.')

    if 'type' not in js:
        raise TypeError('JSON document is not valid GeoJSON: No top-level "type" key.')

    if js['type'] != 'FeatureCollection':
        raise TypeError('Only GeoJSON with root FeatureCollection type is supported. Not %s' % js['type'])

    if 'features' not in js:
        raise TypeError('JSON document is not a valid FeatureCollection: No top-level "features" key.')

    property_fields = []
    features_parsed = []  # tuples in the format (id, properties, geometry)

    # First pass: collect every property name (in first-seen order) and
    # flatten each feature into an (id, properties, geometry) tuple
    for feature in js['features']:
        feature_id = feature.get('id', None)
        properties = feature.get('properties') or {}

        for name in properties.keys():
            if name not in property_fields:
                property_fields.append(name)

        geometry = json.dumps(feature['geometry'])
        features_parsed.append((feature_id, properties, geometry))

    header = ['id'] + property_fields + ['geojson']

    buf = six.StringIO()
    writer = CSVKitWriter(buf)
    writer.writerow(header)

    # Second pass: one csv row per feature
    for feature_id, properties, geometry in features_parsed:
        row = [feature_id]

        for field in property_fields:
            row.append(properties.get(field, None))

        row.append(geometry)
        writer.writerow(row)

    output = buf.getvalue()
    buf.close()

    return output
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    # Write to the supplied stream if given, else buffer and return the text.
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    # Convert the named sheet if given, else the workbook's active sheet.
    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            # Header row: emit raw cell values.
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    # Epoch date with no date formatting: a bare time value.
                    value = value.time()

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    # Midnight with no time part: a bare date value.
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                # Collapse whole-number floats to ints.
                if value % 1 == 0:
                    value = int(value)

            # Serialize any remaining date/time as ISO-8601 text.
            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
def main(self):
    """Apply the sed expression to the chosen columns of every row."""
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = reader.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
    # Same expression applied to every selected column index
    modifications = dict((column_id, self.args.expr) for column_id in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    for row in sed.CsvFilter(reader, modifications, header=False):
        output.writerow(row)
def main(self):
    """
    Apply the sed expression (--expr) to the selected columns of every row.
    """
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)
    cnames = reader.next()

    cids = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
    # Same expression applied to every selected column index.
    mods = {idx: self.args.expr for idx in cids}

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # Wrap the reader so each row is rewritten before output.
    reader = sed.CsvFilter(reader, mods, header=False)
    output.writerow(cnames)

    for row in reader:
        output.writerow(row)
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    # Write to the supplied stream if given, else buffer and return the text.
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True)
    sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            # Header row: raw cell values.
            writer.writerow([c.internal_value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                if value.time() != NULL_TIME:
                    value = normalize_datetime(value)
                else:
                    # Midnight means a bare date.
                    value = value.date()
            elif value.__class__ is float:
                # Collapse whole-number floats to ints.
                if value % 1 == 0:
                    value = int(value)

            # Serialize dates/times as ISO-8601 text.
            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            # NOTE(review): NUL cells are skipped entirely, which shifts
            # later cells left within the row instead of leaving a blank —
            # confirm this is intentional.
            if value == "\0":
                continue

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    js = json.loads(f.read())

    if isinstance(js, dict):
        if not key:
            raise TypeError(
                'When converting a JSON document with a top-level dictionary element, a key must be specified.'
            )

        js = js[key]

    if not isinstance(js, list):
        raise TypeError(
            'Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).'
        )

    # Flatten every object, then use the union of their keys as the header
    flat = [parse_object(obj) for obj in js]

    field_set = set()

    for obj in flat:
        field_set.update(obj.keys())

    fields = sorted(list(field_set))

    buf = six.StringIO()
    writer = CSVKitWriter(buf)
    writer.writerow(fields)

    # One row per flattened object; missing fields become None
    for obj in flat:
        writer.writerow([obj[field] if field in obj else None for field in fields])

    output = buf.getvalue()
    buf.close()

    return output
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    # Write to the supplied stream if given, else buffer and return the text.
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True)
    sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            # Header row: raw cell values.
            writer.writerow([c.internal_value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                if value.time() != NULL_TIME:
                    value = normalize_datetime(value)
                else:
                    # Midnight means a bare date.
                    value = value.date()
            elif value.__class__ is float:
                # Collapse whole-number floats to ints.
                if value % 1 == 0:
                    value = int(value)

            # Serialize dates/times as ISO-8601 text.
            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            # NOTE(review): NUL cells are skipped entirely, which shifts
            # later cells left within the row instead of leaving a blank —
            # confirm this is intentional.
            if value == "\0":
                continue

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ""
def main(self):
    """Run the user-supplied -s scripts over every row of the input."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.script:
        self.argparser.error("At least one script -s must be defined.")

    source_rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # The reader wrapper applies each script to the rows as they stream by
    scripted = ScriptCSVReader(source_rows, scripts=self.args.script, zero_based=self.args.zero_based)

    for row in scripted:
        output.writerow(row)
def main(self):
    """
    Stack two or more CSV files; with --no-header-row, synthetic column
    names are generated from the width of each file's first row.
    """
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = CSVKitReader(f, **self.reader_kwargs)

        # If we have header rows, use them; otherwise synthesize headers
        # from the first data row (which is real data and is written below).
        if not self.args.no_header_row:
            headers = next(rows, [])
            row = None
        else:
            row = next(rows, [])
            headers = make_default_headers(len(row))

        # Header emission hoisted out of the two branches above (it was
        # duplicated verbatim in the original, alongside a dead
        # commented-out line, both removed here).
        if i == 0:
            if groups:
                headers.insert(0, group_name)

            output.writerow(headers)

        if row is not None:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        f.close()
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    document = f.read()
    js = json.loads(document)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    field_set = set()
    flat = []

    # Flatten every object, then take the union of their keys as the header.
    for obj in js:
        flat.append(parse_object(obj))

    for obj in flat:
        field_set.update(obj.keys())

    fields = sorted(list(field_set))

    o = six.StringIO()
    # BUG FIX: the original called CSVKitWriter(o, *kwargs), which unpacked
    # the kwargs dict's *keys* as positional arguments; they must be passed
    # through as keyword arguments.
    writer = CSVKitWriter(o, **kwargs)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            if field in i:
                row.append(i[field])
            else:
                row.append(None)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def main(self):
    """
    Stack two or more CSV files; with --no-header-row, synthetic column
    names are generated from the width of each file's first row.
    """
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = CSVKitReader(f, **self.reader_kwargs)

        # If we have header rows, use them
        if not self.args.no_header_row:
            headers = next(rows, [])

            # Only the first file's header is emitted.
            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

        # If we don't generate simple column names based on first row
        else:
            row = next(rows, [])

            headers = make_default_headers(len(row))

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

            # The probed first row is real data; write it out too.
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        f.close()
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    # object_pairs_hook preserves each object's key order for the header.
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError(
                'When converting a JSON document with a top-level dictionary element, a key must be specified.'
            )

        js = js[key]

    if not isinstance(js, list):
        raise TypeError(
            'Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).'
        )

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    # Header: union of raw top-level keys, in first-seen order.
    # NOTE(review): keys are collected from the *unparsed* objects while
    # values are read from parse_object()'s output — if parse_object
    # renames or flattens nested keys, those columns come out empty;
    # confirm the two key spaces agree.  (The loop variable also shadows
    # the ``key`` parameter, harmlessly, since it is not used again.)
    for obj in js:
        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def main(self):
    """
    Validate the input CSV.  With --dry-run only report problems; otherwise
    write cleaned rows to <base>_out.csv and any bad rows to <base>_err.csv.
    """
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        # Drain the reader so the checker accumulates errors/joins.
        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write("Line %i: %s\n" % (e.line_number, e.msg))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write(
                "%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins)
            )
    else:
        base, ext = splitext(self.args.file.name)

        with open("%s_out.csv" % base, "w") as f:
            clean_writer = CSVKitWriter(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = "%s_err.csv" % base

            with open(error_filename, "w") as f:
                error_writer = CSVKitWriter(f, **self.writer_kwargs)

                # Error rows are prefixed with the line number and message.
                error_header = ["line_number", "msg"]
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write(
                "%i error%s logged to %s\n" % (error_count, "" if error_count == 1 else "s", error_filename)
            )
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write(
                "%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins)
            )
def main(self):
    """
    Validate the input CSV.  With --dry-run only report problems; otherwise
    write cleaned rows to <base>_out.csv and any bad rows to <base>_err.csv.
    """
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        # Drain the reader so the checker accumulates errors/joins.
        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write(
                '%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.input_file.name)

        with open('%s_out.csv' % base, 'w') as f:
            clean_writer = CSVKitWriter(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = '%s_err.csv' % base

            with open(error_filename, 'w') as f:
                error_writer = CSVKitWriter(f, **self.writer_kwargs)

                # Error rows are prefixed with the line number and message.
                error_header = ['line_number', 'msg']
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write(
                '%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write(
                '%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
def writeCsv(self, filename=None):
    """
    Write the rows of the last query to a CSV file.

    :param filename: destination path; defaults to '<self.name>.csv'.
    """
    rows = self.getRows()

    # Fix: compare to None by identity, not equality (was `filename == None`).
    if filename is None:
        filename = self.name + '.csv'

    # Output result of last query as CSV
    row_count = 0

    with open(filename, 'wb') as out:
        output = CSVKitWriter(out)
        output.writerow(rows._metadata.keys)

        for row in rows:
            output.writerow(row)
            row_count += 1

    tool.VERBOSE('wrote {} row(s) to csv {}', row_count, filename)
def sheet2csv(sheet, output=None):
    """Serialize one .xlsx worksheet to CSV.

    When ``output`` is given, rows are streamed to it and '' is returned;
    otherwise the CSV text is accumulated in memory and returned.
    """
    streaming = bool(output)

    if not streaming:
        output = six.StringIO()

    writer = CSVKitWriter(output)

    for index, row in enumerate(sheet.iter_rows()):
        # First row is the header; pass cell values through untouched.
        if index == 0:
            writer.writerow([cell.value for cell in row])
            continue

        out_row = []

        for cell in row:
            value = cell.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(cell):
                    value = normalize_datetime(value.time())
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                # Collapse whole-number floats to ints.
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if streaming:
        # Return empty string when streaming
        return ''

    return output.getvalue()
def main(self):
    """Load the input CSV into an in-memory SQLite table, execute the user's
    query against it, and write the result set as CSV."""
    base_name = os.path.splitext(os.path.basename(self.args.file._lazy_args[0]))[0]
    csv_table = table.Table.from_csv(self.args.file, name=base_name, **self.reader_kwargs)

    # Build a throwaway in-memory database holding the CSV contents.
    create_stmt = make_create_table_statement(make_table(csv_table), dialect='sqlite')
    conn = sqlite3.connect(':memory:')
    cursor = conn.cursor()
    cursor.execute(create_stmt)

    for row in csv_table.to_rows():
        placeholders = ','.join(['?'] * len(row))
        cursor.execute("INSERT INTO %s VALUES(%s)" % (csv_table.name, placeholders), row)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for result_row in cursor.execute(self.args.query):
        output.writerow(result_row)
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the dictionary
    which contains a list.
    """
    document = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(document, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        document = document[key]

    if not isinstance(document, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    # Flatten each object up front; header order is first-seen key order.
    flat = [parse_object(obj) for obj in document]

    fields = []
    for obj in document:
        for name in obj.keys():
            if name not in fields:
                fields.append(name)

    buf = six.StringIO()
    writer = CSVKitWriter(buf)
    writer.writerow(fields)

    for item in flat:
        writer.writerow([item.get(name, None) for name in fields])

    result = buf.getvalue()
    buf.close()

    return result
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format. Supports both JSON and
    "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the dictionary
    which contains a list.
    """
    head = json.loads(f.readline(), object_pairs_hook=OrderedDict)
    tail = (json.loads(line, object_pairs_hook=OrderedDict) for line in f)
    records = itertools.chain((head, ), tail)

    fields = []
    flat = []

    for obj in records:
        flat.append(parse_object(obj))
        # Accumulate header names in first-seen order.
        for name in obj.keys():
            if name not in fields:
                fields.append(name)

    buf = six.StringIO()
    writer = CSVKitWriter(buf)
    writer.writerow(fields)

    for item in flat:
        writer.writerow([item.get(name, None) for name in fields])

    result = buf.getvalue()
    buf.close()

    return result
def json2csv(f, field=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the dictionary
    which contains a list.
    """
    document = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(document, dict):
        if not field:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        document = document[field]

    fields = []
    flats = []

    for obj in document:
        # parse_object yields (ordered field names, flattened key/value dict).
        flat_fields, flat_values = parse_object(obj)
        flats.append(flat_values)

        for name in flat_fields:
            if name not in fields:
                fields.append(name)

    buf = six.StringIO()
    writer = CSVKitWriter(buf)
    writer.writerow(fields)

    for item in flats:
        writer.writerow([item.get(name, None) for name in fields])

    result = buf.getvalue()
    buf.close()

    return result
def download_to_csv(event_id):
    """Build a CSV of registration answers for one event and return it as an
    HTTP response."""
    buf = StringIO()
    writer = CSVKitWriter(buf)

    event = Event.query.get(event_id)

    # Header row: one column per event question.
    writer.writerow([question.label for question in event.questions])

    # One row per registration, answers in stored order.
    for registration in event.registrations:
        writer.writerow([answer.value for answer in registration.answers])

    return Response(buf.getvalue(), mimetype='text/csv')
def main(self):
    """Write the selected columns of the input CSV, optionally dropping rows
    whose selected cells are all empty (--delete-empty)."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    # next() builtin instead of the Python 2-only .next() method.
    column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for i, row in enumerate(rows):
        # Short rows are padded with None for missing columns.
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # Fix: ''.join(out_row) raised TypeError whenever a short row put
            # None in out_row; treat None the same as an empty cell.
            if all(v is None or v == '' for v in out_row):
                continue

        output.writerow(out_row)
def main(self):
    """Split the input CSV into numbered <input>.part.N files of --lines rows
    each, repeating the header (from --columns) at the top of every part."""
    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    # NOTE(review): the user-supplied --columns list unconditionally replaces
    # the header derived above — confirm that is intended.
    column_names = self.args.columns.split(',')

    part_count = 0

    # Fix: keep the underlying file object so each part can be closed
    # explicitly. The original used `del output`, which relies on GC to flush
    # and close the handle, and never closed the final part at all.
    part_file = open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w')
    output = CSVKitWriter(part_file, **self.writer_kwargs)
    output.writerow(column_names)

    count = 0

    for row in rows:
        if (self.args.lines > 0) and (count == self.args.lines):
            part_count += 1
            count = 0

            part_file.close()
            part_file = open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w')
            output = CSVKitWriter(part_file, **self.writer_kwargs)
            output.writerow(column_names)

        output.writerow(row)
        count += 1

    part_file.close()
def run(*args):
    """Scrape an eBird regional checklist from Avibase and write the species
    names to species_names_<language>.csv.

    args[0] is the region/list name, args[1] a language code looked up in
    ``synonyms``.
    """
    avibase_index = 'http://avibase.bsc-eoc.org/checklist.jsp?list=eBird'
    list_name = args[0]
    language = synonyms[args[1]]

    response = urllib2.urlopen(avibase_index)
    html = lxml.html.fromstring(response.read())

    names = []

    # Locate the checklist link for the requested region on the index page.
    for link in html.cssselect('.AVBregions td a'):
        if link.text == list_name:
            checklist_url = 'http://avibase.bsc-eoc.org/' + link.attrib['href']

    if language != 'EN':
        checklist_url += '&synlang=%s' % language

    response = urllib2.urlopen(checklist_url)
    html = lxml.html.fromstring(response.read())

    for row in html.cssselect('.AVBlist tr.highlight1'):
        cells = row.cssselect('td')
        scientific_name = cells[1].cssselect('i')[0].text

        if language == 'EN':
            common_name = cells[0].text
        else:
            common_name = cells[2].text

        if common_name:
            # Page bytes are UTF-8 mislabelled as latin-1; re-decode them.
            common_name = common_name.encode('latin_1').decode('utf-8')
        else:
            common_name = scientific_name

        names.append((scientific_name, common_name))

    suffix = language.lower()

    with open('species_names_%s.csv' % suffix, 'wb') as fp:
        writer = CSVKitWriter(fp)
        writer.writerow(('scientific_name', 'common_name_%s' % suffix))

        for name in names:
            writer.writerow(name)
def main(self):
    """Keep rows matching --filter-expr (or drop rows matching
    --not-filter-expr), where the expression sees each column name bound to
    the cell value (coerced to float when possible)."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        # Expose numeric-looking cells to the expression as floats.
        try:
            return float(x)
        except ValueError:
            return x

    def emit(row):
        # Pad short rows with None for missing columns.
        output.writerow([row[c] if c < len(row) else None for c in column_ids])

    # NOTE: the expression comes from the command-line user and is eval'd
    # deliberately; do not feed untrusted input here.
    if self.args.filter_expr:
        for row in rows:
            namespace = {name: float_or_else(value) for name, value in zip(column_names, row)}

            if eval(self.args.filter_expr, namespace):
                emit(row)
    elif self.args.not_filter_expr:
        for row in rows:
            namespace = {name: float_or_else(value) for name, value in zip(column_names, row)}

            if not eval(self.args.not_filter_expr, namespace):
                emit(row)
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format. Supports both JSON and
    "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the dictionary
    which contains a list.
    """
    first_record = json.loads(f.readline(), object_pairs_hook=OrderedDict)
    records = itertools.chain(
        (first_record, ),
        (json.loads(line, object_pairs_hook=OrderedDict) for line in f))

    fields = []
    flattened = []

    for record in records:
        flattened.append(parse_object(record))

        # Header columns accumulate in first-seen order across all records.
        for name in record.keys():
            if name not in fields:
                fields.append(name)

    buf = six.StringIO()
    writer = CSVKitWriter(buf)
    writer.writerow(fields)

    for item in flattened:
        writer.writerow([item.get(name, None) for name in fields])

    csv_text = buf.getvalue()
    buf.close()

    return csv_text
def main(self):
    """Write the input CSV, dropping rows whose --uniq-column values repeat a
    combination already seen."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    uniq_column_ids = parse_column_identifiers(self.args.uniq_column, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    seen = set()  # value tuples already emitted

    for row in rows:
        fingerprint = tuple(row[i] for i in uniq_column_ids)

        if fingerprint in seen:
            continue

        seen.add(fingerprint)
        output.writerow([row[c] if c < len(row) else None for c in column_ids])
def download_to_csv(event_id):
    """Downloads the registration data to a CSV"""
    buf = StringIO()
    writer = CSVKitWriter(buf)

    event = Event.query.get(event_id)

    # Header row: one column per question.
    writer.writerow([question.label for question in event.questions])

    # One row per registration; answers ordered by their question's ordinal
    # so values line up with the header columns.
    for registration in event.registrations:
        ordered = sorted(registration.answers, key=lambda a: a.question.ordinal)
        writer.writerow([answer.value for answer in ordered])

    return Response(buf.getvalue(), mimetype='text/csv')
def main(self):
    """Validate the input CSV: report problems on --dryrun, otherwise write a
    cleaned copy (<base>_out.csv) and, if needed, an error log
    (<base>_err.csv)."""
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errs:
            for e in checker.errs:
                self.output_file.write("Line %i: %s\n" % (e.line_number, e.msg))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write(
                "%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n"
                % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.args.file.name)

        # should we preserve delimiters and other dialect args from CLI?
        # Fix: the output files were opened inline and never closed; `with`
        # guarantees they are flushed and closed even on error.
        with open("%s_out.csv" % base, "w") as out_f:
            cleaned_file = CSVKitWriter(out_f, **self.writer_kwargs)
            checker = RowChecker(reader)
            cleaned_file.writerow(checker.column_names)

            for row in checker.checked_rows():
                cleaned_file.writerow(row)

        if checker.errs:
            # should we preserve delimiters and other dialect args from CLI?
            err_filename = "%s_err.csv" % base

            with open(err_filename, "w") as errs_f:
                err_file = CSVKitWriter(errs_f, **self.writer_kwargs)
                err_header = ['line_number', 'msg']
                err_header.extend(checker.column_names)
                err_file.writerow(err_header)

                for e in checker.errs:
                    err_file.writerow(self._format_error_row(e))

            err_count = len(checker.errs)
            self.output_file.write(
                "%i error%s logged to %s\n"
                % (err_count, "" if err_count == 1 else "s", err_filename))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write(
                "%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n"
                % (checker.rows_joined, checker.joins))
def main(self):
    """Sort the input CSV by the requested columns and write the result."""
    if self.args.names_only:
        print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
        return

    if self.args.file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(os.path.split(self.args.file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    csv_table = table.Table.from_csv(
        self.args.file,
        name=table_name,
        snifflimit=self.args.snifflimit,
        **self.reader_kwargs)

    column_ids = parse_column_identifiers(self.args.columns, csv_table.headers())

    # Sort on the selected columns, honouring --reverse.
    data_rows = csv_table.to_rows(serialize_dates=True)
    data_rows.sort(key=lambda r: [r[c] for c in column_ids], reverse=self.args.reverse)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(csv_table.headers())

    for data_row in data_rows:
        output.writerow(data_row)
def download_to_csv(event_id):
    """Downloads the registration data to a CSV"""
    out = StringIO()
    csv_writer = CSVKitWriter(out)

    event = Event.query.get(event_id)

    # Header: the label of every question on the event.
    header = []
    for question in event.questions:
        header.append(question.label)
    csv_writer.writerow(header)

    # Body: one line per registration, answers sorted into question order so
    # they match the header.
    for registration in event.registrations:
        in_order = sorted(registration.answers, key=lambda a: a.question.ordinal)
        csv_writer.writerow([a.value for a in in_order])

    return Response(out.getvalue(), mimetype='text/csv')
def main(self):
    """Write the selected columns of the input CSV; with --delete-empty, skip
    rows whose selected cells are all empty."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    # next() builtin instead of the Python 2-only .next() method.
    column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for i, row in enumerate(rows):
        # Short rows are padded with None for missing columns.
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # Fix: ''.join(out_row) raised TypeError whenever a short row put
            # None in out_row; treat None the same as an empty cell.
            if all(v is None or v == '' for v in out_row):
                continue

        output.writerow(out_row)
def main(self):
    """Filter rows by evaluating a user-supplied Python expression against
    each row: --filter-expr keeps matches, --not-filter-expr drops them."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        first = next(rows)
        column_names = make_default_headers(len(first))

        # Put the row back on top
        rows = itertools.chain([first], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        # The expression sees numeric-looking cells as floats.
        try:
            return float(x)
        except ValueError:
            return x

    expr = self.args.filter_expr
    negate = False

    if not expr and self.args.not_filter_expr:
        expr = self.args.not_filter_expr
        negate = True

    # NOTE: eval of the user's own command-line expression is intentional.
    if expr:
        for row in rows:
            env = {name: float_or_else(cell) for name, cell in zip(column_names, row)}
            keep = eval(expr, env)

            if negate:
                keep = not keep

            if keep:
                output.writerow([row[c] if c < len(row) else None for c in column_ids])
def clean(self, name):
    """
    Cleans the provided source TSV file and writes it out in CSV format.

    Rows whose field count cannot be reconciled with the header are skipped
    and logged via self.log_errors().
    """
    if self.verbosity > 2:
        self.log(" Cleaning %s" % name)

    # Up the CSV data limit
    csv.field_size_limit(1000000000)

    # Input and output paths
    tsv_path = os.path.join(self.tsv_dir, name)
    csv_path = os.path.join(self.csv_dir, name.lower().replace("tsv", "csv"))

    # Fix: the original opened both files by hand and leaked them on the
    # early `return`s below; context managers guarantee they are closed.
    with open(csv_path, 'w') as csv_file, open(tsv_path, 'rb') as tsv_file:
        # Writer
        csv_writer = CSVKitWriter(csv_file, quoting=csv.QUOTE_ALL)

        # Pull and clean the headers
        try:
            headers = tsv_file.readline()
        except StopIteration:
            return
        headers = headers.decode("ascii", "replace")
        headers_csv = CSVKitReader(StringIO(headers), delimiter=str('\t'))
        try:
            headers_list = next(headers_csv)
        except StopIteration:
            return
        headers_count = len(headers_list)
        csv_writer.writerow(headers_list)

        log_rows = []

        # Loop through the rest of the data
        line_number = 1
        for tsv_line in tsv_file:
            # Goofing around with the encoding while we're in there.
            tsv_line = tsv_line.decode("ascii", "replace")
            if six.PY2:
                tsv_line = tsv_line.replace('\ufffd', '?')

            # Nuke any null bytes
            if tsv_line.count('\x00'):
                tsv_line = tsv_line.replace('\x00', ' ')

            # Nuke ASCII 26 char, the "substitute character"
            # or chr(26) in python
            if tsv_line.count('\x1a'):
                tsv_line = tsv_line.replace('\x1a', '')

            # Split on tabs so we can later spit it back out as CSV
            # and remove extra newlines while we are there.
            csv_field_list = tsv_line.replace("\r\n", "").split("\t")

            # Check if our values line up with our headers
            # and if not, see if CSVkit can sort out the problems
            if not len(csv_field_list) == headers_count:
                csv_field_list = next(
                    CSVKitReader(StringIO(tsv_line), delimiter=str('\t')))

                if not len(csv_field_list) == headers_count:
                    if self.verbosity > 2:
                        msg = ' Bad parse of line %s (%s headers, %s values)'
                        self.failure(msg % (line_number, len(headers_list), len(csv_field_list)))
                    log_rows.append([
                        line_number,
                        len(headers_list),
                        len(csv_field_list),
                        ','.join(csv_field_list)
                    ])
                    continue

            # Write out the row
            csv_writer.writerow(csv_field_list)
            line_number += 1

        # Log errors if there are any
        if log_rows:
            if self.verbosity > 1:
                # Fix: log_rows holds one entry per error (no header row), so
                # the original `len(log_rows) - 1` under-reported by one.
                msg = ' %s errors'
                self.failure(msg % len(log_rows))
            self.log_errors(name, log_rows)
def main(self):
    """Generate SQL DDL from CSV input, and optionally create tables, insert
    rows, and run queries against a database given by --db.

    Without a connection string, CREATE TABLE statements are written to the
    output file. With one (or with --query, which implies an in-memory
    SQLite database), tables are created/loaded inside one transaction and
    the last query's result set is emitted as CSV.
    """
    connection_string = self.args.connection_string
    do_insert = self.args.insert
    query = self.args.query

    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if self.args.table_names:
        table_names = self.args.table_names.split(',')
    else:
        table_names = []

    # If one or more filenames are specified, we need to add stdin ourselves (if available)
    if sys.stdin not in self.input_files:
        try:
            if not sys.stdin.isatty():
                self.input_files.insert(0, sys.stdin)
        except:
            # isatty() can fail when stdin has been replaced; treat as "no stdin".
            pass

    # Create an SQLite database in memory if no connection string is specified
    if query and not connection_string:
        connection_string = "sqlite:///:memory:"
        do_insert = True

    if self.args.dialect and connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')

    if do_insert and not connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')

    if self.args.no_create and not do_insert:
        self.argparser.error('The --no-create option is only valid --insert is also specified.')

    # Establish database validity before reading CSV files
    if connection_string:
        try:
            engine, metadata = sql.get_connection(connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        conn = engine.connect()
        # All DDL/inserts below run inside one transaction, committed at the end.
        trans = conn.begin()

    for f in self.input_files:
        try:
            # Try to use name specified via --table
            table_name = table_names.pop(0)
        except IndexError:
            if f == sys.stdin:
                table_name = "stdin"
            else:
                # Use filename as table name
                table_name = os.path.splitext(os.path.split(f.name)[1])[0]

        csv_table = table.Table.from_csv(
            f,
            name=table_name,
            snifflimit=self.args.snifflimit,
            blanks_as_nulls=(not self.args.blanks),
            infer_types=(not self.args.no_inference),
            no_header_row=self.args.no_header_row,
            **self.reader_kwargs
        )

        f.close()

        if connection_string:
            sql_table = sql.make_table(
                csv_table,
                table_name,
                self.args.no_constraints,
                self.args.db_schema,
                metadata
            )

            # Create table
            if not self.args.no_create:
                sql_table.create()

            # Insert data
            if do_insert and csv_table.count_rows() > 0:
                insert = sql_table.insert()
                headers = csv_table.headers()
                conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()])

        # Output SQL statements
        else:
            sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
            self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect))

    if connection_string:
        if query:
            # Execute specified SQL queries
            queries = query.split(';')
            rows = None

            for q in queries:
                if q:
                    rows = conn.execute(q)

            # Output result of last query as CSV
            try:
                output = CSVKitWriter(self.output_file, **self.writer_kwargs)
                if not self.args.no_header_row:
                    output.writerow(rows._metadata.keys)
                for row in rows:
                    output.writerow(row)
            except AttributeError:
                # Statements with no result set (e.g. INSERT) lack _metadata;
                # emit nothing in that case.
                pass

        trans.commit()
        conn.close()
def run(self, query, task_status_id, filename=None, *args, **kwargs):
    """
    Execute export.

    Runs the search ``query`` against Solr, writes one CSV per matching
    dataset into a temp directory under EXPORT_ROOT, bundles them into a zip
    archive, and returns the zip's filename. Progress is reported through the
    TaskStatus row identified by ``task_status_id``.
    """
    from panda.models import Dataset, TaskStatus

    log = logging.getLogger(self.name)
    log.info('Beginning export, query: %s' % query)

    task_status = TaskStatus.objects.get(id=task_status_id)
    # NOTE(review): message says "import" in an export task — looks copy-pasted.
    task_status.begin('Preparing to import')

    if not filename:
        filename = 'search_export_%s' % (now().isoformat())

    zip_name = '%s.zip' % filename

    path = os.path.join(settings.EXPORT_ROOT, filename)
    zip_path = os.path.join(settings.EXPORT_ROOT, zip_name)

    try:
        os.makedirs(os.path.realpath(path))
    except:
        # Directory may already exist; best-effort creation.
        pass

    zipfile = ZipFile(zip_path, 'w')

    # First, a grouped query to learn which datasets have matches.
    response = solr.query_grouped(settings.SOLR_DATA_CORE, query, 'dataset_slug', offset=0, limit=1000, group_limit=0, group_offset=0)
    groups = response['grouped']['dataset_slug']['groups']

    # Map dataset slug -> match count (refined per dataset below).
    datasets = {}

    for group in groups:
        dataset_slug = group['groupValue']
        count = group['doclist']['numFound']

        datasets[dataset_slug] = count

    total_n = 0
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for dataset_slug in datasets:
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Skipping part of export due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
            continue

        filename = '%s.csv' % dataset_slug
        file_path = os.path.join(path, filename)

        f = open(file_path, 'w')
        writer = CSVKitWriter(f)

        # Header
        writer.writerow([c['name'] for c in dataset.column_schema])

        response = solr.query(settings.SOLR_DATA_CORE, query, offset=0, limit=0)

        # Update dataset and total counts for progress tracking
        datasets[dataset_slug] = response['response']['numFound']
        total_count = sum(datasets.values())

        # Page through this dataset's matches SOLR_PAGE_SIZE rows at a time.
        n = 0

        while n < datasets[dataset_slug]:
            response = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug: %s %s' % (dataset_slug, query), offset=n, limit=SOLR_PAGE_SIZE)

            results = response['response']['docs']

            for row in results:
                data = json.loads(row['data'])
                writer.writerow(data)

            task_status.update('%.0f%% complete' % floor(float(total_n) / float(total_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after exporting %.0f%%' % floor(float(total_n) / float(total_count) * 100))
                log.warning('Export aborted, query: %s' % query)

                return

            n += SOLR_PAGE_SIZE
            total_n += response['response']['numFound']

            # Yield between pages so this task does not starve others.
            time.sleep(throttle)

        f.close()

        # Add to zip and nuke temp file
        zipfile.write(file_path, filename)
        os.remove(file_path)

    # Finish zip file and nuke temp directory
    zipfile.close()
    os.rmdir(path)

    task_status.update('100% complete')

    log.info('Finished export, query: %s' % query)

    return zip_name