def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = next(rows) if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def from_csv(cls, f, name='from_csv_table', **kwargs):
    """
    Build a Table from a file-like object holding CSV data.

    The input is read fully into memory first so that non-seekable
    streams (e.g. stdin) can be both sniffed and re-parsed.
    """
    # Buffer everything up front: stdin-style "files" cannot seek.
    buffered = f.read()
    dialect = sniffer.sniff_dialect(buffered)

    csv_reader = CSVKitReader(StringIO(buffered), dialect=dialect, **kwargs)
    headers = csv_reader.next()

    data_columns = [[] for _ in headers]

    for row in csv_reader:
        for idx, value in enumerate(row):
            try:
                data_columns[idx].append(value.strip())
            except IndexError:
                # Rows wider than the header row are truncated.
                break

    columns = [Column(idx, headers[idx], values)
               for idx, values in enumerate(data_columns)]

    return Table(columns, name=name)
def from_csv(cls, f, name='from_csv_table', **kwargs):
    """
    Create a Table from a file-like object of CSV data.

    Keyword arguments are forwarded both to the dialect sniffer and to
    the underlying CSVKitReader.
    """
    # Read the whole stream first; stdin-like sources cannot be rewound.
    raw = f.read()

    sniffed = sniffer.sniff_dialect(raw, **kwargs)
    rows = CSVKitReader(StringIO(raw), dialect=sniffed, **kwargs)

    header_row = rows.next()
    collected = [[] for _ in header_row]
    n_cols = len(collected)

    for record in rows:
        for col, cell in enumerate(record):
            if col >= n_cols:
                # Non-rectangular data is truncated
                break
            collected[col].append(cell.strip())

    out = []
    for col, values in enumerate(collected):
        out.append(Column(col, header_row[col], values))

    return Table(out, name=name)
def load_file(file_name, db_engine): # Pass file name and SQLAlchemy db engine # Opens file, makes a file object, creates table # Builds and runs a COPY FROM statement print '-----------------' print ' Opening {} ({})...'.format(file_name, datetime.now() - start_time) f = codecs.open('data/{}'.format(file_name), 'rU') print ' Sniffing file dialect ({})...'.format(datetime.now() - start_time) dialect = sniffer.sniff_dialect(f.read()) # Return to top of file f.seek(0) print ' Making csv Table object ({})...'.format(datetime.now() - start_time) from_file = Table.from_csv(f, name = file_name.rstrip('.csv'), encoding = 'utf-8') # Here be some overhead print ' Making SQLAlchemy Table object ({})...'.format(datetime.now() - start_time) sql_table = sql.make_table(from_file) print ' Creating db table ({})...'.format(datetime.now() - start_time) sql_table.create(engine, checkfirst=True) print ' Loading {} ({})...'.format(file_name, datetime.now() - start_time) copy_from_sql = '''COPY "{table_name}" FROM '{file_w_path}' DELIMITER '{delimiter}' QUOTE '{quote_character}' ENCODING 'UTF8' CSV HEADER;'''.format( table_name = file_name.rstrip('.csv') , file_w_path = os.getcwd() + '/data/' + file_name , delimiter = dialect.delimiter , quote_character = dialect.quotechar , escape_character = '' if dialect.escapechar is None else dialect.escapechar ) conn = db_engine.connect() t = conn.begin() try: conn.execute(copy_from_sql) t.commit() except: t.rollback() print copy_from_sql.replace('\t', '') print "Failed to commit." conn.close()
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8
    will still be "order" 7, even if it's the third column in the
    resulting Table).
    """
    # Buffer the whole stream: "files" from stdin are not seekable.
    contents = f.read()

    # Sniff the dialect from at most `snifflimit` characters.
    sample = contents[:snifflimit] if snifflimit else contents
    dialect = sniffer.sniff_dialect(sample)

    reader = CSVKitReader(StringIO(contents), dialect=dialect, **kwargs)
    headers = reader.next()

    if column_ids:
        # Restrict parsing to the requested columns, keeping source ids.
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for _ in headers]

    for row in reader:
        for out_idx, _cell in enumerate(row):
            try:
                data_columns[out_idx].append(row[column_ids[out_idx]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

    columns = [
        Column(column_ids[i], headers[i], cells, blanks_as_nulls=blanks_as_nulls)
        for i, cells in enumerate(data_columns)
    ]

    return Table(columns, name=name)
def getReader(uploadfile):
    """
    Return a csv reader for an uploaded file.

    .xls uploads are converted to CSV text first; anything else has its
    dialect sniffed from a leading sample before parsing.
    """
    stream = uploadfile.file
    extension = os.path.splitext(uploadfile.filename)[1]

    # make sure to convert excel files
    if extension == '.xls':
        return reader(StringIO.StringIO(xls2csv(stream)))

    # Sniff the dialect from the first 4 KB, then rewind for the real read.
    dialect = sniffer.sniff_dialect(stream.read(4096))
    stream.seek(0)
    return reader(stream, dialect=dialect)
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8
    will still be "order" 7, even if it's the third column in the
    resulting Table).
    """
    # Buffer the stream so non-seekable input (stdin) can be sniffed.
    contents = f.read()

    sample = contents if not snifflimit else contents[:snifflimit]
    dialect = sniffer.sniff_dialect(sample)

    # Pull the Column-level option out *before* the remaining kwargs are
    # handed to the reader.
    normal_type = kwargs.pop("normal_type", InvalidType)

    reader = CSVKitReader(StringIO(contents), dialect=dialect, **kwargs)
    headers = reader.next()

    if column_ids:
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for _ in headers]

    for row in reader:
        for idx in range(len(row)):
            try:
                data_columns[idx].append(row[column_ids[idx]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

    columns = []
    for idx, cells in enumerate(data_columns):
        columns.append(Column(column_ids[idx], headers[idx], cells,
                              blanks_as_nulls=blanks_as_nulls,
                              type_inference=type_inference,
                              normal_type=normal_type))

    return Table(columns, name=name)
def detect_if_list(v):
    '''
    Detects list using csvkit sniffer to detect delimiter, splitting on
    delim and filtering common delims to see final list length.

    Returns either a list or False.
    '''
    delimiters = ['|', ',', ';', '$']
    LIST_LEN_THRESHOLD = 2

    dialect = sniffer.sniff_dialect(v)
    if dialect:
        delim = dialect.delimiter
        split_vals = v.split(delim)
        filtered_vals = [
            x for x in split_vals
            if (x and (x not in delimiters))
        ]

        # BUG FIX: the original tested `filtered_vals >= 2`, comparing a
        # list to an int — always True in Python 2 (cross-type ordering)
        # and a TypeError in Python 3. Compare the *length* against the
        # (previously unused) threshold instead.
        if len(filtered_vals) >= LIST_LEN_THRESHOLD:
            return filtered_vals

    return False
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = agate.reader(f, **kwargs) try: if no_header_row: # Peek at a row to infer column names from, and put it back on top row = next(rows) rows = itertools.chain([row], rows) headers = make_default_headers(len(row)) else: headers = next(rows) except StopIteration: # The file is `/dev/null`. headers = [] pass if no_header_row or column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] width = len(data_columns) for i, row in enumerate(rows): j = 0 for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break j += 1 # Populate remaining columns with None while j < width: data_columns[j].append(None) j += 1 columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)