def csv_import(
    repository,
    table,
    file,
    replace,
    primary_key,
    override_type,
    encoding,
    separator,
    no_header,
    skip_schema_check,
):
    """
    Import a CSV file into a checked-out Splitgraph repository. This doesn't create a new image:
    use `sgr commit` after the import and any adjustments (e.g. adding primary keys or converting
    column types) to do so.

    If the target table doesn't exist, this will create a new table.

    If the target table does exist, this will try to patch the new values in by updating rows
    that exist in the current table (as per its primary key constraints) and inserting new ones.
    Rows existing in the current table but missing in the CSV won't be deleted.

    If `-r` is passed and the table exists, it will instead be dropped and recreated from the
    CSV file.
    """
    import csv
    import logging
    from itertools import islice

    import click

    from splitgraph.ingestion.csv import csv_adapter
    from splitgraph.ingestion.inference import infer_sg_schema

    if not primary_key:
        click.echo(
            "Warning: primary key is not specified, using the whole row as primary key. "
            "This is probably not something that you want."
        )

    reader = csv.reader(file, delimiter=separator or ",")

    # Grab the first few rows from the CSV and feed them to the schema inferrer.
    sample = list(islice(reader, 100))

    if no_header:
        # Patch in a dummy header with one entry per column (not one per sampled row).
        sample = [[str(i) for i in range(len(sample[0]))]] + sample

    type_overrides = dict(override_type or [])
    sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
    logging.debug("Using Splitgraph schema: %r", sg_schema)

    # Seek the file back to the beginning and pass it to the CSV ingestion adapter.
    file.seek(0)
    csv_adapter.to_table(
        file,
        repository,
        table,
        if_exists="replace" if replace else "patch",
        schema_check=not skip_schema_check,
        no_header=no_header,
        delimiter=separator,
        encoding=encoding,
        schema_spec=sg_schema,
    )
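# A minimal sketch of the inference step above (not part of the original module), assuming
# infer_sg_schema's signature as used in csv_import: a sample whose first row is the header,
# an override_types dict and a primary_keys list. The sample values are hypothetical.
from splitgraph.ingestion.inference import infer_sg_schema

sample = [
    ["fruit_id", "name"],  # header row
    ["1", "apple"],
    ["2", "orange"],
]
# Force fruit_id to bigint instead of letting its type be inferred, and make it the PK.
sg_schema = infer_sg_schema(
    sample, override_types={"fruit_id": "bigint"}, primary_keys=["fruit_id"]
)
for col in sg_schema:
    print(col.ordinal, col.name, col.pg_type, col.is_pk)  # e.g. 1 fruit_id bigint True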
# Imports for the tests below (module paths assumed from the Splitgraph test suite).
import os

from splitgraph.core.types import TableColumn
from splitgraph.ingestion.common import generate_column_names
from splitgraph.ingestion.csv.common import CSVOptions, make_csv_reader
from splitgraph.ingestion.inference import infer_sg_schema

from test.splitgraph.conftest import INGESTION_RESOURCES_CSV  # CSV fixture dir (path assumed)


def test_csv_mac_newlines():
    # Test a CSV file with old Mac-style newlines (\r)
    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"), "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "utf-8"
        assert options.header is True

        data = list(reader)
        assert len(data) == 5
        assert data[0] == ["fruit_id", "timestamp", "name"]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=2, name="timestamp", pg_type="timestamp", is_pk=False, comment=None),
            TableColumn(
                ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None
            ),
        ]
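# Hedged sketch: the same sniffing path as the test above, but on an in-memory stream.
# io.BytesIO stands in for the on-disk fixture; this assumes make_csv_reader accepts any
# binary file-like object, and the sniffer's output on such a tiny sample may differ.
import io

payload = b"fruit_id,timestamp,name\r1,2018-01-01 00:11:11,apple\r2,2018-01-02 00:22:22,orange\r"
options, reader = make_csv_reader(io.BytesIO(payload), CSVOptions())
print(options.encoding, options.header)
print(list(reader))  # Mac-style \r newlines are still split into separate rows.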
# Module-level imports needed by this function (module paths assumed: this lives in the
# Splitgraph CSV FDW, alongside the CSVOptions/make_csv_reader helpers).
from copy import copy
from itertools import islice

from multicorn import ColumnDefinition, TableDefinition

from splitgraph.ingestion.common import generate_column_names
from splitgraph.ingestion.csv.common import CSVOptions, make_csv_reader
from splitgraph.ingestion.inference import infer_sg_schema


def _get_table_definition(response, fdw_options, table_name, table_options):
    # Allow overriding introspection options with per-table params (e.g. encoding, delimiter...)
    fdw_options = copy(fdw_options)
    fdw_options.update(table_options)

    csv_options, reader = make_csv_reader(response, CSVOptions.from_fdw_options(fdw_options))
    sample = list(islice(reader, csv_options.schema_inference_rows))

    if not csv_options.header:
        # Patch in a dummy header row of empty names, one per column.
        sample = [[""] * len(sample[0])] + sample

    # No type overrides or primary keys at introspection time.
    sg_schema = infer_sg_schema(sample, None, None)

    # For empty column names: replace them with autogenerated ones (a column can't have
    # an empty name).
    sg_schema = generate_column_names(sg_schema)

    # Merge the autodetected table options with the ones passed to us originally (e.g.
    # S3 object etc)
    new_table_options = copy(table_options)
    new_table_options.update(csv_options.to_table_options())

    # Build the Multicorn TableDefinition. ColumnDefinition takes in type OIDs,
    # typmods and other internal PG stuff, but other FDWs seem to get by with just
    # the textual type name.
    return TableDefinition(
        table_name=table_name,
        schema=None,
        columns=[ColumnDefinition(column_name=c.name, type_name=c.pg_type) for c in sg_schema],
        options=new_table_options,
    )
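# Hedged sketch of what _get_table_definition returns for a two-column CSV, assuming
# multicorn's TableDefinition/ColumnDefinition constructors as used above. The table name
# and option values are hypothetical (FDW options are passed around as strings).
from multicorn import ColumnDefinition, TableDefinition

td = TableDefinition(
    table_name="fruits",
    schema=None,
    columns=[
        ColumnDefinition(column_name="fruit_id", type_name="integer"),
        ColumnDefinition(column_name="name", type_name="character varying"),
    ],
    options={"delimiter": ",", "encoding": "utf-8"},
)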
def test_csv_dialect_encoding_inference(): # Test CSV dialect inference with: # - win-1252 encoding (will autodetect with chardet) # - Windows line endings # - different separator # - first column name missing with open(os.path.join(INGESTION_RESOURCES_CSV, "encoding-win-1252.csv"), "rb") as f: options = CSVOptions() options, reader = make_csv_reader(f, options) assert options.encoding == "Windows-1252" assert options.header is True # NB we don't extract everything from the sniffed dialect, just the delimiter and the # quotechar. The sniffer also returns doublequote and skipinitialspace. assert options.delimiter == ";" data = list(reader) assert data == [ ["", "DATE", "TEXT"], ["1", "01/07/2021", "Pañamao"], ["2", "06/11/2018", "–"], ["3", "28/05/2018", "División"], ] schema = generate_column_names(infer_sg_schema(data)) assert schema == [ TableColumn(ordinal=1, name="col_1", pg_type="integer", is_pk=False, comment=None), TableColumn(ordinal=2, name="DATE", pg_type="character varying", is_pk=False, comment=None), TableColumn(ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None), ]
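# Hedged sketch of generate_column_names, as exercised by the test above: columns with
# empty names get autogenerated col_N names, named columns are left alone. TableColumn's
# fields are (ordinal, name, pg_type, is_pk, comment), per the assertions above.
cols = [
    TableColumn(ordinal=1, name="", pg_type="integer", is_pk=False, comment=None),
    TableColumn(ordinal=2, name="DATE", pg_type="character varying", is_pk=False, comment=None),
]
print(generate_column_names(cols))  # the first column comes back named "col_1"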