def test_csv_ignore_decoding_errors():
    # Test doomed CSVs with malformed Unicode characters. Can't repro this with a small example,
    # but in some situations chardet can return None, so we fall back to UTF-8. For the purposes
    # of this test, we force UTF-8 instead.
    malformed = b"name;number\nTA\xef\xbf\xbd\xef\xbf\xbd\xef\xc3\x87\xc3\x83O\xc2\xba;17"

    options = CSVOptions(ignore_decode_errors=False, encoding="utf-8", autodetect_encoding=False)
    with pytest.raises(UnicodeDecodeError):
        make_csv_reader(BytesIO(malformed), options)

    options = CSVOptions(ignore_decode_errors=True, encoding="utf-8", autodetect_encoding=False)
    options, reader = make_csv_reader(BytesIO(malformed), options)
    assert options.encoding == "utf-8"
    assert options.header is True

    data = list(reader)
    assert len(data) == 2
    assert data[0] == ["name", "number"]
    # Three U+FFFD replacement characters: two from the literal \xef\xbf\xbd sequences
    # and one from the lone \xef byte that starts an invalid UTF-8 sequence.
    assert data[1] == ["TA���ÇÃOº", "17"]
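
# A minimal, hypothetical sketch (not the project's make_csv_reader) of how
# ignore_decode_errors could map onto TextIOWrapper error handlers: "replace"
# substitutes U+FFFD for undecodable bytes, while "strict" raises the
# UnicodeDecodeError asserted above. The real helper also sniffs the dialect;
# the ";" delimiter is hardcoded here for brevity.
import csv
import io


def make_csv_reader_sketch(stream, options):
    text = io.TextIOWrapper(
        stream,
        encoding=options.encoding or "utf-8",
        errors="replace" if options.ignore_decode_errors else "strict",
    )
    text.read(1024)  # decoding happens on read: strict mode raises here
    text.seek(0)
    return options, csv.reader(text, delimiter=";")
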
def execute(self, quals, columns, sortkeys=None):
    """Main Multicorn entry point."""
    if self.mode == "http":
        with requests.get(
            self.url, stream=True, verify=os.environ.get("SSL_CERT_FILE", True)
        ) as response:
            response.raise_for_status()
            stream = response.raw
            # response.raw doesn't decode Content-Encoding, so unwrap gzip ourselves.
            if response.headers.get("Content-Encoding") == "gzip":
                stream = gzip.GzipFile(fileobj=stream)

            csv_options = self.csv_options
            # No explicit encoding and autodetection is off: use the one reported
            # by the server.
            if csv_options.encoding is None and not csv_options.autodetect_encoding:
                csv_options = csv_options._replace(encoding=response.encoding)

            csv_options, reader = make_csv_reader(stream, csv_options)
            yield from self._read_csv(reader, csv_options)
    else:
        response = None
        try:
            response = self.s3_client.get_object(
                bucket_name=self.s3_bucket, object_name=self.s3_object
            )

            csv_options = self.csv_options
            # S3 responses carry no usable encoding hint, so force autodetection
            # instead.
            if csv_options.encoding is None and not csv_options.autodetect_encoding:
                csv_options = csv_options._replace(autodetect_encoding=True)

            csv_options, reader = make_csv_reader(response, csv_options)
            yield from self._read_csv(reader, csv_options)
        finally:
            if response:
                response.close()
                response.release_conn()
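
# A hypothetical sketch of the _read_csv helper called above (an assumption: the
# real method is defined elsewhere on this class). The idea it illustrates is that
# the header row is column metadata, not data, so it gets skipped before rows are
# yielded back to Multicorn.
def _read_csv(self, reader, csv_options):
    rows = iter(reader)
    if csv_options.header:
        next(rows, None)  # drop the column-name row
    yield from rows
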
def test_csv_mac_newlines():
    # Test a CSV file with old Mac-style newlines (\r)
    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"), "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "utf-8"
        assert options.header is True

        data = list(reader)
        assert len(data) == 5
        assert data[0] == ["fruit_id", "timestamp", "name"]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=2, name="timestamp", pg_type="timestamp", is_pk=False, comment=None),
            TableColumn(ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None),
        ]
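
# Standalone illustration (not project code) of why bare-\r files parse cleanly:
# TextIOWrapper with the default newline=None applies universal-newline translation,
# so old Mac \r terminators reach the csv module as ordinary line breaks.
import csv
import io

raw = io.BytesIO(b"fruit_id,timestamp,name\r1,2021-01-01 00:00:00,apple\r")
text = io.TextIOWrapper(raw, encoding="utf-8")  # newline=None -> \r becomes \n
assert list(csv.reader(text)) == [
    ["fruit_id", "timestamp", "name"],
    ["1", "2021-01-01 00:00:00", "apple"],
]
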
def _get_table_definition(response, fdw_options, table_name, table_options):
    # Allow overriding introspection options with per-table params (e.g. encoding, delimiter...)
    fdw_options = copy(fdw_options)
    fdw_options.update(table_options)

    csv_options, reader = make_csv_reader(response, CSVOptions.from_fdw_options(fdw_options))
    sample = list(islice(reader, csv_options.schema_inference_rows))

    if not csv_options.header:
        sample = [[""] * len(sample[0])] + sample

    sg_schema = infer_sg_schema(sample, None, None)

    # For nonexistent column names: replace with autogenerated ones (can't have empty column names)
    sg_schema = generate_column_names(sg_schema)

    # Merge the autodetected table options with the ones passed to us originally
    # (e.g. S3 object etc.)
    new_table_options = copy(table_options)
    new_table_options.update(csv_options.to_table_options())

    # Build Multicorn TableDefinition. ColumnDefinition takes in type OIDs,
    # typmods and other internal PG stuff, but other FDWs seem to get by with just
    # the textual type name.
    return TableDefinition(
        table_name=table_name,
        schema=None,
        columns=[ColumnDefinition(column_name=c.name, type_name=c.pg_type) for c in sg_schema],
        options=new_table_options,
    )
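
# A hypothetical sketch of the CSVOptions container implied by this module (the
# real class lives elsewhere; field names match the attributes used in this file,
# defaults and option parsing are assumptions). FDW options arrive as strings, so
# the round-trip helpers convert between string-valued options and typed fields.
from typing import NamedTuple, Optional


class CSVOptionsSketch(NamedTuple):
    autodetect_encoding: bool = True
    encoding: Optional[str] = None
    header: bool = True
    delimiter: str = ","
    quotechar: str = '"'
    ignore_decode_errors: bool = False
    schema_inference_rows: int = 100

    @classmethod
    def from_fdw_options(cls, fdw_options):
        return cls(
            autodetect_encoding=fdw_options.get("autodetect_encoding", "true") == "true",
            encoding=fdw_options.get("encoding"),
            header=fdw_options.get("header", "true") == "true",
            delimiter=fdw_options.get("delimiter", ","),
        )

    def to_table_options(self):
        # Serialize back into strings storable as foreign table options.
        return {
            "encoding": self.encoding or "utf-8",
            "header": "true" if self.header else "false",
            "delimiter": self.delimiter,
        }
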
def test_csv_dialect_encoding_inference():
    # Test CSV dialect inference with:
    #   - win-1252 encoding (will autodetect with chardet)
    #   - Windows line endings
    #   - different separator
    #   - first column name missing
    with open(os.path.join(INGESTION_RESOURCES_CSV, "encoding-win-1252.csv"), "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "Windows-1252"
        assert options.header is True
        # NB we don't extract everything from the sniffed dialect, just the delimiter and the
        # quotechar. The sniffer also returns doublequote and skipinitialspace.
        assert options.delimiter == ";"

        data = list(reader)
        assert data == [
            ["", "DATE", "TEXT"],
            ["1", "01/07/2021", "Pañamao"],
            ["2", "06/11/2018", "–"],
            ["3", "28/05/2018", "División"],
        ]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1, name="col_1", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=2, name="DATE", pg_type="character varying", is_pk=False, comment=None),
            TableColumn(ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None),
        ]
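
# Standalone illustration (not the project's sniffing code) of the two inference
# steps this test exercises: chardet guesses the encoding from raw bytes and
# csv.Sniffer guesses the dialect from a decoded sample. On a sample this short,
# chardet may report a related superset encoding such as ISO-8859-1 rather than
# Windows-1252.
import csv

import chardet

sample = b";DATE;TEXT\r\n1;01/07/2021;Pa\xf1amao\r\n2;06/11/2018;\x96\r\n"
# chardet can return None for the encoding, hence the UTF-8 fallback (see the
# comment in test_csv_ignore_decoding_errors above).
encoding = chardet.detect(sample)["encoding"] or "utf-8"
dialect = csv.Sniffer().sniff(sample.decode(encoding))
print(encoding, repr(dialect.delimiter))  # only delimiter/quotechar are kept upstream
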