Example #1
def test_csv_ignore_decoding_errors():
    # Test CSVs that are doomed to fail decoding because of malformed Unicode
    # characters. In some situations chardet can return None (we can't reproduce
    # that with a small sample), in which case we fall back to UTF-8. For the
    # purposes of this test, we force UTF-8 directly (see the sketch after this
    # test).

    malformed = b"name;number\nTA\xef\xbf\xbd\xef\xbf\xbd\xef\xc3\x87\xc3\x83O\xc2\xba;17"

    options = CSVOptions(ignore_decode_errors=False,
                         encoding="utf-8",
                         autodetect_encoding=False)

    with pytest.raises(UnicodeDecodeError):
        make_csv_reader(BytesIO(malformed), options)

    options = CSVOptions(ignore_decode_errors=True,
                         encoding="utf-8",
                         autodetect_encoding=False)
    options, reader = make_csv_reader(BytesIO(malformed), options)
    assert options.encoding == "utf-8"
    assert options.header is True

    data = list(reader)
    assert len(data) == 2
    assert data[0] == ["name", "number"]
    assert data[1] == ["TA��ÇÃOº", "17"]
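
A minimal sketch of the fallback described in the comment above, assuming chardet performs the detection; detect_encoding is a hypothetical helper, not project code:

import chardet

def detect_encoding(data: bytes, default: str = "utf-8") -> str:
    # chardet.detect can return {"encoding": None, ...} when the byte
    # sample is too short or too garbled to classify.
    result = chardet.detect(data)
    return result["encoding"] or default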
Example #2
    def execute(self, quals, columns, sortkeys=None):
        """Main Multicorn entry point."""

        if self.mode == "http":
            with requests.get(self.url,
                              stream=True,
                              verify=os.environ.get("SSL_CERT_FILE",
                                                    True)) as response:
                response.raise_for_status()
                stream = response.raw
                if response.headers.get("Content-Encoding") == "gzip":
                    stream = gzip.GzipFile(fileobj=stream)

                csv_options = self.csv_options
                if csv_options.encoding is None and not csv_options.autodetect_encoding:
                    csv_options = csv_options._replace(
                        encoding=response.encoding)

                csv_options, reader = make_csv_reader(stream, csv_options)
                yield from self._read_csv(reader, csv_options)
        else:
            response = None
            try:
                response = self.s3_client.get_object(
                    bucket_name=self.s3_bucket, object_name=self.s3_object)
                csv_options = self.csv_options
                if csv_options.encoding is None and not csv_options.autodetect_encoding:
                    csv_options = csv_options._replace(
                        autodetect_encoding=True)
                csv_options, reader = make_csv_reader(response, csv_options)
                yield from self._read_csv(reader, csv_options)
            finally:
                if response:
                    response.close()
                    response.release_conn()
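
In the HTTP branch above, response.raw is the undecoded byte stream (requests does not decompress it), hence the manual GzipFile wrap. A self-contained sketch of that behaviour, using an in-memory buffer in place of the HTTP body:

import gzip
from io import BytesIO

body = gzip.compress(b"name;number\nalice;17\n")
# GzipFile decompresses transparently as the stream is read, so the
# downstream CSV reader never sees compressed bytes.
stream = gzip.GzipFile(fileobj=BytesIO(body))
assert stream.read() == b"name;number\nalice;17\n"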
Example #3
def test_csv_mac_newlines():
    # Test a CSV file with old Mac-style newlines (\r)

    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"),
              "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "utf-8"
        assert options.header is True

        data = list(reader)
        assert len(data) == 5
        assert data[0] == ["fruit_id", "timestamp", "name"]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1,
                        name="fruit_id",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=2,
                        name="timestamp",
                        pg_type="timestamp",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=3,
                        name="name",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
        ]
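
The bare-\r line endings parse because \r counts as a line boundary under universal newline splitting; the same behaviour can be reproduced with the stdlib alone (plain csv here, not the project's reader):

import csv

# str.splitlines treats a lone \r as a line boundary, so old Mac files
# split into rows just like \n- or \r\n-terminated ones.
rows = list(csv.reader("fruit_id,name\r1,apple\r2,orange".splitlines()))
assert rows == [["fruit_id", "name"], ["1", "apple"], ["2", "orange"]]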
Example #4
def _get_table_definition(response, fdw_options, table_name, table_options):
    # Allow overriding introspection options with per-table params (e.g. encoding, delimiter...)
    fdw_options = copy(fdw_options)
    fdw_options.update(table_options)

    csv_options, reader = make_csv_reader(
        response, CSVOptions.from_fdw_options(fdw_options))
    sample = list(islice(reader, csv_options.schema_inference_rows))

    if not csv_options.header:
        sample = [[""] * len(sample[0])] + sample

    sg_schema = infer_sg_schema(sample, None, None)

    # Replace missing column names with autogenerated ones (PostgreSQL
    # doesn't allow empty column names)
    sg_schema = generate_column_names(sg_schema)

    # Merge the autodetected table options with the ones passed to us
    # originally (e.g. the S3 object name)
    new_table_options = copy(table_options)
    new_table_options.update(csv_options.to_table_options())

    # Build Multicorn TableDefinition. ColumnDefinition takes in type OIDs,
    # typmods and other internal PG stuff but other FDWs seem to get by with just
    # the textual type name.
    return TableDefinition(
        table_name=table_name,
        schema=None,
        columns=[
            ColumnDefinition(column_name=c.name, type_name=c.pg_type)
            for c in sg_schema
        ],
        options=new_table_options,
    )
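
The headerless branch above pads the sample with a blank first row so schema inference still sees a header-shaped input, and generate_column_names later fills in the blanks. The padding itself is plain list arithmetic:

sample = [["1", "2021-01-01", "apple"]]
# Prepend one empty "column name" per column of the first data row.
sample = [[""] * len(sample[0])] + sample
assert sample == [["", "", ""], ["1", "2021-01-01", "apple"]]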
Example #5
def test_csv_dialect_encoding_inference():
    # Test CSV dialect inference with:
    #  - win-1252 encoding (will autodetect with chardet)
    #  - Windows line endings
    #  - different separator
    #  - first column name missing

    with open(os.path.join(INGESTION_RESOURCES_CSV, "encoding-win-1252.csv"),
              "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "Windows-1252"
        assert options.header is True
        # NB we don't extract everything from the sniffed dialect, just the delimiter and the
        # quotechar. The sniffer also returns doublequote and skipinitialspace.
        assert options.delimiter == ";"

        data = list(reader)

        assert data == [
            ["", "DATE", "TEXT"],
            ["1", "01/07/2021", "Pañamao"],
            ["2", "06/11/2018", "–"],
            ["3", "28/05/2018", "División"],
        ]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1,
                        name="col_1",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=2,
                        name="DATE",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=3,
                        name="TEXT",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
        ]
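
Assuming the stdlib csv.Sniffer underlies the dialect detection here (an assumption, not confirmed by this test), a sketch of what it returns and why only part of it is used:

import csv

dialect = csv.Sniffer().sniff("col;DATE;TEXT\r\n1;01/07/2021;x\r\n")
assert dialect.delimiter == ";"
# The sniffed dialect also carries quotechar, doublequote and
# skipinitialspace; only the delimiter and quotechar are extracted above.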