Exemplo n.º 1
0
def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data (as
    produced by "read_green_taxi_csv") into a PyArrow table.

    The CSV has no header row: column names and types come from the
    module-level SCHEMA, booleans are encoded as 'Y'/'N', empty fields
    are nulls, and quoting is disabled entirely.
    """

    return read_csv(
        fobj,
        read_options=ReadOptions(
            # No header line in the data; names are supplied up front.
            column_names=SCHEMA.names,
            encoding=ENCODING,
        ),
        # The cleaned data contains no quoted fields.
        parse_options=ParseOptions(quote_char=False),
        convert_options=ConvertOptions(
            column_types=SCHEMA,
            true_values=['Y'],
            false_values=['N'],
            null_values=[''],
            timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        ),
    )
Exemplo n.º 2
0
 def test_options_delimiter(self):
     """Default comma parsing vs. an explicit ';' delimiter on the same bytes."""
     data = b"a;b,c\nde,fg;eh\n"
     # With the default delimiter, ';' is just an ordinary character.
     parsed = self.read_bytes(data)
     expected_default = {'a;b': [u'de'], 'c': [u'fg;eh']}
     assert parsed.to_pydict() == expected_default
     # Switching to ';' makes ',' the ordinary character instead.
     parsed = self.read_bytes(data,
                              parse_options=ParseOptions(delimiter=';'))
     expected_semicolon = {'a': [u'de,fg'], 'b,c': [u'eh']}
     assert parsed.to_pydict() == expected_semicolon
Exemplo n.º 3
0
    def test_options_delimiter(self):
        """The streaming reader honours ParseOptions(delimiter=';')."""
        data = b"a;b,c\nde,fg;eh\n"
        # Default delimiter: the ';' stays inside the column values.
        stream = self.open_bytes(data)
        self.check_reader(
            stream,
            pa.schema([('a;b', pa.string()), ('c', pa.string())]),
            [{'a;b': ['de'], 'c': ['fg;eh']}])

        # Semicolon delimiter: now the ',' stays inside the column values.
        stream = self.open_bytes(data,
                                 parse_options=ParseOptions(delimiter=';'))
        self.check_reader(
            stream,
            pa.schema([('a', pa.string()), ('b,c', pa.string())]),
            [{'a': ['de,fg'], 'b,c': ['eh']}])
Exemplo n.º 4
0
 def test_empty_lines(self):
     """Empty-line handling: skipped by default, kept as null rows otherwise."""
     data = b"a,b\n\r1,2\r\n\r\n3,4\r\n"
     # Default behaviour: blank lines are dropped entirely.
     result = self.read_bytes(data)
     assert result.to_pydict() == {'a': [1, 3], 'b': [2, 4]}
     # ignore_empty_lines=False keeps each blank line as an all-null row.
     keep_blanks = ParseOptions(ignore_empty_lines=False)
     result = self.read_bytes(data, parse_options=keep_blanks)
     assert result.to_pydict() == {
         'a': [None, 1, None, 3],
         'b': [None, 2, None, 4],
     }
     # skip_rows=2 discards "a,b" and the first blank line, so "1,2"
     # becomes the header row (hence the string column names below).
     result = self.read_bytes(data, parse_options=keep_blanks,
                              read_options=ReadOptions(skip_rows=2))
     assert result.to_pydict() == {
         '1': [None, 3],
         '2': [None, 4],
     }
Exemplo n.º 5
0
    def read(self,
             env: CylonEnv,
             table,
             relevant_cols=None,
             **kwargs) -> DataFrame:
        """
        Read one named table from CSV into a Cylon DataFrame.

        The file path comes from self.table_path_mapping with '$TABLE'
        substituted.  Files are '|'-delimited with no header; column names
        come from get_schema(table).  Tables listed in REFRESH_TABLES get
        their refresh counterpart (same path under /data_refresh/) read and
        concatenated onto the base data.
        """
        path = self.table_path_mapping[table].replace('$TABLE', table)

        column_names, _ = get_schema(table)
        read_options = ReadOptions(column_names=column_names,
                                   block_size=(1 << 30))
        parse_options = ParseOptions(delimiter='|')
        convert_options = ConvertOptions(include_columns=relevant_cols)

        def _load(csv_path):
            # Single pyarrow CSV read with the shared option set above.
            return pa_read_csv(csv_path,
                               read_options=read_options,
                               parse_options=parse_options,
                               convert_options=convert_options)

        pa_table = _load(path)
        # NOTE: refresh tables have the same parallelism as their data tables.
        if table in REFRESH_TABLES:
            refresh = _load(path.replace('/data/', '/data_refresh/'))
            pa_table = pa_concat_tables([pa_table, refresh])

        return DataFrame(Table.from_arrow(env.context, pa_table))
Exemplo n.º 6
0
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=None,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=b".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        dialect=None,
        error_bad_lines=True,
        warn_bad_lines=True,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):
        """
        Read CSV data with the PyArrow CSV reader when possible, falling
        back to the pandas-based ``cls._read`` otherwise.

        Parameters mirror the pandas ``read_csv`` signature; only the subset
        that PyArrow supports is translated into PyArrow options.  Returns
        the result of ``cls.from_arrow`` on the arrow path, or of
        ``cls._read`` on the fallback path.
        """
        # locals() at this point contains exactly the call parameters; this
        # must remain the first statement so nothing else leaks into mykwargs.
        items = locals().copy()
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            # An explicit pandas/C engine request bypasses the arrow path.
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            # Translate the pandas dtype argument into arrow column types.
            if isinstance(dtype, dict):
                column_types = {
                    c: cls._dtype_to_arrow(t)
                    for c, t in dtype.items()
                }
            else:
                column_types = cls._dtype_to_arrow(dtype)

            # Columns listed in parse_dates are read as second-resolution
            # timestamps.
            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            if names:
                # Caller-provided names replace the header row, so the file's
                # own header line has to be skipped.
                if header == 0:
                    skiprows = skiprows + 1 if skiprows is not None else 1
                elif header is None or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and 'infer' header values"
                    )
            else:
                if header == 0 or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' header values"
                    )

            # pandas treats `sep` as the delimiter when `delimiter` is unset.
            if delimiter is None:
                delimiter = sep

            if delim_whitespace and delimiter != ",":
                raise ValueError(
                    "Specified a delimiter and delim_whitespace=True; you can only specify one."
                )

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # Timestamp fields are handled as strings unless parse_dates
                # was passed explicitly as a list or a dict.
                timestamp_parsers=[""]
                if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (pa.ArrowNotImplementedError, NotImplementedError):
            # If arrow was requested explicitly there is nothing to fall
            # back to; otherwise defer to the pandas implementation.
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)
Exemplo n.º 7
0
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=lib.no_default,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=lib.no_default,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        encoding_errors="strict",
        dialect=None,
        error_bad_lines=None,
        warn_bad_lines=None,
        on_bad_lines=None,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):  # noqa: PR01
        """
        Read data from `filepath_or_buffer` according to the passed `kwargs` parameters.

        For parameters description please refer to pandas API.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.

        Notes
        -----
        Reading performed by using of `pyarrow.read_csv` function.
        Falls back to the pandas-based ``cls._read`` when the argument
        combination is unsupported by the arrow engine (unless the caller
        explicitly requested ``engine="arrow"``, in which case the error
        propagates).
        """
        # locals() at this point contains exactly the call parameters; this
        # must remain the first statement so nothing else leaks into mykwargs.
        items = locals().copy()
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            # An explicit pandas/C engine request bypasses the arrow path.
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            cls._validate_read_csv_kwargs(mykwargs)
            # _read_csv_check_support reports whether this argument
            # combination is supported by the arrow engine, and why not when
            # it isn't.
            use_modin_impl, error_message = cls._read_csv_check_support(
                mykwargs,
            )
            if not use_modin_impl:
                raise ArrowEngineException(error_message)
            # Translate the pandas dtype argument into arrow column types.
            if isinstance(dtype, dict):
                column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
            else:
                column_types = cls._dtype_to_arrow(dtype)

            # Columns listed in parse_dates are read as second-resolution
            # timestamps.
            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            # Caller-provided names replace the header row, so the file's own
            # header line has to be skipped.
            if names not in [lib.no_default, None] and header == 0:
                skiprows = skiprows + 1 if skiprows is not None else 1

            # pandas treats `sep` as the delimiter when `delimiter` is unset.
            if delimiter is None and sep is not lib.no_default:
                delimiter = sep

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # Timestamp fields are handled as strings unless parse_dates
                # was passed explicitly as a list or a dict.
                timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names if names is not lib.no_default else None,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (
            pa.ArrowNotImplementedError,
            pa.ArrowInvalid,
            NotImplementedError,
            ArrowEngineException,
        ):
            # If arrow was requested explicitly there is nothing to fall
            # back to; otherwise defer to the pandas implementation.
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)