Example #1
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues,
                        self.keep_default_na)

            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index
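The loop above only parses and cleans each index column; the actual Index construction is delegated to ensure_index_from_sequences, which returns a flat Index for a single level and a MultiIndex for several. A minimal sketch of that hand-off, assuming the pandas-internal import path pandas.core.indexes.api (it may move between versions):

from pandas.core.indexes.api import ensure_index_from_sequences

# One parsed index column -> a flat Index carrying the single name.
single = ensure_index_from_sequences([[1, 2, 3]], names=["id"])
print(single)        # e.g. Index([1, 2, 3], dtype='int64', name='id')

# Several parsed index columns -> a MultiIndex built from the arrays.
multi = ensure_index_from_sequences(
    [[1, 1, 2], ["a", "b", "a"]], names=["outer", "inner"]
)
print(multi.names)   # ['outer', 'inner']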
Example #2
    def _agg_index(self, index, try_parse_dates=True) -> Index:
        arrays = []

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                # pandas\io\parsers.py:1678: error: Value of type
                # "Optional[Any]" is not indexable  [index]
                col_name = self.index_names[i]  # type: ignore[index]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues,
                        self.keep_default_na)

            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index
Example #3
    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            # error: Argument 1 to "defaultdict" has incompatible type "Callable[[],
            # Union[ExtensionDtype, str, dtype[Any], Type[object], Dict[Hashable,
            # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
            # Type[int], Type[complex], Type[bool], Type[object]]]]]"; expected
            # "Optional[Callable[[], Union[ExtensionDtype, str, dtype[Any],
            # Type[object]]]]"
            # error: Incompatible return value type (got "Union[ExtensionDtype, str,
            # dtype[Any], Type[object], Dict[Hashable, Union[ExtensionDtype, Union[str,
            # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]]]", expected "Union[ExtensionDtype, str, dtype[Any],
            # Type[object]]")
            dtype = defaultdict(
                lambda: default_dtype  # type: ignore[arg-type, return-value]
            )
        else:
            dtype = cast(dict, dtype)
            dtype = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        if (index_col is None or index_col is False) or index_names is None:
            index = Index([])
        else:
            data = [Series([], dtype=dtype[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}

        return index, columns, col_dict
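The defaultdict conversion at the top of _get_empty_meta is what lets later code index dtype[col_name] unconditionally. A standalone sketch of that trick, with made-up column names and isinstance(k, int) standing in for pandas' is_integer check:

from collections import defaultdict

# Caller-supplied dtype keyed by a mix of column names and positions.
user_dtype = {"a": "float64", 1: "int64"}
columns = ["a", "b", "c"]

dtype = defaultdict(
    lambda: object,  # columns not mentioned fall back to object
    {columns[k] if isinstance(k, int) else k: v for k, v in user_dtype.items()},
)

print(dtype["a"])  # float64
print(dtype["b"])  # int64  (position 1 translated to the name "b")
print(dtype["c"])  # <class 'object'>, from the default factory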
Example #4
    def _get_empty_meta(self,
                        columns,
                        index_col,
                        index_names,
                        dtype: DtypeArg | None = None):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {
                    columns[k] if is_integer(k) else k: v
                    for k, v in dtype.items()
                },
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        if (index_col is None or index_col is False) or index_names is None:
            index = Index([])
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name])
            for col_name in columns
        }

        return index, columns, col_dict
Example #5
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index
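This variant threads the user-supplied dtype and converters through to the index columns and disables numeric/boolean inference (try_num_bool=False) when an index column is declared as a string dtype or has a converter. A hedged usage example of the observable effect, in pandas versions that carry this logic:

from io import StringIO
import pandas as pd

csv = StringIO("idx,val\n001,1\n002,2\n")
df = pd.read_csv(csv, index_col="idx", dtype={"idx": str})

# With numeric inference suppressed for the declared string index column,
# the leading zeros survive instead of the index collapsing to [1, 2].
print(df.index.tolist())  # ['001', '002']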
Example #6
    def read(self, nrows=None):
        try:
            data = self._reader.read(nrows)
        except StopIteration:
            # error: Cannot determine type of '_first_chunk'
            if self._first_chunk:  # type: ignore[has-type]
                self._first_chunk = False
                names = self._maybe_dedup_names(self.orig_names)
                index, columns, col_dict = self._get_empty_meta(
                    names,
                    self.index_col,
                    self.index_names,
                    dtype=self.kwds.get("dtype"),
                )
                columns = self._maybe_make_multi_index_columns(columns, self.col_names)

                if self.usecols is not None:
                    columns = self._filter_usecols(columns)

                col_dict = {k: v for k, v in col_dict.items() if k in columns}

                return index, columns, col_dict

            else:
                self.close()
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        # error: Cannot determine type of 'names'
        names = self.names  # type: ignore[has-type]

        if self._reader.leading_cols:
            if self._has_complex_date_col:
                raise NotImplementedError("file structure not yet supported")

            # implicit index, no index names
            arrays = []

            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            names = self._maybe_dedup_names(names)

            # rename dict keys
            data_tups = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data_tups)}

            names, data = self._do_date_conversions(names, data)

        else:
            # rename dict keys
            data_tups = sorted(data.items())

            # ugh, mutation

            # assert for mypy, orig_names is List or None, None would error in list(...)
            assert self.orig_names is not None
            names = list(self.orig_names)
            names = self._maybe_dedup_names(names)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            # columns as list
            alldata = [x[1] for x in data_tups]

            data = {k: v for k, (i, v) in zip(names, data_tups)}

            names, data = self._do_date_conversions(names, data)
            index, names = self._make_index(data, alldata, names)

        # maybe create a mi on the columns
        names = self._maybe_make_multi_index_columns(names, self.col_names)

        return index, names, data
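The StopIteration branch at the top of read is what serves a header-only (or otherwise row-less) input: the C reader has nothing to yield on the first read, so the empty frame's index, columns, and per-column dtypes come from _get_empty_meta. A hedged illustration via the public read_csv:

from io import StringIO
import pandas as pd

empty = StringIO("a,b\n")  # header line only, no data rows
df = pd.read_csv(empty, dtype={"a": "float64"})

print(len(df))    # 0
print(df.dtypes)  # a    float64
                  # b     object  (the defaultdict fallback for unlisted columns)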