예제 #1
0
    def _extract_multi_indexer_columns(self,
                                       header,
                                       index_names,
                                       col_names,
                                       passed_names: bool = False):
        """
        extract and return the names, index_names, col_names
        header is a list-of-lists returned from the parsers
        """
        if len(header) < 2:
            return header[0], index_names, col_names, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names,
                                                    self.index_col,
                                                    self.unnamed_cols)

        # extract the columns
        field_count = len(header[0])

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = ic + columns

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
                header = ",".join([str(x) for x in self.header])
                raise ParserError(
                    f"Passed header=[{header}] are too many rows "
                    "for this multi_index of columns")

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols)
                else None for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names
예제 #2
0
파일: _json.py 프로젝트: zeyu-gong/pandas
 def read(self):
     """
     Read the whole JSON input into a pandas object.

     Returns
     -------
     object
         The result of ``concat`` when both ``self.lines`` and
         ``self.chunksize`` are set, otherwise whatever
         ``self._get_object_parser`` returns.

     Notes
     -----
     ``self.close()`` runs on every exit path, so the reader is closed
     even when parsing raises.
     """
     try:
         if self.lines and self.chunksize:
             # The reader itself is handed to ``concat`` as the iterable
             # of pieces to combine.
             obj = concat(self)
         elif self.lines:
             # Line-delimited input: normalize to ``str``, split it into
             # lines and let ``_combine_lines`` merge them into one document.
             data = ensure_str(self.data)
             obj = self._get_object_parser(
                 self._combine_lines(data.split("\n")))
         else:
             obj = self._get_object_parser(self.data)
     finally:
         # Previously skipped when parsing failed, leaving the reader open.
         self.close()
     return obj
예제 #3
0
파일: _json.py 프로젝트: llawall/pandas
 def read(self):
     """
     Read the whole JSON input into a pandas object.

     Returns
     -------
     object
         The result of ``concat`` for chunked line-delimited input,
         otherwise whatever ``self._get_object_parser`` returns.

     Notes
     -----
     ``self.close()`` runs on every exit path, so the reader is closed
     even when parsing raises.
     """
     try:
         if self.lines:
             if self.chunksize:
                 # The reader itself is handed to ``concat`` as the
                 # iterable of pieces to combine.
                 obj = concat(self)
             elif self.nrows:
                 # Only pull the first ``nrows`` items out of ``self.data``.
                 lines = list(islice(self.data, self.nrows))
                 lines_json = self._combine_lines(lines)
                 obj = self._get_object_parser(lines_json)
             else:
                 data = ensure_str(self.data)
                 data_lines = data.split("\n")
                 obj = self._get_object_parser(
                     self._combine_lines(data_lines))
         else:
             obj = self._get_object_parser(self.data)
     finally:
         # Previously skipped when parsing failed, leaving the reader open.
         self.close()
     return obj
예제 #4
0
def construct_1d_arraylike_from_scalar(value, length: int, dtype):
    """
    Create a 1-D array of the given length and dtype filled with ``value``.

    Parameters
    ----------
    value : scalar value
        The value every element is set to.
    length : int
        Number of elements in the result.
    dtype : pandas_dtype / np.dtype
        Target dtype. May be an extension dtype, an ``np.dtype``, a plain
        class accepted by ``np.dtype`` (e.g. ``float``, ``str``), or any
        object exposing a ``.dtype`` attribute.

    Returns
    -------
    np.ndarray / pandas type of length, filled with value

    Notes
    -----
    * A missing ``value`` with an integer dtype and non-zero ``length``
      yields a ``float64`` result.
    * String dtypes (kind ``"U"`` / ``"S"``) yield an ``object`` result.
    """
    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence([value] * length, dtype=dtype)

    if isinstance(dtype, type):
        # Normalize class-form dtypes (``float``, ``str``, ``np.int64`` ...).
        # ``type(np.dtype)`` is not ``type`` on NumPy >= 1.20, so testing
        # against it misroutes plain classes into the ``.dtype`` lookup.
        dtype = np.dtype(dtype)
    elif not isinstance(dtype, np.dtype):
        dtype = dtype.dtype

    if length and is_integer_dtype(dtype) and isna(value):
        # coerce if we have nan for an integer dtype
        dtype = np.dtype("float64")
    elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
        # Coerce to object dtype: a fixed-width string array would
        # truncate the value to the (one character) width np.empty picks.
        dtype = object
        if not isna(value):
            value = ensure_str(value)

    subarr = np.empty(length, dtype=dtype)
    subarr.fill(value)
    return subarr
예제 #5
0
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: list | None,
        passed_names: bool = False,
    ):
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names where passed

        """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names,
                                                    self.index_col,
                                                    self.unnamed_cols)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(
                len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError(
                "Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
                header = ",".join([str(x) for x in self.header])
                raise ParserError(
                    f"Passed header=[{header}] are too many rows "
                    "for this multi_index of columns")

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]] if ((r[ic[0]] is not None)
                             and r[ic[0]] not in self.unnamed_cols) else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names