def corr(kdf: 'ks.DataFrame', method: str = 'pearson') -> pd.DataFrame:
    """
    The correlation matrix of all the numerical columns of this dataframe.

    Only accepts scalar numerical values for now.

    :param kdf: the koalas dataframe.
    :param method: {'pearson', 'spearman'}
                   * pearson : standard correlation coefficient
                   * spearman : Spearman rank correlation
    :return: :class:`pandas.DataFrame`

    >>> ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}).corr()
         A    B
    A  1.0 -1.0
    B -1.0  1.0
    """
    assert method in ('pearson', 'spearman')
    # Keep only the numeric columns; ``ndf`` is a Spark DataFrame with a single
    # vector column, ``column_index`` the surviving (tuple) column labels.
    ndf, column_index = to_numeric_df(kdf)
    corr = Correlation.corr(ndf, CORRELATION_OUTPUT_COLUMN, method)
    pcorr = corr.toPandas()
    # The result is a 1x1 pandas DataFrame holding a Spark Matrix; unpack it
    # into a plain numpy array.
    arr = pcorr.iloc[0, 0].toArray()
    if column_index_level(column_index) > 1:
        idx = pd.MultiIndex.from_tuples(column_index)
    else:
        # Single-level labels: use the first (only) element of each tuple.
        # NOTE: the loop variable is named ``label`` to avoid shadowing ``idx``.
        idx = pd.Index([label[0] for label in column_index])
    return pd.DataFrame(arr, columns=idx, index=idx)
    def __getitem__(self, key):
        """
        Positional/row-and-column selection.

        ``key`` is unfolded into a row selector and a column selector; the row
        selector yields a Spark ``cond`` (filter expression) and/or a ``limit``,
        and the column selector yields the target column index/scols plus
        whether a Series (single column) should be returned.
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond, limit = self._select_rows(rows_sel)
        column_index, columns, returns_series = self._select_cols(cols_sel)

        if cond is None and limit is None and returns_series:
            # No row filtering needed: reuse the existing internal frame and
            # just swap in the selected Spark column.
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0],
                                                  column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)
                if limit is not None:
                    if limit >= 0:
                        sdf = sdf.limit(limit)
                    else:
                        # Negative limit means "all but the last |limit| rows";
                        # note this triggers a count() job on the Spark side.
                        sdf = sdf.limit(sdf.count() + limit)
                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Manage column index names
                    # Keep only the trailing index-name levels that match the
                    # level of the selected column index.
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                # Spark raised during column resolution: surface as KeyError,
                # the pandas-compatible failure mode.
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf
    def column_index_level(self) -> int:
        """ Return the level of the column index. """
        # Delegates to the module-level ``column_index_level`` helper; the
        # method name shadows it only on instances, not in this scope.
        return column_index_level(self._column_index)
    def __getitem__(self, key):
        """
        Label-based selection for both Series and DataFrame anchors.

        For a Series anchor, ``key`` is a row selector only. For a DataFrame
        anchor, ``key`` may be a ``(rows, cols)`` pair. ``_select_rows``
        returns ``(cond, limit, remaining_index)`` where ``remaining_index``
        is the number of index levels left after selection (0 means a scalar
        access is expected).
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
                # Boolean key from a different dataframe: align it by copying
                # it into a temp column of a frame built from this Series,
                # then recurse on that frame.
                kdf = self._kdf_or_kser.to_frame()
                kdf['__temp_col__'] = key
                return type(self)(kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                # Selecting everything: return the anchor unchanged.
                return self._kdf_or_kser

            column_index = self._internal.column_index
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError("Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
                # Same alignment trick as above, for a DataFrame anchor; the
                # trailing selection restores the original column order.
                kdf = self._kdf_or_kser.copy()
                kdf['__temp_col__'] = rows_sel
                return type(self)(kdf)[kdf['__temp_col__'], cols_sel][list(self._kdf_or_kser.columns)]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_index, column_scols, returns_series = self._select_cols(cols_sel)

            if cond is None and limit is None and returns_series:
                # Single column, no row filtering: cheap path with no new
                # Spark plan.
                return Series(self._internal.copy(scol=column_scols[0],
                                                  column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)

        if remaining_index is not None:
            # Keep only the innermost ``remaining_index`` index levels.
            index_scols = self._internal.index_scols[-remaining_index:]
            index_map = self._internal.index_map[-remaining_index:]
        else:
            index_scols = self._internal.index_scols
            index_map = self._internal.index_map

        if self._internal.column_index_names is None:
            column_index_names = None
        else:
            # Manage column index names
            level = column_index_level(column_index)
            column_index_names = self._internal.column_index_names[-level:]

        try:
            sdf = self._internal._sdf
            if cond is not None:
                # NOTE(review): the natural-order column is dropped before
                # filtering — presumably so the filter can be pushed down;
                # confirm against _InternalFrame's ordering contract.
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    # Negative limit: all but the last |limit| rows (count() job).
                    sdf = sdf.limit(sdf.count() + limit)
            sdf = sdf.select(index_scols + column_scols)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in column_scols]))

        internal = _InternalFrame(sdf=sdf, index_map=index_map,
                                  column_index=column_index,
                                  column_index_names=column_index_names)
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                                 anchor=kdf)
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            # All index levels consumed: caller asked for a scalar. Fetch at
            # most 2 rows to distinguish empty / exactly-one / many matches.
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
    def __getitem__(self, key):
        """
        Label-based selection: dispatch on the type of the column selector.

        ``cols_sel`` may be None/full slice (all columns), a string or tuple
        (multi-index lookup), a Spark Column or Series (single column), or a
        list of names/tuples/Series/Columns. The branch order below is
        significant — each ``elif`` assumes the earlier shapes were ruled out.
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

        cond = self._select_rows(rows_sel)

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, Series):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            # Only the full slice ``[:]`` is supported for columns.
            raise LocIndexer._raiseNotImplemented(
                "Can only select columns either by name or reference or all")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None

        # A bare Spark Column selector yields a Series; may be overridden by
        # the multi-index lookup below.
        returns_series = cols_sel is not None and isinstance(cols_sel, spark.Column)

        if cols_sel is None:
            # Select all columns.
            column_index = self._internal.column_index
            columns = self._internal.column_scols
        elif isinstance(cols_sel, (str, tuple)):
            if isinstance(cols_sel, str):
                cols_sel = (cols_sel,)
            column_index, columns, returns_series = self._get_from_multiindex_column(cols_sel)
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
            column_index = None
        elif all(isinstance(key, Series) for key in cols_sel):
            columns = [_make_col(key) for key in cols_sel]
            column_index = [key._internal.column_index[0] for key in cols_sel]
        elif all(isinstance(key, spark.Column) for key in cols_sel):
            columns = cols_sel
            column_index = None
        elif (any(isinstance(key, str) for key in cols_sel)
                and any(isinstance(key, tuple) for key in cols_sel)):
            # Mixed str/tuple lists are ambiguous for a MultiIndex.
            raise TypeError('Expected tuple, got str')
        else:
            # List of plain names or of same-length tuples.
            if all(isinstance(key, tuple) for key in cols_sel):
                level = self._internal.column_index_level
                if any(len(key) != level for key in cols_sel):
                    raise ValueError('All the key level should be the same as column index level.')

            column_to_index = list(zip(self._internal.data_columns,
                                       self._internal.column_index))
            columns = []
            column_index = []
            for key in cols_sel:
                found = False
                for column, idx in column_to_index:
                    # Match either the full tuple or its first level, so a
                    # bare name can match a single-level index.
                    if idx == key or idx[0] == key:
                        columns.append(_make_col(column))
                        column_index.append(idx)
                        found = True
                if not found:
                    raise KeyError("['{}'] not in index".format(key))

        if cond is None and returns_series:
            # No row filtering: swap the Spark column in place.
            if self._is_series:
                return self._kdf_or_kser._with_new_scol(columns[0])
            else:
                return Series(self._internal.copy(scol=columns[0],
                                                  column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)
        else:
            try:
                sdf = self._internal._sdf
                if cond is not None:
                    sdf = sdf.where(cond)
                sdf = sdf.select(self._internal.index_scols + columns)

                if self._internal.column_index_names is None:
                    column_index_names = None
                else:
                    # Manage column index names
                    level = column_index_level(column_index)
                    column_index_names = self._internal.column_index_names[-level:]

                internal = _InternalFrame(sdf=sdf,
                                          index_map=self._internal.index_map,
                                          column_index=column_index,
                                          column_index_names=column_index_names)
                kdf = DataFrame(internal)
            except AnalysisException:
                # Column resolution failed on the Spark side: report it the
                # pandas way.
                raise KeyError('[{}] don\'t exist in columns'
                               .format([col._jc.toString() for col in columns]))

            if returns_series:
                return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                              anchor=kdf)
            else:
                return kdf