def equal_dtypes(a, b):
    if is_categorical_dtype(a) != is_categorical_dtype(b):
        return False
    if (isinstance(a, str) and a == '-') or (isinstance(b, str) and b == '-'):
        return False
    if is_categorical_dtype(a) and is_categorical_dtype(b):
        # Pandas 0.21 CategoricalDtype compat
        if (PANDAS_VERSION >= '0.21.0' and
                (UNKNOWN_CATEGORIES in a.categories or
                 UNKNOWN_CATEGORIES in b.categories)):
            return True
        return a == b
    return (a.kind in eq_types and b.kind in eq_types) or (a == b)
def describe(data):
    '''Generate summary statistics for every variable.

    For each variable the following fields are produced:
        dtype: data type
        max: maximum value / most frequent level
        min: minimum value / least frequent level
        mean: mean value / level whose frequency is the median
        missing_pct: missing-value ratio
        std/nunique: standard deviation / number of unique levels
    '''
    data = pd.DataFrame(data)
    n_sample = len(data)
    var_type = type_of_var(data, copy=True)
    summary = pd.DataFrame(columns=data.columns,
                           index=['dtype', 'max', 'min', 'mean',
                                  'missing_pct', 'std/nunique'])
    for c in data.columns:
        missing_pct = 1 - data[c].count() / n_sample
        if var_type[c] == 'number':
            max_value, min_value, mean_value = data[c].max(), data[c].min(), data[c].mean()
            std_value = data[c].std()
            summary.loc[:, c] = [var_type[c], max_value, min_value, mean_value,
                                 missing_pct, std_value]
        elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype):
            tmp = data[c].value_counts()
            max_value, min_value = tmp.argmax(), tmp.argmin()
            mean_value_index = tmp[tmp == tmp.median()].index
            mean_value = mean_value_index[0] if len(mean_value_index) > 0 else np.nan
            summary.loc[:, c] = [var_type[c], max_value, min_value, mean_value,
                                 missing_pct, len(tmp)]
        elif var_type[c] == 'datetime':
            max_value, min_value = data[c].max(), data[c].min()
            summary.loc[:, c] = [var_type[c], max_value, min_value, np.nan,
                                 missing_pct, np.nan]
        else:
            summary.loc[:, c] = [var_type[c], np.nan, np.nan, np.nan,
                                 missing_pct, np.nan]
    return summary
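# Hypothetical usage sketch for describe(), not part of the original source.
# It assumes pandas/numpy are imported as pd/np and that the type_of_var helper
# referenced above is available in the same module.
df = pd.DataFrame({'age': [21, 35, None, 40],
                   'gender': pd.Series(['m', 'f', 'f', 'm'], dtype='category')})
print(describe(df))  # one summary column per variable, indexed by the fields listed in the docstring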
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u'].

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    elif pdtypes.is_categorical_dtype(col):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    else:
        # unordered if unsure, e.g string columns that
        # are not categorical
        return 'u'
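# Hypothetical usage sketch, not part of the original source. It assumes pandas is
# imported as pd and that pdtypes above is the module's alias for pandas.api.types.
get_var_type(pd.Series([1.0, 2.5, 3.0]))                              # -> 'c' (continuous)
get_var_type(pd.Series(['a', 'b'], dtype='category'))                 # -> 'u' (unordered factor)
get_var_type(pd.Series(pd.Categorical(['lo', 'hi'], ordered=True)))   # -> 'o' (ordered factor)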
def clear_known_categories(x, cols=None, index=True):
    """Set categories to be unknown.

    Parameters
    ----------
    x : DataFrame, Series, Index
    cols : iterable, optional
        If x is a DataFrame, set only categoricals in these columns to unknown.
        By default, all categorical columns are set to unknown categoricals
    index : bool, optional
        If True and x is a Series or DataFrame, clear the known categories
        in the index as well.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        x = x.copy()
        if isinstance(x, pd.DataFrame):
            mask = x.dtypes == 'category'
            if cols is None:
                cols = mask[mask].index
            elif not mask.loc[cols].all():
                raise ValueError("Not all columns are categoricals")
            for c in cols:
                x[c].cat.set_categories([UNKNOWN_CATEGORIES], inplace=True)
        elif isinstance(x, pd.Series):
            if is_categorical_dtype(x.dtype):
                x.cat.set_categories([UNKNOWN_CATEGORIES], inplace=True)
        if index and isinstance(x.index, pd.CategoricalIndex):
            x.index = x.index.set_categories([UNKNOWN_CATEGORIES])
    elif isinstance(x, pd.CategoricalIndex):
        x = x.set_categories([UNKNOWN_CATEGORIES])
    return x
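# Hypothetical usage sketch, not part of the original source. UNKNOWN_CATEGORIES is
# the module-level sentinel that clear_known_categories relies on.
s = pd.Series(['a', 'b', 'a'], dtype='category')
cleared = clear_known_categories(s)
list(cleared.cat.categories)   # -> [UNKNOWN_CATEGORIES]; the known levels are discarded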
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Data older than time_last by more than this will be ignored

    Returns:
        pd.DataFrame : instances that match constraints
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        assert (isinstance(training_window, Timedelta) and
                training_window.is_absolute()),\
            "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df.copy()

    elif instance_vals.shape[0] == 0:
        df = self.df.head(0)

    elif variable_id is None or variable_id == self.index:
        df = self.df.reindex(instance_vals)
        df.dropna(subset=[self.index], inplace=True)

    else:
        df = self.df.merge(instance_vals.to_frame(variable_id),
                           how="inner", on=variable_id)
        df = df.set_index(self.index, drop=False)

        # ensure filtered df has same categories as original
        # workaround for issue below
        # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
        if pdtypes.is_categorical_dtype(self.df[variable_id]):
            categories = pd.api.types.CategoricalDtype(
                categories=self.df[variable_id].cat.categories)
            df[variable_id] = df[variable_id].astype(categories)

    df = self._handle_time(df=df,
                           time_last=time_last,
                           training_window=training_window)

    if columns is not None:
        df = df[columns]

    return df
def shard_df_on_index(df, divisions):
    """ Shard a DataFrame by ranges on its index

    Examples
    --------

    >>> df = pd.DataFrame({'a': [0, 10, 20, 30, 40], 'b': [5, 4 ,3, 2, 1]})
    >>> df
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1

    >>> shards = list(shard_df_on_index(df, [2, 4]))
    >>> shards[0]
        a  b
    0   0  5
    1  10  4

    >>> shards[1]
        a  b
    2  20  3
    3  30  2

    >>> shards[2]
        a  b
    4  40  1

    >>> list(shard_df_on_index(df, []))[0]  # empty case
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1
    """
    if isinstance(divisions, Iterator):
        divisions = list(divisions)
    if not len(divisions):
        yield df
    else:
        divisions = np.array(divisions)
        df = df.sort_index()
        index = df.index
        if is_categorical_dtype(index):
            index = index.as_ordered()
        indices = index.searchsorted(divisions)
        yield df.iloc[:indices[0]]
        for i in range(len(indices) - 1):
            yield df.iloc[indices[i]: indices[i + 1]]
        yield df.iloc[indices[-1]:]
def add_margins(df, vars, margins=True):
    """
    Add margins to a data frame.

    All margining variables will be converted to factors.

    Parameters
    ----------
    df : dataframe
        input data frame
    vars : list
        a list of 2 lists | tuples vectors giving the
        variables in each dimension
    margins : bool | list
        variable names to compute margins for.
        True will compute all possible margins.
    """
    margin_vars = _margins(vars, margins)
    if not margin_vars:
        return df

    # create margin dataframes
    margin_dfs = [df]
    for vlst in margin_vars[1:]:
        dfx = df.copy()
        for v in vlst:
            dfx.loc[0:, v] = '(all)'
        margin_dfs.append(dfx)

    merged = pd.concat(margin_dfs, axis=0)
    merged.reset_index(drop=True, inplace=True)

    # All margin columns become categoricals. The margin indicator
    # (all) needs to be added as the last level of the categories.
    categories = {}
    for v in itertools.chain(*vars):
        col = df[v]
        if not pdtypes.is_categorical_dtype(df[v].dtype):
            col = pd.Categorical(df[v])
        categories[v] = col.categories
        if '(all)' not in categories[v]:
            categories[v] = categories[v].insert(
                len(categories[v]), '(all)')

    for v in merged.columns.intersection(set(categories)):
        merged[v] = merged[v].astype(
            pdtypes.CategoricalDtype(categories[v]))

    return merged
def elements_and_dtype(elements, dtype, source=None): if source is None: prefix = "" else: prefix = "%s." % (source,) if elements is not None: st.check_strategy(elements, "%selements" % (prefix,)) else: with check("dtype is not None"): if dtype is None: raise InvalidArgument( ( "At least one of %(prefix)selements or %(prefix)sdtype " "must be provided." ) % {"prefix": prefix} ) with check("is_categorical_dtype"): if is_categorical_dtype(dtype): raise InvalidArgument( "%sdtype is categorical, which is currently unsupported" % (prefix,) ) dtype = try_convert(np.dtype, dtype, "dtype") if elements is None: elements = npst.from_dtype(dtype) elif dtype is not None: def convert_element(value): name = "draw(%selements)" % (prefix,) try: return np.array([value], dtype=dtype)[0] except TypeError: raise InvalidArgument( "Cannot convert %s=%r of type %s to dtype %s" % (name, value, type(value).__name__, dtype.str) ) except ValueError: raise InvalidArgument( "Cannot convert %s=%r to type %s" % (name, value, dtype.str) ) elements = elements.map(convert_element) assert elements is not None return elements, dtype
def elements_and_dtype(elements, dtype, source=None): if source is None: prefix = '' else: prefix = '%s.' % (source,) if elements is not None: st.check_strategy(elements, '%selements' % (prefix,)) else: with check('dtype is not None'): if dtype is None: raise InvalidArgument(( 'At least one of %(prefix)selements or %(prefix)sdtype ' 'must be provided.') % {'prefix': prefix}) with check('is_categorical_dtype'): if is_categorical_dtype(dtype): raise InvalidArgument( '%sdtype is categorical, which is currently unsupported' % ( prefix, )) dtype = st.try_convert(np.dtype, dtype, 'dtype') if elements is None: elements = npst.from_dtype(dtype) elif dtype is not None: def convert_element(value): name = 'draw(%selements)' % (prefix,) try: return np.array([value], dtype=dtype)[0] except TypeError: raise InvalidArgument( 'Cannot convert %s=%r of type %s to dtype %s' % ( name, value, type(value).__name__, dtype.str ) ) except ValueError: raise InvalidArgument( 'Cannot convert %s=%r to type %s' % ( name, value, dtype.str, ) ) elements = elements.map(convert_element) assert elements is not None return elements, dtype
def _nonempty_series(s, idx):
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        entry = s.cat.categories[0]
        data = pd.Categorical([entry, entry],
                              categories=s.cat.categories,
                              ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
def make_metadata(data, has_nulls=True, ignore_columns=[], fixed_text=None, object_encoding=None, times='int64', index_cols=[]): if not data.columns.is_unique: raise ValueError('Cannot create parquet dataset with duplicate' ' column names (%s)' % data.columns) pandas_metadata = {'index_columns': index_cols, 'columns': [], 'pandas_version': pd.__version__} root = parquet_thrift.SchemaElement(name='schema', num_children=0) meta = parquet_thrift.KeyValue() meta.key = 'pandas' fmd = parquet_thrift.FileMetaData(num_rows=len(data), schema=[root], version=1, created_by=created_by, row_groups=[], key_value_metadata=[meta]) object_encoding = object_encoding or {} for column in data.columns: if column in ignore_columns: continue pandas_metadata['columns'].append( get_column_metadata(data[column], column)) oencoding = (object_encoding if isinstance(object_encoding, STR_TYPE) else object_encoding.get(column, None)) fixed = None if fixed_text is None else fixed_text.get(column, None) if is_categorical_dtype(data[column].dtype): se, type = find_type(data[column].cat.categories, fixed_text=fixed, object_encoding=oencoding) se.name = column else: se, type = find_type(data[column], fixed_text=fixed, object_encoding=oencoding, times=times) col_has_nulls = has_nulls if has_nulls is None: se.repetition_type = data[column].dtype == "O" elif has_nulls is not True and has_nulls is not False: col_has_nulls = column in has_nulls if col_has_nulls: se.repetition_type = parquet_thrift.FieldRepetitionType.OPTIONAL fmd.schema.append(se) root.num_children += 1 meta.value = json.dumps(pandas_metadata, sort_keys=True) return fmd
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
    elif is_sparse(dtype):
        # TODO: pandas <0.24
        # Pandas <= 0.23.4:
        if PANDAS_GT_0240:
            entry = _scalar_from_dtype(dtype.subtype)
        else:
            entry = _scalar_from_dtype(dtype.subtype)
        data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
def _id_var(x, drop=False):
    """
    Assign ids to items in x. If two items
    are the same, they get the same id.

    Parameters
    ----------
    x : array-like
        items to associate ids with
    drop : bool
        Whether to drop unused factor levels
    """
    if len(x) == 0:
        return []

    categorical = pdtypes.is_categorical_dtype(x)

    if categorical:
        if drop:
            x = x.cat.remove_unused_categories()
            lst = list(x.cat.codes + 1)
        else:
            has_nan = any(np.isnan(i) for i in x if isinstance(i, float))
            if has_nan:
                # NaNs are -1, we give them the highest code
                nan_code = -1
                new_nan_code = np.max(x.cat.codes) + 1
                lst = [val if val != nan_code else new_nan_code
                       for val in x.cat.codes]
            else:
                lst = list(x.cat.codes + 1)
    else:
        try:
            levels = np.sort(np.unique(x))
        except TypeError:
            # x probably has NANs
            levels = multitype_sort(set(x))

        lst = match(x, levels)
        lst = [item + 1 for item in lst]

    return lst
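# Hypothetical usage sketch, not part of the original source. It assumes pandas/numpy
# are imported as pd/np and that the match/multitype_sort helpers referenced above are
# available in the same module.
_id_var(pd.Series(['b', 'a', 'b', 'c']).astype('category'))   # -> [2, 1, 2, 3] (codes + 1)
_id_var([10, 30, 10])                                          # -> [1, 2, 1] (ids from sorted levels)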
def _nonempty_series(s, idx=None):
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats, ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
def get_column_metadata(column, name):
    """Produce pandas column metadata block"""
    # from pyarrow.pandas_compat
    # https://github.com/apache/arrow/blob/master/python/pyarrow/pandas_compat.py
    inferred_dtype = infer_dtype(column)
    dtype = column.dtype

    if is_categorical_dtype(dtype):
        extra_metadata = {
            'num_categories': len(column.cat.categories),
            'ordered': column.cat.ordered,
        }
        dtype = column.cat.codes.dtype
    elif hasattr(dtype, 'tz'):
        extra_metadata = {'timezone': str(dtype.tz)}
    else:
        extra_metadata = None

    if not isinstance(name, six.string_types):
        raise TypeError(
            'Column name must be a string. Got column {} of type {}'.format(
                name, type(name).__name__
            )
        )

    return {
        'name': name,
        'pandas_type': {
            'string': 'bytes' if PY2 else 'unicode',
            'datetime64': (
                'datetimetz' if hasattr(dtype, 'tz') else 'datetime'
            ),
            'integer': str(dtype),
            'floating': str(dtype),
        }.get(inferred_dtype, inferred_dtype),
        'numpy_type': get_numpy_type(dtype),
        'metadata': extra_metadata,
    }
def strip_unknown_categories(x):
    """Replace any unknown categoricals with empty categoricals.

    Useful for preventing ``UNKNOWN_CATEGORIES`` from leaking into results.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        x = x.copy()
        if isinstance(x, pd.DataFrame):
            cat_mask = x.dtypes == 'category'
            if cat_mask.any():
                cats = cat_mask[cat_mask].index
                for c in cats:
                    if not has_known_categories(x[c]):
                        x[c].cat.set_categories([], inplace=True)
        elif isinstance(x, pd.Series):
            if is_categorical_dtype(x.dtype) and not has_known_categories(x):
                x.cat.set_categories([], inplace=True)
        if (isinstance(x.index, pd.CategoricalIndex) and not
                has_known_categories(x.index)):
            x.index = x.index.set_categories([])
    elif isinstance(x, pd.CategoricalIndex) and not has_known_categories(x):
        x = x.set_categories([])
    return x
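# Hypothetical usage sketch, not part of the original source. It pairs with the
# clear_known_categories and has_known_categories helpers from this module.
s = pd.Series(['x', 'y'], dtype='category')
unknown = clear_known_categories(s)          # categories replaced by the UNKNOWN_CATEGORIES sentinel
restored = strip_unknown_categories(unknown)
list(restored.cat.categories)                # -> []; the unknown marker no longer leaks into results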
def dtype_detection(data, category_detection=True, StructureText_detection=True,
                    datetime_to_category=True, criterion='sqrt',
                    min_mean_counts=5, fix=False):
    '''Detect the data type of a single variable.

    The variable is classified into one of the following types:
    1. number: numeric
    2. category: factor
    3. datetime: datetime
    4. text: free text
    5. text_st: structured text, e.g. IDs
    6. group_number: grouped/continuous numeric

    parameter
    ---------
    data: pd.Series, one-dimensional only
        # if data is given, the function may change its original dtype
    category_detection: bool, use nunique to decide whether the variable is a factor
    StructureText_detection: bool, detect structured text, e.g. every value in the
        column contains the same separator such as "-"
    datetime_to_category: whether a datetime series with too few unique values is
        converted into a factor
    criterion: string or int, optional (default="sqrt", i.e. the square root of the
        sample size). Supported values: 'sqrt' (square root of the sample size),
        an int (absolute count), or a float in (0, 1) (fraction of the sample size).
        When detecting factors, a feature whose nunique is below criterion is
        treated as a factor.
    min_mean_counts: default 5. A numeric variable is only treated as a factor when
        the average frequency per level is at least min_mean_counts.
    fix: bool, whether to also return the data with the corrected dtype

    return:
    result: dict{
        'name': column name,
        'vtype': variable type,
        'ordered': whether it is an ordered factor,
        'categories': all factor levels}
    '''
    assert len(data.shape) == 1
    data = data.copy()
    data = pd.Series(data)
    dtype, name, n_sample = data.dtype, data.name, data.count()

    if criterion == 'sqrt':
        max_nuniques = np.sqrt(n_sample)
    elif isinstance(criterion, int):
        max_nuniques = criterion
    elif isinstance(criterion, float) and (0 < criterion < 1):
        max_nuniques = criterion
    else:
        max_nuniques = np.sqrt(n_sample)
    ordered = False
    categories = []
    if is_numeric_dtype(dtype):
        vtype = 'number'
        ordered = False
        categories = []
        # correct misclassified dtypes, e.g. turn 1.0, 2.0, 3.0 into 1, 2, 3
        if data.dropna().astype(np.int64).sum() == data.dropna().sum():
            data[data.notnull()] = data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique = len(data.dropna().unique())
            mean_counts = data.value_counts().median()
            if nunique < max_nuniques and mean_counts >= min_mean_counts:
                data = data.astype('category')
                ordered = data.cat.ordered
                vtype = 'category'
                categories = list(data.dropna().cat.categories)
        result = {'name': name, 'vtype': vtype, 'ordered': ordered,
                  'categories': categories}
    elif is_string_dtype(dtype):
        # handle datetime-like strings
        tmp = data.map(lambda x: np.nan if '%s' % x == 'nan' else len('%s' % x))
        tmp = tmp.dropna().astype(np.int64)
        if not(any(data.dropna().map(is_number))) and 7 < tmp.max() < 20 and tmp.std() < 0.1:
            try:
                data = pd.to_datetime(data)
            except:
                pass
        # handle possible factor types
        # whether datetime-like columns may also be converted into factors
        if datetime_to_category:
            if len(data.dropna().unique()) < np.sqrt(n_sample):
                data = data.astype('category')
        else:
            nunique = len(data.dropna().unique())
            if not(is_categorical_dtype(data.dtype)) and not(np.issubdtype(data.dtype, np.datetime64)) and nunique < max_nuniques:
                data = data.astype('category')
        # for non-factor columns, convert percentages into floats, e.g. 21.12% --> 0.2112
        if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')):
            data = data.str.strip('%').astype(np.float64) / 100
        if is_categorical_dtype(data.dtype):
            vtype = 'category'
            categories = list(data.cat.categories)
            ordered = data.cat.ordered
        # datetime
        elif np.issubdtype(data.dtype, np.datetime64):
            vtype = 'datetime'
        # structured text?
        elif StructureText_detection and tmp.dropna().std() == 0:
            # not iterable, not a string
            if not(isinstance(data.dropna().iloc[0], Iterable)):
                vtype = 'text'
            else:
                k = set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x, str) and len(x) > 0:
                        k &= set(list(x))
                if len(k) > 0:
                    vtype = 'text_st'
                else:
                    vtype = 'text'
        elif is_numeric_dtype(data.dtype):
            vtype = 'number'
            ordered = False
            categories = []
        else:
            vtype = 'text'
        result = {'name': name, 'vtype': vtype, 'ordered': ordered,
                  'categories': categories}
    elif is_datetime64_any_dtype(dtype):
        vtype = 'datetime'
        result = {'name': name, 'vtype': vtype, 'ordered': ordered,
                  'categories': categories}
    else:
        print('unknown dtype!')
        result = None

    if fix:
        return result, data
    else:
        return result
def _get_color_values(adata, value_to_plot, groups=None, palette=None, use_raw=False): """ Returns the value or color associated to each data point. For categorical data, the return value is list of colors taken from the category palette or from the given `palette` value. For non-categorical data, the values are returned """ ### # when plotting, the color of the dots is determined for each plot # the data is either categorical or continuous and the data could be in # 'obs' or in 'var' categorical = False if value_to_plot is None: color_vector = 'lightgray' # check if value to plot is in obs elif value_to_plot in adata.obs.columns: if is_categorical_dtype(adata.obs[value_to_plot]): categorical = True if palette: # use category colors base on given palette _set_colors_for_categorical_obs(adata, value_to_plot, palette) else: if value_to_plot + '_colors' not in adata.uns or \ len(adata.uns[value_to_plot + '_colors']) < len(adata.obs[value_to_plot].cat.categories): # set a default palette in case that no colors or few colors are found _set_default_colors_for_categorical_obs( adata, value_to_plot) else: # check that the colors in 'uns' are valid _palette = [] for color in adata.uns[value_to_plot + '_colors']: if not is_color_like(color): # check if the color is a valid R color and translate it # to a valid hex color value if color in utils.additional_colors: color = utils.additional_colors[color] else: logg.warn( "The following color value found in adata.uns['{}'] " " is not valid: '{}'. Default colors are used." .format(value_to_plot + '_colors', color)) _set_default_colors_for_categorical_obs( adata, value_to_plot) _palette = None break _palette.append(color) if _palette is not None: adata.uns[value_to_plot + '_colors'] = _palette # for categorical data, colors should be # stored in adata.uns[value_to_plot + '_colors'] # Obtain color vector by converting every category # into its respective color color_vector = [ adata.uns[value_to_plot + '_colors'][x] for x in adata.obs[value_to_plot].cat.codes ] if groups is not None: if isinstance(groups, str): groups = [groups] color_vector = np.array(color_vector, dtype='<U15') # set color to 'light gray' for all values # that are not in the groups color_vector[~adata.obs[value_to_plot].isin(groups )] = "lightgray" else: color_vector = adata.obs[value_to_plot] # check if value to plot is in var elif use_raw is False and value_to_plot in adata.var_names: color_vector = adata[:, value_to_plot].X elif use_raw is True and value_to_plot in adata.raw.var_names: color_vector = adata.raw[:, value_to_plot].X else: raise ValueError( "The passed `color` {} is not a valid observation annotation " "or variable name. Valid observation annotation keys are: {}". format(value_to_plot, adata.obs.columns)) return color_vector, categorical
def paga_path(adata, nodes, keys, use_raw=True, annotations=['dpt_pseudotime'], color_map=None, color_maps_annotations={'dpt_pseudotime': 'Greys'}, palette_groups=None, n_avg=1, groups_key=None, xlim=[None, None], title=None, left_margin=None, ytick_fontsize=None, title_fontsize=None, show_node_names=True, show_yticks=True, show_colorbar=True, legend_fontsize=None, legend_fontweight=None, normalize_to_zero_one=False, as_heatmap=True, return_data=False, show=None, save=None, ax=None): """Gene expression and annotation changes along paths in the abstracted graph. Parameters ---------- adata : :class:`~scanpy.api.AnnData` An annotated data matrix. nodes : list of group names or their category indices A path through nodes of the abstracted graph, that is, names or indices (within `.categories`) of groups that have been used to run PAGA. keys : list of str Either variables in `adata.var_names` or annotations in `adata.obs`. They are plotted using `color_map`. use_raw : `bool`, optional (default: `True`) Use `adata.raw` for retrieving gene expressions if it has been set. annotations : list of annotations, optional (default: ['dpt_pseudotime']) Plot these keys with `color_maps_annotations`. Need to be keys for `adata.obs`. color_map : color map for plotting keys or `None`, optional (default: `None`) Matplotlib colormap. color_maps_annotations : dict storing color maps or `None`, optional (default: {'dpt_pseudotime': 'Greys'}) Color maps for plotting the annotations. Keys of the dictionary must appear in `annotations`. palette_groups : list of colors or `None`, optional (default: `None`) Ususally, use the same `sc.pl.palettes...` as used for coloring the abstracted graph. n_avg : `int`, optional (default: 1) Number of data points to include in computation of running average. groups_key : `str`, optional (default: `None`) Key of the grouping used to run PAGA. If `None`, defaults to `adata.uns['paga']['groups']`. as_heatmap : `bool`, optional (default: `True`) Plot the timeseries as heatmap. If not plotting as heatmap, `annotations` have no effect. show_node_names : `bool`, optional (default: `True`) Plot the node names on the nodes bar. show_colorbar : `bool`, optional (default: `True`) Show the colorbar. show_yticks : `bool`, optional (default: `True`) Show the y ticks. normalize_to_zero_one : `bool`, optional (default: `True`) Shift and scale the running average to [0, 1] per gene. return_data : `bool`, optional (default: `False`) Return the timeseries data in addition to the axes if `True`. show : `bool`, optional (default: `None`) Show the plot, do not return axis. save : `bool` or `str`, optional (default: `None`) If `True` or a `str`, save the figure. A string is appended to the default filename. Infer the filetype if ending on \{'.pdf', '.png', '.svg'\}. ax : `matplotlib.Axes` A matplotlib axes object. Returns ------- A `matplotlib.Axes`, if `ax` is `None`, else `None`. If `return_data`, return the timeseries data in addition to an axes. 
""" ax_was_none = ax is None if groups_key is None: if 'groups' not in adata.uns['paga']: raise KeyError( 'Pass the key of the grouping with which you ran PAGA, ' 'using the parameter `groups_key`.') groups_key = adata.uns['paga']['groups'] groups_names = adata.obs[groups_key].cat.categories if palette_groups is None: utils.add_colors_for_categorical_sample_annotation(adata, groups_key) palette_groups = adata.uns[groups_key + '_colors'] def moving_average(a): return sc_utils.moving_average(a, n_avg) ax = pl.gca() if ax is None else ax from matplotlib import transforms trans = transforms.blended_transform_factory(ax.transData, ax.transAxes) X = [] x_tick_locs = [0] x_tick_labels = [] groups = [] anno_dict = {anno: [] for anno in annotations} if isinstance(nodes[0], str): nodes_ints = [] groups_names_set = set(groups_names) for node in nodes: if node not in groups_names_set: raise ValueError( 'Each node/group needs to be one of {} (`groups_key`=\'{}\') not \'{}\'.' .format(groups_names.tolist(), groups_key, node)) nodes_ints.append(groups_names.get_loc(node)) nodes_strs = nodes else: nodes_ints = nodes nodes_strs = [groups_names[node] for node in nodes] adata_X = adata if use_raw and adata.raw is not None: adata_X = adata.raw for ikey, key in enumerate(keys): x = [] for igroup, group in enumerate(nodes_ints): idcs = np.arange(adata.n_obs)[adata.obs[groups_key].values == nodes_strs[igroup]] if len(idcs) == 0: raise ValueError( 'Did not find data points that match ' '`adata.obs[{}].values == str({})`.' 'Check whether adata.obs[{}] actually contains what you expect.' .format(groups_key, group, groups_key)) idcs_group = np.argsort(adata.obs['dpt_pseudotime'].values[ adata.obs[groups_key].values == nodes_strs[igroup]]) idcs = idcs[idcs_group] if key in adata.obs_keys(): x += list(adata.obs[key].values[idcs]) else: x += list(adata_X[:, key].X[idcs]) if ikey == 0: groups += [group for i in range(len(idcs))] x_tick_locs.append(len(x)) for anno in annotations: series = adata.obs[anno] if is_categorical_dtype(series): series = series.cat.codes anno_dict[anno] += list(series.values[idcs]) if n_avg > 1: old_len_x = len(x) x = moving_average(x) if ikey == 0: for key in annotations: if not isinstance(anno_dict[key][0], str): anno_dict[key] = moving_average(anno_dict[key]) if normalize_to_zero_one: x -= np.min(x) x /= np.max(x) X.append(x) if not as_heatmap: ax.plot(x[xlim[0]:xlim[1]], label=key) if ikey == 0: for igroup, group in enumerate(nodes): if len(groups_names) > 0 and group not in groups_names: label = groups_names[group] else: label = group x_tick_labels.append(label) X = np.array(X) if as_heatmap: img = ax.imshow(X, aspect='auto', interpolation='nearest', cmap=color_map) if show_yticks: ax.set_yticks(range(len(X))) ax.set_yticklabels(keys, fontsize=ytick_fontsize) else: ax.set_yticks([]) ax.set_frame_on(False) ax.set_xticks([]) ax.tick_params(axis='both', which='both', length=0) ax.grid(False) if show_colorbar: pl.colorbar(img, ax=ax) left_margin = 0.2 if left_margin is None else left_margin pl.subplots_adjust(left=left_margin) else: left_margin = 0.4 if left_margin is None else left_margin if len(keys) > 1: pl.legend(frameon=False, loc='center left', bbox_to_anchor=(-left_margin, 0.5), fontsize=legend_fontsize) xlabel = groups_key if not as_heatmap: ax.set_xlabel(xlabel) pl.yticks([]) if len(keys) == 1: pl.ylabel(keys[0] + ' (a.u.)') else: import matplotlib.colors # groups bar ax_bounds = ax.get_position().bounds groups_axis = pl.axes([ ax_bounds[0], ax_bounds[1] - ax_bounds[3] / len(keys), 
ax_bounds[2], ax_bounds[3] / len(keys) ]) groups = np.array(groups)[None, :] groups_axis.imshow( groups, aspect='auto', interpolation="nearest", cmap=matplotlib.colors.ListedColormap( # the following line doesn't work because of normalization # adata.uns['paga_groups_colors']) palette_groups[np.min(groups).astype(int):], N=int(np.max(groups) + 1 - np.min(groups)))) if show_yticks: groups_axis.set_yticklabels(['', xlabel, ''], fontsize=ytick_fontsize) else: groups_axis.set_yticks([]) groups_axis.set_frame_on(False) if show_node_names: ypos = (groups_axis.get_ylim()[1] + groups_axis.get_ylim()[0]) / 2 x_tick_locs = sc_utils.moving_average(x_tick_locs, n=2) for ilabel, label in enumerate(x_tick_labels): groups_axis.text(x_tick_locs[ilabel], ypos, x_tick_labels[ilabel], fontdict={ 'horizontalalignment': 'center', 'verticalalignment': 'center' }) groups_axis.set_xticks([]) groups_axis.grid(False) groups_axis.tick_params(axis='both', which='both', length=0) # further annotations y_shift = ax_bounds[3] / len(keys) for ianno, anno in enumerate(annotations): if ianno > 0: y_shift = ax_bounds[3] / len(keys) / 2 anno_axis = pl.axes([ ax_bounds[0], ax_bounds[1] - (ianno + 2) * y_shift, ax_bounds[2], y_shift ]) arr = np.array(anno_dict[anno])[None, :] if anno not in color_maps_annotations: color_map_anno = ('Vega10' if is_categorical_dtype( adata.obs[anno]) else 'Greys') else: color_map_anno = color_maps_annotations[anno] img = anno_axis.imshow(arr, aspect='auto', interpolation='nearest', cmap=color_map_anno) if show_yticks: anno_axis.set_yticklabels(['', anno, ''], fontsize=ytick_fontsize) anno_axis.tick_params(axis='both', which='both', length=0) else: anno_axis.set_yticks([]) anno_axis.set_frame_on(False) anno_axis.set_xticks([]) anno_axis.grid(False) if title is not None: ax.set_title(title, fontsize=title_fontsize) if show is None and not ax_was_none: show = False else: show = settings.autoshow if show is None else show utils.savefig_or_show('paga_path', show=show, save=save) if return_data: df = pd.DataFrame(data=X.T, columns=keys) df['groups'] = moving_average( groups) # groups is without moving average, yet if 'dpt_pseudotime' in anno_dict: df['distance'] = anno_dict['dpt_pseudotime'].T return ax, df if ax_was_none and show == False else df else: return ax if ax_was_none and show == False else None
def test_get_col(self):
    self.assertIsInstance(self.explainer.get_col("Sex"), pd.Series)
    self.assertTrue(is_categorical_dtype(self.explainer.get_col("Sex")))

    self.assertIsInstance(self.explainer.get_col("Age"), pd.Series)
    self.assertTrue(is_numeric_dtype(self.explainer.get_col("Age")))
def _transform_cudf_df( data, feature_names: FeatNamesT, feature_types: Optional[List[str]], enable_categorical: bool, ): try: from cudf.api.types import is_categorical_dtype except ImportError: from cudf.utils.dtypes import is_categorical_dtype if _is_cudf_ser(data): dtypes = [data.dtype] else: dtypes = data.dtypes if not all(dtype.name in _pandas_dtype_mapper or (is_categorical_dtype(dtype) and enable_categorical) for dtype in dtypes): _invalid_dataframe_dtype(data) # handle feature names if feature_names is None: if _is_cudf_ser(data): feature_names = [data.name] elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"): feature_names = [ " ".join([str(x) for x in i]) for i in data.columns ] elif (lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex") or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index") # Unique to cuDF, no equivalence in pandas 1.3.3 or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")): feature_names = list(map(str, data.columns)) else: feature_names = data.columns.format() # handle feature types if feature_types is None: feature_types = [] for dtype in dtypes: if is_categorical_dtype(dtype) and enable_categorical: feature_types.append(CAT_T) else: feature_types.append(_pandas_dtype_mapper[dtype.name]) # handle categorical data cat_codes = [] if _is_cudf_ser(data): # unlike pandas, cuDF uses NA for missing data. if is_categorical_dtype(data.dtype) and enable_categorical: codes = data.cat.codes cat_codes.append(codes) else: for col in data: if is_categorical_dtype(data[col].dtype) and enable_categorical: codes = data[col].cat.codes cat_codes.append(codes) return data, cat_codes, feature_names, feature_types
def _get_color_values(adata, value_to_plot, groups=None, palette=None, use_raw=False, gene_symbols=None, layer=None) -> Tuple[Union[np.ndarray, str], bool]: """ Returns the value or color associated to each data point. For categorical data, the return value is list of colors taken from the category palette or from the given `palette` value. For non-categorical data, the values are returned Returns ------- Tuple of values to plot, and boolean indicating whether they are categorical. """ if value_to_plot is None: return "lightgray", False if (gene_symbols is not None and value_to_plot not in adata.obs.columns and value_to_plot not in adata.var_names): # We should probably just make an index for this, and share it over runs value_to_plot = adata.var.index[ adata.var[gene_symbols] == value_to_plot][ 0] # TODO: Throw helpful error if this doesn't work if use_raw and value_to_plot not in adata.obs.columns: values = adata.raw.obs_vector(value_to_plot) else: values = adata.obs_vector(value_to_plot, layer=layer) ### # when plotting, the color of the dots is determined for each plot # the data is either categorical or continuous and the data could be in # 'obs' or in 'var' if not is_categorical_dtype(values): return values, False else: # is_categorical_dtype(values) color_key = f"{value_to_plot}_colors" if palette: _set_colors_for_categorical_obs(adata, value_to_plot, palette) elif color_key not in adata.uns or \ len(adata.uns[color_key]) < len(values.categories): # set a default palette in case that no colors or few colors are found _set_default_colors_for_categorical_obs(adata, value_to_plot) else: _palette = [] for color in adata.uns[color_key]: if not is_color_like(color): # check if the color is a valid R color and translate it # to a valid hex color value if color in _utils.additional_colors: color = _utils.additional_colors[color] else: logg.warning( f"The following color value found in adata.uns['{value_to_plot}_colors'] " f"is not valid: '{color}'. Default colors are used." ) _set_default_colors_for_categorical_obs( adata, value_to_plot) _palette = None break _palette.append(color) if _palette is not None: adata.uns[color_key] = _palette color_vector = np.asarray(adata.uns[color_key])[values.codes] # Handle groups if groups is not None: if isinstance(groups, str): groups = [groups] color_vector = np.array(color_vector, dtype='<U15') # set color to 'light gray' for all values # that are not in the groups color_vector[~adata.obs[value_to_plot].isin(groups)] = "lightgray" return color_vector, True
def regress_out(adata, keys, n_jobs=None, copy=False): """Regress out unwanted sources of variation. Uses simple linear regression. This is inspired by Seurat's `regressOut` function in R [Satija15]. Parameters ---------- adata : :class:`~anndata.AnnData` The annotated data matrix. keys : `str` or list of `str` Keys for observation annotation on which to regress on. n_jobs : `int` or `None`, optional. If None is given, then the n_jobs seting is used (default: `None`) Number of jobs for parallel computation. copy : `bool`, optional (default: `False`) If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Returns ------- AnnData, None Depending on `copy` returns or updates `adata` with the corrected data matrix. """ logg.info('regressing out', keys, r=True) if issparse(adata.X): logg.info(' sparse input is densified and may ' 'lead to high memory use') adata = adata.copy() if copy else adata if isinstance(keys, str): keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() n_jobs = sett.n_jobs if n_jobs is None else n_jobs # regress on a single categorical variable sanitize_anndata(adata) variable_is_categorical = False if keys[0] in adata.obs_keys() and is_categorical_dtype( adata.obs[keys[0]]): if len(keys) > 1: raise ValueError('If providing categorical variable, ' 'only a single one is allowed. For this one ' 'we regress on the mean for each category.') logg.msg('... regressing on per-gene means within categories') regressors = np.zeros(adata.X.shape, dtype='float32') for category in adata.obs[keys[0]].cat.categories: mask = (category == adata.obs[keys[0]]).values for ix, x in enumerate(adata.X.T): regressors[mask, ix] = x[mask].mean() variable_is_categorical = True # regress on one or several ordinal variables else: # create data frame with selected keys (if given) if keys: regressors = adata.obs[keys] else: regressors = adata.obs.copy() # add column of ones at index 0 (first column) regressors.insert(0, 'ones', 1.0) len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int) n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int) tasks = [] # split the adata.X matrix by columns in chunks of size n_chunk (the last chunk could be of smaller # size than the others) chunk_list = np.array_split(adata.X, n_chunks, axis=1) if variable_is_categorical: regressors_chunk = np.array_split(regressors, n_chunks, axis=1) for idx, data_chunk in enumerate(chunk_list): # each task is a tuple of a data_chunk eg. (adata.X[:,0:100]) and # the regressors. This data will be passed to each of the jobs. if variable_is_categorical: regres = regressors_chunk[idx] else: regres = regressors tasks.append(tuple((data_chunk, regres, variable_is_categorical))) if n_jobs > 1 and n_chunks > 1: import multiprocessing pool = multiprocessing.Pool(n_jobs) res = pool.map_async(_regress_out_chunk, tasks).get(9999999) pool.close() else: res = list(map(_regress_out_chunk, tasks)) # res is a list of vectors (each corresponding to a regressed gene column). # The transpose is needed to get the matrix in the shape needed adata.X = np.vstack(res).T.astype(adata.X.dtype) logg.info(' finished', t=True) return adata if copy else None
def de_analysis( data: Union[MultimodalData, UnimodalData, AnnData], cluster: str, condition: Optional[str] = None, subset: Optional[List[str]] = None, de_key: Optional[str] = "de_res", n_jobs: Optional[int] = -1, t: Optional[bool] = False, fisher: Optional[bool] = False, temp_folder: Optional[str] = None, verbose: Optional[bool] = True, ) -> None: """Perform Differential Expression (DE) Analysis on data. The analysis considers one cluster at one time, comparing gene expression levels on cells within the cluster with all the others using a number of statistical tools, and determining up-regulated genes and down-regulated genes of the cluster. Mann-Whitney U test and AUROC are calculated by default. Welch's T test and Fisher's Exact test are optionally. The scalability performance on calculating all the test statistics is improved by the inspiration from `Presto <https://github.com/immunogenomics/presto>`_. Parameters ---------- data: ``MultimodalData``, ``UnimodalData``, or ``anndata.AnnData`` Data matrix with rows for cells and columns for genes. cluster: ``str`` Cluster labels used in DE analysis. Must exist in ``data.obs``. condition: ``str``, optional, default: ``None`` Sample attribute used as condition in DE analysis. If ``None``, no condition is considered; otherwise, must exist in ``data.obs``. If ``condition`` is used, the DE analysis will be performed on cells of each level of ``data.obs[condition]`` respectively, and collect the results after finishing. subset: ``List[str]``, optional, default: ``None`` Perform DE analysis on only a subset of cluster IDs. Cluster ID subset is specified as a list of strings, such as ``[clust_1,clust_3,clust_5]``, where all IDs must exist in ``data.obs[cluster]``. de_key: ``str``, optional, default: ``"de_res"`` Key name of DE analysis results stored. n_jobs: ``int``, optional, default: ``-1`` Number of threads to use. If ``-1``, use all available threads. t: ``bool``, optional, default: ``True`` If ``True``, calculate Welch's t test. fisher: ``bool``, optional, default: ``False`` If ``True``, calculate Fisher's exact test. temp_folder: ``str``, optional, default: ``None`` Joblib temporary folder for memmapping numpy arrays. verbose: ``bool``, optional, default: ``True`` If ``True``, show detailed intermediate output. Returns ------- ``None`` Update ``data.varm``: ``data.varm[de_key]``: DE analysis result. 
Examples -------- >>> pg.de_analysis(data, cluster='spectral_leiden_labels') >>> pg.de_analysis(data, cluster='louvain_labels', condition='anno') """ if cluster not in data.obs: raise ValueError("Cannot find cluster label!") cluster_labels = data.obs[cluster].values if not is_categorical_dtype(cluster_labels): from natsort import natsorted cluster_labels = pd.Categorical(cluster_labels, natsorted(np.unique(cluster_labels))) cond_labels = None if condition is not None: if condition not in data.obs: raise ValueError("Cannot find condition!") cond_labels = data.obs[condition].values if not is_categorical_dtype(cond_labels): from natsort import natsorted cond_labels = pd.Categorical(cond_labels, natsorted(np.unique(cond_labels))) if cond_labels.categories.size < 2: raise ValueError("Number of conditions must be at least 2!") X = data.X if isinstance(data.X, csr_matrix) else csr_matrix(data.X) # If dense matrix, force it to be a csr_matrix if subset is not None: # subset data for de analysis subset = np.array(subset) idx_s = np.isin(subset, cluster_labels.categories.values) if idx_s.sum() < subset.size: raise ValueError( "These cluster labels do not exist: " + ",".join(subset[~idx_s]) + "!" ) idx = np.isin(cluster_labels, subset) cluster_labels = pd.Categorical(cluster_labels[idx], categories = subset) if cond_labels is not None: cond_labels = cond_labels[idx] X = X[idx] if condition is not None: #Eliminate NaN rows from calculation idx_na = cond_labels.isna() if idx_na.sum() > 0: logger.warning("Detected NaN values in condition. Cells with NaN values are excluded from DE analysis.") idx_not_na = ~idx_na X = X[idx_not_na] cluster_labels = cluster_labels[idx_not_na] cond_labels = cond_labels[idx_not_na] n_jobs = eff_n_jobs(n_jobs) gene_names = data.var_names.values if cond_labels is None: df = _de_test(X, cluster_labels, gene_names, n_jobs, t, fisher, temp_folder, verbose) else: df = _de_test_cond(X, cluster_labels, cond_labels, gene_names, n_jobs, t, fisher, temp_folder, verbose) data.varm[de_key] = df.to_records(index=False) logger.info("Differential expression analysis is finished.")
def _paga_graph( adata, ax, solid_edges=None, dashed_edges=None, adjacency_solid=None, adjacency_dashed=None, transitions=None, threshold=None, root=0, colors=None, labels=None, fontsize=None, fontweight=None, fontoutline=None, text_kwds=None, node_size_scale=1., node_size_power=0.5, edge_width_scale=1., normalize_to_color='reference', title=None, pos=None, cmap=None, frameon=True, min_edge_width=None, max_edge_width=None, export_to_gexf=False, cax=None, colorbar=None, use_raw=True, cb_kwds=None, single_component=False, arrowsize=30, ): import networkx as nx if text_kwds is None: text_kwds = {} if cb_kwds is None: cb_kwds = {} node_labels = labels # rename for clarity if (node_labels is not None and isinstance(node_labels, str) and node_labels != adata.uns['paga']['groups']): raise ValueError( 'Provide a list of group labels for the PAGA groups {}, not {}.'. format(adata.uns['paga']['groups'], node_labels)) groups_key = adata.uns['paga']['groups'] if node_labels is None: node_labels = adata.obs[groups_key].cat.categories if (colors is None or colors == groups_key) and groups_key is not None: if (groups_key + '_colors' not in adata.uns or len(adata.obs[groups_key].cat.categories) != len( adata.uns[groups_key + '_colors'])): utils.add_colors_for_categorical_sample_annotation( adata, groups_key) colors = adata.uns[groups_key + '_colors'] for iname, name in enumerate(adata.obs[groups_key].cat.categories): if name in settings.categories_to_ignore: colors[iname] = 'grey' nx_g_solid = nx.Graph(adjacency_solid) if dashed_edges is not None: nx_g_dashed = nx.Graph(adjacency_dashed) # convert pos to array and dict if not isinstance(pos, (Path, str)): pos_array = pos else: pos = Path(pos) if pos.suffix != '.gdf': raise ValueError( 'Currently only supporting reading positions from .gdf files. 
' 'Consider generating them using, for instance, Gephi.') s = '' # read the node definition from the file with pos.open() as f: f.readline() for line in f: if line.startswith('edgedef>'): break s += line from io import StringIO df = pd.read_csv(StringIO(s), header=-1) pos_array = df[[4, 5]].values # convert to dictionary pos = {n: [p[0], p[1]] for n, p in enumerate(pos_array)} # uniform color if isinstance(colors, str) and is_color_like(colors): colors = [colors for c in range(len(node_labels))] # color degree of the graph if isinstance(colors, str) and colors.startswith('degree'): # see also tools.paga.paga_degrees if colors == 'degree_dashed': colors = [d for _, d in nx_g_dashed.degree(weight='weight')] elif colors == 'degree_solid': colors = [d for _, d in nx_g_solid.degree(weight='weight')] else: raise ValueError( '`degree` either "degree_dashed" or "degree_solid".') colors = (np.array(colors) - np.min(colors)) / (np.max(colors) - np.min(colors)) # plot gene expression var_names = adata.var_names if adata.raw is None else adata.raw.var_names if isinstance(colors, str) and colors in var_names: x_color = [] cats = adata.obs[groups_key].cat.categories for icat, cat in enumerate(cats): subset = (cat == adata.obs[groups_key]).values if adata.raw is not None and use_raw: adata_gene = adata.raw[:, colors] else: adata_gene = adata[:, colors] x_color.append(np.mean(adata_gene.X[subset])) colors = x_color # plot continuous annotation if (isinstance(colors, str) and colors in adata.obs and not is_categorical_dtype(adata.obs[colors])): x_color = [] cats = adata.obs[groups_key].cat.categories for icat, cat in enumerate(cats): subset = (cat == adata.obs[groups_key]).values x_color.append(adata.obs.loc[subset, colors].mean()) colors = x_color # plot categorical annotation if (isinstance(colors, str) and colors in adata.obs and is_categorical_dtype(adata.obs[colors])): from ... import utils as sc_utils asso_names, asso_matrix = sc_utils.compute_association_matrix_of_groups( adata, prediction=groups_key, reference=colors, normalization='reference' if normalize_to_color else 'prediction') utils.add_colors_for_categorical_sample_annotation(adata, colors) asso_colors = sc_utils.get_associated_colors_of_groups( adata.uns[colors + '_colors'], asso_matrix) colors = asso_colors if len(colors) < len(node_labels): print(node_labels, colors) raise ValueError( '`color` list need to be at least as long as `groups`/`node_labels` list.' ) # count number of connected components n_components, labels = scipy.sparse.csgraph.connected_components( adjacency_solid) if n_components > 1 and not single_component: logg.debug( 'Graph has more than a single connected component. 
' 'To restrict to this component, pass `single_component=True`.') if n_components > 1 and single_component: component_sizes = np.bincount(labels) largest_component = np.where( component_sizes == component_sizes.max())[0][0] adjacency_solid = adjacency_solid.tocsr()[labels == largest_component, :] adjacency_solid = adjacency_solid.tocsc()[:, labels == largest_component] colors = np.array(colors)[labels == largest_component] node_labels = np.array(node_labels)[labels == largest_component] cats_dropped = adata.obs[groups_key].cat.categories[ labels != largest_component].tolist() logg.info( 'Restricting graph to largest connected component by dropping categories\n' f'{cats_dropped}') nx_g_solid = nx.Graph(adjacency_solid) if dashed_edges is not None: raise ValueError( '`single_component` only if `dashed_edges` is `None`.') # edge widths base_edge_width = edge_width_scale * 5 * rcParams['lines.linewidth'] # draw dashed edges if dashed_edges is not None: widths = [x[-1]['weight'] for x in nx_g_dashed.edges(data=True)] widths = base_edge_width * np.array(widths) if max_edge_width is not None: widths = np.clip(widths, None, max_edge_width) nx.draw_networkx_edges(nx_g_dashed, pos, ax=ax, width=widths, edge_color='grey', style='dashed', alpha=0.5) # draw solid edges if transitions is None: widths = [x[-1]['weight'] for x in nx_g_solid.edges(data=True)] widths = base_edge_width * np.array(widths) if min_edge_width is not None or max_edge_width is not None: widths = np.clip(widths, min_edge_width, max_edge_width) with warnings.catch_warnings(): warnings.simplefilter("ignore") nx.draw_networkx_edges(nx_g_solid, pos, ax=ax, width=widths, edge_color='black') # draw directed edges else: adjacency_transitions = adata.uns['paga'][transitions].copy() if threshold is None: threshold = 0.01 adjacency_transitions.data[adjacency_transitions.data < threshold] = 0 adjacency_transitions.eliminate_zeros() g_dir = nx.DiGraph(adjacency_transitions.T) widths = [x[-1]['weight'] for x in g_dir.edges(data=True)] widths = base_edge_width * np.array(widths) if min_edge_width is not None or max_edge_width is not None: widths = np.clip(widths, min_edge_width, max_edge_width) nx.draw_networkx_edges(g_dir, pos, ax=ax, width=widths, edge_color='black', arrowsize=arrowsize) if export_to_gexf: if isinstance(colors[0], tuple): from matplotlib.colors import rgb2hex colors = [rgb2hex(c) for c in colors] for count, n in enumerate(nx_g_solid.nodes()): nx_g_solid.node[count]['label'] = str(node_labels[count]) nx_g_solid.node[count]['color'] = str(colors[count]) nx_g_solid.node[count]['viz'] = { 'position': { 'x': 1000 * pos[count][0], 'y': 1000 * pos[count][1], 'z': 0 } } filename = settings.writedir / 'paga_graph.gexf' logg.warning(f'exporting to {filename}') settings.writedir.mkdir(parents=True, exist_ok=True) nx.write_gexf(nx_g_solid, settings.writedir / 'paga_graph.gexf') ax.set_frame_on(frameon) ax.set_xticks([]) ax.set_yticks([]) # groups sizes if groups_key is not None and groups_key + '_sizes' in adata.uns: groups_sizes = adata.uns[groups_key + '_sizes'] else: groups_sizes = np.ones(len(node_labels)) base_scale_scatter = 2000 base_pie_size = (base_scale_scatter / (np.sqrt(adjacency_solid.shape[0]) + 10) * node_size_scale) median_group_size = np.median(groups_sizes) groups_sizes = base_pie_size * np.power(groups_sizes / median_group_size, node_size_power) if fontsize is None: fontsize = rcParams['legend.fontsize'] if fontoutline is not None: text_kwds['path_effects'] = [ patheffects.withStroke(linewidth=fontoutline, 
foreground='w') ] # usual scatter plot if not isinstance(colors[0], dict): n_groups = len(pos_array) sct = ax.scatter(pos_array[:, 0], pos_array[:, 1], c=colors[:n_groups], edgecolors='face', s=groups_sizes, cmap=cmap) for count, group in enumerate(node_labels): ax.text(pos_array[count, 0], pos_array[count, 1], group, verticalalignment='center', horizontalalignment='center', size=fontsize, fontweight=fontweight, **text_kwds) # else pie chart plot else: # start with this dummy plot... otherwise strange behavior sct = ax.scatter(pos_array[:, 0], pos_array[:, 1], c='white', edgecolors='face', s=groups_sizes, cmap=cmap) trans = ax.transData.transform bbox = ax.get_position().get_points() ax_x_min = bbox[0, 0] ax_x_max = bbox[1, 0] ax_y_min = bbox[0, 1] ax_y_max = bbox[1, 1] ax_len_x = ax_x_max - ax_x_min ax_len_y = ax_y_max - ax_y_min trans2 = ax.transAxes.inverted().transform pie_axs = [] for count, n in enumerate(nx_g_solid.nodes()): pie_size = groups_sizes[count] / base_scale_scatter x1, y1 = trans(pos[n]) # data coordinates xa, ya = trans2((x1, y1)) # axis coordinates xa = ax_x_min + (xa - pie_size / 2) * ax_len_x ya = ax_y_min + (ya - pie_size / 2) * ax_len_y # clip, the fruchterman layout sometimes places below figure if ya < 0: ya = 0 if xa < 0: xa = 0 pie_axs.append( pl.axes([xa, ya, pie_size * ax_len_x, pie_size * ax_len_y], frameon=False)) pie_axs[count].set_xticks([]) pie_axs[count].set_yticks([]) if not isinstance(colors[count], dict): raise ValueError( '{} is neither a dict of valid matplotlib colors ' 'nor a valid matplotlib color.'.format(colors[count])) color_single = colors[count].keys() fracs = [colors[count][c] for c in color_single] if sum(fracs) < 1: color_single = list(color_single) color_single.append('grey') fracs.append(1 - sum(fracs)) pie_axs[count].pie(fracs, colors=color_single) if node_labels is not None: for ia, a in enumerate(pie_axs): a.text(0.5, 0.5, node_labels[ia], verticalalignment='center', horizontalalignment='center', transform=a.transAxes, size=fontsize, fontweight=fontweight, **text_kwds) return sct
def infer_variable_types(df, link_vars, variable_types, time_index, secondary_time_index): '''Infer variable types from dataframe Args: df (DataFrame): Input DataFrame link_vars (list[]): Linked variables variable_types (dict[str -> dict[str -> type]]) : An entity's variable_types dict maps string variable ids to types (:class:`.Variable`) or (type, kwargs) to pass keyword arguments to the Variable. time_index (str or None): Name of time_index column secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns that each map to a list of columns that depend on that secondary time ''' # TODO: set pk and pk types here inferred_types = {} vids_to_assume_datetime = [time_index] if len(list(secondary_time_index.keys())): vids_to_assume_datetime.append(list(secondary_time_index.keys())[0]) inferred_type = vtypes.Unknown for variable in df.columns: if variable in variable_types: continue elif variable in vids_to_assume_datetime: if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Numeric elif variable in link_vars: inferred_type = vtypes.Categorical elif df[variable].dtype == "object": if not len(df[variable]): inferred_type = vtypes.Categorical elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Categorical # heuristics to predict this some other than categorical sample = df[variable].sample(min(10000, len(df[variable]))) # catch cases where object dtype cannot be interpreted as a string try: avg_length = sample.str.len().mean() if avg_length > 50: inferred_type = vtypes.Text except AttributeError: pass elif df[variable].dtype == "bool": inferred_type = vtypes.Boolean elif pdtypes.is_categorical_dtype(df[variable].dtype): inferred_type = vtypes.Categorical elif pdtypes.is_numeric_dtype(df[variable].dtype): inferred_type = vtypes.Numeric elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime elif len(df[variable]): sample = df[variable] \ .sample(min(10000, df[variable].nunique(dropna=False))) unique = sample.unique() percent_unique = sample.size / len(unique) if percent_unique < .05: inferred_type = vtypes.Categorical else: inferred_type = vtypes.Numeric inferred_types[variable] = inferred_type return inferred_types
def concat_pandas(dfs, axis=0, join='outer', uniform=False, filter_warning=True): if axis == 1: return pd.concat(dfs, axis=axis, join=join, **concat_kwargs) # Support concatenating indices along axis 0 if isinstance(dfs[0], pd.Index): if isinstance(dfs[0], pd.CategoricalIndex): return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name) elif isinstance(dfs[0], pd.MultiIndex): first, rest = dfs[0], dfs[1:] if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels) for o in rest): arrays = [concat([_get_level_values(i, n) for i in dfs]) for n in range(first.nlevels)] return pd.MultiIndex.from_arrays(arrays, names=first.names) to_concat = (first.values, ) + tuple(k._values for k in rest) new_tuples = np.concatenate(to_concat) try: return pd.MultiIndex.from_tuples(new_tuples, names=first.names) except Exception: return pd.Index(new_tuples) return dfs[0].append(dfs[1:]) # Handle categorical index separately dfs0_index = dfs[0].index has_categoricalindex = ( isinstance(dfs0_index, pd.CategoricalIndex) or (isinstance(dfs0_index, pd.MultiIndex) and any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels))) if has_categoricalindex: dfs2 = [df.reset_index(drop=True) for df in dfs] ind = concat([df.index for df in dfs]) else: dfs2 = dfs ind = None # Concatenate the partitions together, handling categories as needed if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(isinstance(df, pd.DataFrame) for df in dfs2)): if uniform: dfs3 = dfs2 cat_mask = dfs2[0].dtypes == 'category' else: # When concatenating mixed dataframes and series on axis 1, Pandas # converts series to dataframes with a single column named 0, then # concatenates. dfs3 = [df if isinstance(df, pd.DataFrame) else df.to_frame().rename(columns={df.name: 0}) for df in dfs2] # pandas may raise a RuntimeWarning for comparing ints and strs with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) if filter_warning: warnings.simplefilter('ignore', FutureWarning) cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T for df in dfs3], join=join, **concat_kwargs).any() if cat_mask.any(): not_cat = cat_mask[~cat_mask].index # this should be aligned, so no need to filter warning out = pd.concat([df[df.columns.intersection(not_cat)] for df in dfs3], join=join, **concat_kwargs) temp_ind = out.index for col in cat_mask.index.difference(not_cat): # Find an example of categoricals in this column for df in dfs3: sample = df.get(col) if sample is not None: break # Extract partitions, subbing in missing if needed parts = [] for df in dfs3: if col in df.columns: parts.append(df[col]) else: codes = np.full(len(df), -1, dtype='i8') data = pd.Categorical.from_codes(codes, sample.cat.categories, sample.cat.ordered) parts.append(data) out[col] = union_categoricals(parts) # Pandas resets index type on assignment if frame is empty # https://github.com/pandas-dev/pandas/issues/17101 if not len(temp_ind): out.index = temp_ind out = out.reindex(columns=cat_mask.index) else: # pandas may raise a RuntimeWarning for comparing ints and strs with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) if filter_warning: warnings.simplefilter("ignore", FutureWarning) out = pd.concat(dfs3, join=join, **concat_kwargs) else: if is_categorical_dtype(dfs2[0].dtype): if ind is None: ind = concat([df.index for df in dfs2]) return pd.Series(union_categoricals(dfs2), index=ind, name=dfs2[0].name) with warnings.catch_warnings(): if filter_warning: warnings.simplefilter('ignore', FutureWarning) out = 
pd.concat(dfs2, join=join, **concat_kwargs) # Re-add the index if needed if ind is not None: out.index = ind return out
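# Hedged illustration (not part of the function above) of why union_categoricals is
# used instead of a plain pd.concat when partitions carry different category sets:
# with mismatched categories pd.concat falls back to object dtype, while
# union_categoricals keeps a single categorical dtype.
import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Series(['x', 'y'], dtype='category')
b = pd.Series(['y', 'z'], dtype='category')

plain = pd.concat([a, b], ignore_index=True)       # dtype becomes object
unioned = pd.Series(union_categoricals([a, b]))    # dtype stays category
print(plain.dtype, unioned.dtype, list(unioned.cat.categories))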
def get_df( data, keys=None, layer=None, index=None, columns=None, sort_values=None, dropna="all", precision=None, ): """Get dataframe for a specified adata key. Return values for specified key (in obs, var, obsm, varm, obsp, varp, uns, or layers) as a dataframe. Arguments ------ adata AnnData object or a numpy array to get values from. keys Keys from `.var_names`, `.obs_names`, `.var`, `.obs`, `.obsm`, `.varm`, `.obsp`, `.varp`, `.uns`, or `.layers`. layer Layer of `adata` to use as expression values. index List to set as index. columns List to set as columns names. sort_values Wether to sort values by first column (sort_values=True) or a specified column. dropna Drop columns/rows that contain NaNs in all ('all') or in any entry ('any'). precision Set precision for pandas dataframe. Returns ------- A dataframe. """ if precision is not None: pd.set_option("precision", precision) if isinstance(data, AnnData): keys, keys_split = (keys.split("*") if isinstance(keys, str) and "*" in keys else (keys, None)) keys, key_add = (keys.split("/") if isinstance(keys, str) and "/" in keys else (keys, None)) keys = [keys] if isinstance(keys, str) else keys key = keys[0] s_keys = ["obs", "var", "obsm", "varm", "uns", "layers"] d_keys = [ data.obs.keys(), data.var.keys(), data.obsm.keys(), data.varm.keys(), data.uns.keys(), data.layers.keys(), ] if hasattr(data, "obsp") and hasattr(data, "varp"): s_keys.extend(["obsp", "varp"]) d_keys.extend([data.obsp.keys(), data.varp.keys()]) if keys is None: df = data.to_df() elif key in data.var_names: df = obs_df(data, keys, layer=layer) elif key in data.obs_names: df = var_df(data, keys, layer=layer) else: if keys_split is not None: keys = [ k for k in list(data.obs.keys()) + list(data.var.keys()) if key in k and keys_split in k ] key = keys[0] s_key = [s for (s, d_key) in zip(s_keys, d_keys) if key in d_key] if len(s_key) == 0: raise ValueError( f"'{key}' not found in any of {', '.join(s_keys)}.") if len(s_key) > 1: logg.warn( f"'{key}' found multiple times in {', '.join(s_key)}.") s_key = s_key[-1] df = getattr(data, s_key)[keys if len(keys) > 1 else key] if key_add is not None: df = df[key_add] if index is None: index = (data.var_names if s_key == "varm" else data.obs_names if s_key in {"obsm", "layers"} else None) if index is None and s_key == "uns" and hasattr(df, "shape"): key_cats = np.array([ key for key in data.obs.keys() if is_categorical_dtype(data.obs[key]) ]) num_cats = [ len(data.obs[key].cat.categories) == df.shape[0] for key in key_cats ] if np.sum(num_cats) == 1: index = data.obs[key_cats[num_cats][0]].cat.categories if (columns is None and len(df.shape) > 1 and df.shape[0] == df.shape[1]): columns = index elif isinstance(index, str) and index in data.obs.keys(): index = pd.Categorical(data.obs[index]).categories if columns is None and s_key == "layers": columns = data.var_names elif isinstance(columns, str) and columns in data.obs.keys(): columns = pd.Categorical(data.obs[columns]).categories elif isinstance(data, pd.DataFrame): if isinstance(keys, str) and "*" in keys: keys, keys_split = keys.split("*") keys = [k for k in data.columns if keys in k and keys_split in k] df = data[keys] if keys is not None else data else: df = data if issparse(df): df = np.array(df.A) if columns is None and hasattr(df, "names"): columns = df.names df = pd.DataFrame(df, index=index, columns=columns) if dropna: df.replace("", np.nan, inplace=True) how = dropna if isinstance(dropna, str) else "any" if dropna is True else "all" df.dropna(how=how, axis=0, inplace=True) 
df.dropna(how=how, axis=1, inplace=True) if sort_values: sort_by = (sort_values if isinstance(sort_values, str) and sort_values in df.columns else df.columns[0]) df = df.sort_values(by=sort_by, ascending=False) if hasattr(data, "var_names"): if df.index[0] in data.var_names: df.var_names = df.index elif df.columns[0] in data.var_names: df.var_names = df.columns if hasattr(data, "obs_names"): if df.index[0] in data.obs_names: df.obs_names = df.index elif df.columns[0] in data.obs_names: df.obs_names = df.columns return df
def _calculate_agg_features(self, features, frame, df_trie, progress_callback): test_feature = features[0] child_entity = test_feature.base_features[0].entity base_frame = df_trie.get_node(test_feature.relationship_path).value parent_merge_var = test_feature.relationship_path[0][ 1].parent_variable.id # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here fl = [] for f in features: for ind in f.get_feature_names(): if ind not in frame.columns: fl.append(f) break features = fl if not len(features): progress_callback(len(features) / float(self.num_features)) return frame # handle where base_frame_empty = base_frame.empty if isinstance( base_frame, pd.DataFrame) else False where = test_feature.where if where is not None and not base_frame_empty: base_frame = base_frame.loc[base_frame[where.get_name()]] # when no child data, just add all the features to frame with nan base_frame_empty = base_frame.empty if isinstance( base_frame, pd.DataFrame) else False if base_frame_empty: feature_values = [] for f in features: feature_values.append( (f, np.full(f.number_output_features, np.nan))) progress_callback(1 / float(self.num_features)) frame = update_feature_columns(feature_values, frame) else: relationship_path = test_feature.relationship_path groupby_var = get_relationship_variable_id(relationship_path) # if the use_previous property exists on this feature, include only the # instances from the child entity included in that Timedelta use_previous = test_feature.use_previous if use_previous: # Filter by use_previous values time_last = self.time_last if use_previous.has_no_observations(): time_first = time_last - use_previous ti = child_entity.time_index if ti is not None: base_frame = base_frame[base_frame[ti] >= time_first] else: n = use_previous.get_value('o') def last_n(df): return df.iloc[-n:] base_frame = base_frame.groupby(groupby_var, observed=True, sort=False).apply(last_n) to_agg = {} agg_rename = {} to_apply = set() # apply multivariable and time-dependent features as we find them, and # save aggregable features for later for f in features: if _can_agg(f): variable_id = f.base_features[0].get_name() if variable_id not in to_agg: to_agg[variable_id] = [] if isinstance(base_frame, dd.DataFrame): func = f.get_dask_aggregation() else: func = f.get_function() # for some reason, using the string count is significantly # faster than any method a primitive can return # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg if func == pd.Series.count: func = "count" funcname = func if callable(func): # if the same function is being applied to the same # variable twice, wrap it in a partial to avoid # duplicate functions funcname = str(id(func)) if u"{}-{}".format(variable_id, funcname) in agg_rename: func = partial(func) funcname = str(id(func)) func.__name__ = funcname if isinstance(func, dd.Aggregation): # TODO: handle aggregation being applied to same variable twice # (see above partial wrapping of functions) funcname = func.__name__ to_agg[variable_id].append(func) # this is used below to rename columns that pandas names for us agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name() continue to_apply.add(f) # Apply the non-aggregable functions generate a new dataframe, and merge # it with the existing one if len(to_apply): wrap = agg_wrapper(to_apply, self.time_last) # groupby_var can be both the name of the index and a column, # to silence 
pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).apply(wrap) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') progress_callback(len(to_apply) / float(self.num_features)) # Apply the aggregate functions to generate a new dataframe, and merge # it with the existing one if len(to_agg): # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) if isinstance(base_frame, dd.DataFrame): to_merge = base_frame.groupby(groupby_var).agg(to_agg) else: to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [ agg_rename["-".join(x)] for x in to_merge.columns.ravel() ] to_merge = to_merge[list(agg_rename.values())] # workaround for pandas bug where categories are in the wrong order # see: https://github.com/pandas-dev/pandas/issues/22501 if pdtypes.is_categorical_dtype(frame.index): categories = pdtypes.CategoricalDtype( categories=frame.index.categories) to_merge.index = to_merge.index.astype(object).astype( categories) if isinstance(frame, dd.DataFrame): frame = frame.merge(to_merge, left_on=parent_merge_var, right_index=True, how='left') else: frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # determine number of features that were just merged progress_callback( len(to_merge.columns) / float(self.num_features)) # Handle default values fillna_dict = {} for f in features: feature_defaults = { name: f.default_value for name in f.get_feature_names() } fillna_dict.update(feature_defaults) frame = frame.fillna(fillna_dict) # convert boolean dtypes to floats as appropriate # pandas behavior: https://github.com/pydata/pandas/issues/3752 for f in features: if (f.number_output_features == 1 and f.variable_type == variable_types.Numeric and frame[f.get_name()].dtype.name in ['object', 'bool']): frame[f.get_name()] = frame[f.get_name()].astype(float) return frame
def train(cls, new_data, old=None, drop=False, na_rm=False): """ Train a continuous scale Parameters ---------- new_data : array_like New values old : array_like Old range. List of values known to the scale. drop : bool Whether to drop(not include) unused categories na_rm : bool If ``True``, remove missing values. Missing values are either ``NaN`` or ``None``. Returns ------- out : list Values covered by the scale """ if not len(new_data): return old if old is None: old = [] # Get the missing values (NaN & Nones) locations and remove them nan_bool_idx = pd.isnull(new_data) has_na = np.any(nan_bool_idx) if not hasattr(new_data, 'dtype'): new_data = np.asarray(new_data) new_data = new_data[~nan_bool_idx] if new_data.dtype.kind not in DISCRETE_KINDS: raise TypeError( "Continuous value supplied to discrete scale") # Train i.e. get the new values if pdtypes.is_categorical_dtype(new_data): try: new = list(new_data.cat.categories) # series except AttributeError: new = list(new_data.categories) # plain categorical if drop: present = set(new_data.drop_duplicates()) new = [i for i in new if i in present] else: try: new = np.unique(new_data) new.sort() except TypeError: # new_data probably has nans and other types new = list(set(new_data)) new = multitype_sort(new) # Add nan if required if has_na and not na_rm: new = np.hstack([new, np.nan]) # update old old_set = set(old) return list(old) + [i for i in new if (i not in old_set)]
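# Minimal walk-through of the categorical branch above (hedged; data made up):
# category order comes from the data's dtype and is appended to the previously
# trained range, skipping values already present.
import pandas as pd

old = ['low', 'mid']
dtype = pd.CategoricalDtype(['low', 'mid', 'high'], ordered=True)
new_data = pd.Series(['high', 'mid'], dtype=dtype)

new = list(new_data.cat.categories)                        # ['low', 'mid', 'high']
old_set = set(old)
print(list(old) + [i for i in new if i not in old_set])    # ['low', 'mid', 'high']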
def infer_variable_types(self, variable_types, time_index, secondary_time_index): '''Infer variable types from dataframe Args: variable_types (dict[str -> dict[str -> type]]) : An entity's variable_types dict maps string variable ids to types (:class:`.Variable`) or (type, kwargs) to pass keyword arguments to the Variable. time_index (str or None): Name of time_index column secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns that each map to a list of columns that depend on that secondary time ''' link_relationships = [r for r in self.entityset.relationships if r.parent_entity.id == self.id or r.child_entity.id == self.id] link_vars = [v.id for rel in link_relationships for v in [rel.parent_variable, rel.child_variable] if v.entity.id == self.id] # TODO: set pk and pk types here inferred_types = {} df = self.df vids_to_assume_datetime = [time_index] if len(list(secondary_time_index.keys())): vids_to_assume_datetime.append(list(secondary_time_index.keys())[0]) inferred_type = vtypes.Unknown for variable in df.columns: if variable in variable_types: continue elif variable in vids_to_assume_datetime: if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Numeric elif df[variable].dtype == "object": if variable in link_vars: inferred_type = vtypes.Categorical elif len(df[variable]): if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: # heuristics to predict this some other than categorical sample = df[variable].sample(min(10000, df[variable].nunique())) avg_length = sample.str.len().mean() if avg_length > 50: inferred_type = vtypes.Text else: inferred_type = vtypes.Categorical elif df[variable].dtype == "bool": inferred_type = vtypes.Boolean elif pdtypes.is_categorical_dtype(df[variable].dtype): inferred_type = vtypes.Categorical elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime elif variable in link_vars: inferred_type = vtypes.Ordinal elif len(df[variable]): sample = df[variable] \ .sample(min(10000, df[variable].nunique(dropna=False))) unique = sample.unique() percent_unique = sample.size / len(unique) if percent_unique < .05: inferred_type = vtypes.Categorical else: inferred_type = vtypes.Numeric inferred_types[variable] = inferred_type return inferred_types
def query_by_values(self, instance_vals, variable_id=None, columns=None, time_last=None, training_window=None, include_cutoff_time=True): """Query instances that have variable with given value Args: instance_vals (pd.Dataframe, pd.Series, list[str] or str) : Instance(s) to match. variable_id (str) : Variable to query on. If None, query on index. columns (list[str]) : Columns to return. Return all columns if None. time_last (pd.TimeStamp) : Query data up to and including this time. Only applies if entity has a time index. training_window (Timedelta, optional): Window defining how much time before the cutoff time data can be used when calculating features. If None, all data before cutoff time is used. include_cutoff_time (bool): If True, data at cutoff time are included in calculating features Returns: pd.DataFrame : instances that match constraints with ids in order of underlying dataframe """ if not variable_id: variable_id = self.index instance_vals = self._vals_to_series(instance_vals, variable_id) training_window = _check_timedelta(training_window) if training_window is not None: assert training_window.has_no_observations(), "Training window cannot be in observations" if instance_vals is None: df = self.df.copy() elif isinstance(instance_vals, pd.Series) and instance_vals.empty: df = self.df.head(0) else: if is_instance(instance_vals, (dd, ks), 'Series'): df = self.df.merge(instance_vals.to_frame(), how="inner", on=variable_id) elif isinstance(instance_vals, pd.Series) and is_instance(self.df, ks, 'DataFrame'): df = self.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id) else: df = self.df[self.df[variable_id].isin(instance_vals)] if isinstance(self.df, pd.DataFrame): df = df.set_index(self.index, drop=False) # ensure filtered df has same categories as original # workaround for issue below # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538 if pdtypes.is_categorical_dtype(self.df[variable_id]): categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories) df[variable_id] = df[variable_id].astype(categories) df = self._handle_time(df=df, time_last=time_last, training_window=training_window, include_cutoff_time=include_cutoff_time) if columns is not None: df = df[columns] return df
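# Hedged sketch of the category workaround above (frame and values are made up):
# after merging on a categorical key the column may come back as plain object or
# with reordered categories, so the original CategoricalDtype is re-applied.
import pandas as pd

df = pd.DataFrame({'id': pd.Categorical(['c', 'a', 'b'], categories=['c', 'a', 'b']),
                   'value': [1, 2, 3]})
wanted = pd.DataFrame({'id': ['a', 'b']})                  # plain object-dtype key

merged = df.merge(wanted, how='inner', on='id')
original = pd.api.types.CategoricalDtype(categories=df['id'].cat.categories)
merged['id'] = merged['id'].astype(original)               # same categories, same order as df
print(list(merged['id'].cat.categories))                   # ['c', 'a', 'b']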
def scatter(adata, x=None, y=None, color='grey', use_raw=True, sort_order=True, alpha=None, basis=None, groups=None, components=None, projection='2d', legend_loc='right margin', legend_fontsize=None, legend_fontweight=None, color_map=None, palette=None, right_margin=None, left_margin=None, size=None, title=None, show=None, save=None, ax=None): """Scatter plot. Color with annotation of observations (`.obs`) or expression of genes (`.var_names`). Parameters ---------- adata : :class:`~scanpy.api.AnnData` Annotated data matrix. x : `str` or `None` x coordinate. y : `str` or `None` y coordinate. color : string or list of strings, optional (default: `None`) Keys for observation/cell annotation `[\'ann1\', \'ann2\']`. use_raw : `bool`, optional (default: `True`) Use `raw` attribute of `adata` if present. sort_order : `bool`, optional (default: `True`) For continuous annotations used as color parameter, plot data points with higher values on top of others. basis : {'pca', 'tsne', 'umap', 'diffmap', 'draw_graph_fr', etc.} String that denotes a plotting tool that computed coordinates. groups : str, optional (default: all groups in color) Allows to restrict categories in observation annotation to a subset. components : `str` or list of `str`, optional (default: '1,2') String of the form '1,2' or ['1,2', '2,3']. projection : {'2d', '3d'}, optional (default: '2d') Projection of plot. legend_loc : `str`, optional (default: 'right margin') Location of legend, either 'on data', 'right margin' or valid keywords for `matplotlib.pyplot.legend <https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html>`_. legend_fontsize : `int` (default: `None`) Legend font size. legend_fontweight : `int` (default: `None`) Legend font weight. color_map : `str` (default: 'RdBu_r') String denoting matplotlib color map for continuous coloring. palette : list of `str` (default: `None`) Colors to use for plotting groups (categorical annotation). right_margin : `float` (default: 0.3) Adjust how far the plotting panel extends to the right. size : float (default: None) Point size. Observation-number dependent by default. title : `str` or list of `str`, optional (default: `None`) Provide title for panels either as `[\'title1\', ...]`. show : `bool`, optional (default: `None`) Show the plot. save : `bool` or `str`, optional (default: `None`) If `True` or a `str`, save the figure. A string is appended to the default filename. Infer the filetype if ending on \{'.pdf', '.png', '.svg'\}. ax : `matplotlib.Axes` A matplotlib axes object. Returns ------- A list of `matplotlib.Axis` objects. 
""" sanitize_anndata(adata) if legend_loc not in VALID_LEGENDLOCS: raise ValueError('Invalid `legend_loc`, need to be one of: {}.'.format( VALID_LEGENDLOCS)) if components is None: components = '1,2' if '2d' in projection else '1,2,3' if isinstance(components, str): components = components.split(',') components = np.array(components).astype(int) - 1 title = None if title is None else title.split(',') if isinstance( title, str) else title keys = ['grey'] if color is None else color.split(',') if isinstance( color, str) else color groups = None if groups is None else groups.split(',') if isinstance( groups, str) else groups highlights = adata.uns['highlights'] if 'highlights' in adata.uns else [] if basis is not None: try: Y = adata.obsm['X_' + basis][:, components] except KeyError: raise KeyError( 'compute coordinates using visualization tool {} first'.format( basis)) elif x is not None and y is not None: x_arr = adata._get_obs_array(x) y_arr = adata._get_obs_array(y) Y = np.c_[x_arr[:, None], y_arr[:, None]] else: raise ValueError( 'Either provide keys for a `basis` or for `x` and `y`.') if size is None: n = Y.shape[0] size = 120000 / n if legend_loc == 'on data' and legend_fontsize is None: legend_fontsize = rcParams['legend.fontsize'] elif legend_fontsize is None: legend_fontsize = rcParams['legend.fontsize'] palette_was_none = False if palette is None: palette_was_none = True if isinstance(palette, list): if not is_color_like(palette[0]): palettes = palette else: palettes = [palette] else: palettes = [palette for i in range(len(keys))] for i, palette in enumerate(palettes): palettes[i] = utils.default_palette(palette) if basis is not None: component_name = ('DC' if basis == 'diffmap' else basis.replace('draw_graph_', '').upper() if 'draw_graph' in basis else 'tSNE' if basis == 'tsne' else 'UMAP' if basis == 'umap' else 'PC' if basis == 'pca' else 'Spring' if basis == 'spring' else None) else: component_name = None axis_labels = (x, y) if component_name is None else None show_ticks = True if component_name is None else False # the actual color ids, e.g. 'grey' or '#109482' color_ids = [None if not is_color_like(key) else key for key in keys] categoricals = [] colorbars = [] for ikey, key in enumerate(keys): if color_ids[ikey] is not None: c = color_ids[ikey] continuous = True categorical = False colorbars.append(False) else: c = 'white' if projection == '2d' else 'white' categorical = False continuous = False # test whether we have categorial or continuous annotation if key in adata.obs_keys(): if is_categorical_dtype(adata.obs[key]): categorical = True else: continuous = True c = adata.obs[key] # coloring according to gene expression elif (use_raw and adata.raw is not None and key in adata.raw.var_names): c = adata.raw[:, key].X continuous = True elif key in adata.var_names: c = adata[:, key].X continuous = True else: raise ValueError( '"' + key + '" is invalid!' 
+ ' specify valid observation annotation, one of ' + str(adata.obs_keys()) + ' or a gene name ' + str(adata.var_names)) colorbars.append(True if continuous else False) if categorical: categoricals.append(ikey) color_ids[ikey] = c if right_margin is None and len(categoricals) > 0: if legend_loc == 'right margin': right_margin = 0.5 if title is None and keys[0] is not None: title = [ key.replace('_', ' ') if not is_color_like(key) else '' for key in keys ] axs = scatter_base(Y, title=title, alpha=alpha, component_name=component_name, axis_labels=axis_labels, component_indexnames=components + 1, projection=projection, colors=color_ids, highlights=highlights, colorbars=colorbars, right_margin=right_margin, left_margin=left_margin, sizes=[size for c in keys], color_map=color_map, show_ticks=show_ticks, ax=ax) def add_centroid(centroids, name, Y, mask): Y_mask = Y[mask] if Y_mask.shape[0] == 0: return median = np.median(Y_mask, axis=0) i = np.argmin(np.sum(np.abs(Y_mask - median), axis=1)) centroids[name] = Y_mask[i] for i, ikey in enumerate(categoricals): palette = palettes[i] key = keys[ikey] if (not key + '_colors' in adata.uns or not palette_was_none or len(adata.obs[key].cat.categories) != len( adata.uns[key + '_colors'])): utils.add_colors_for_categorical_sample_annotation( adata, key, palette) # actually plot the groups mask_remaining = np.ones(Y.shape[0], dtype=bool) centroids = {} if groups is None: for iname, name in enumerate(adata.obs[key].cat.categories): if name not in settings.categories_to_ignore: mask = scatter_group(axs[ikey], key, iname, adata, Y, projection, size=size, alpha=alpha) mask_remaining[mask] = False if legend_loc == 'on data': add_centroid(centroids, name, Y, mask) else: for name in groups: if name not in set(adata.obs[key].cat.categories): raise ValueError('"' + name + '" is invalid!' + ' specify valid name, one of ' + str(adata.obs[key].cat.categories)) else: iname = np.flatnonzero( adata.obs[key].cat.categories.values == name)[0] mask = scatter_group(axs[ikey], key, iname, adata, Y, projection, size=size, alpha=alpha) if legend_loc == 'on data': add_centroid(centroids, name, Y, mask) mask_remaining[mask] = False if mask_remaining.sum() > 0: data = [Y[mask_remaining, 0], Y[mask_remaining, 1]] if projection == '3d': data.append(Y[mask_remaining, 2]) axs[ikey].scatter(*data, marker='.', c='grey', s=size, edgecolors='none', zorder=-1) legend = None if legend_loc == 'on data': for name, pos in centroids.items(): axs[ikey].text(pos[0], pos[1], name, weight=legend_fontweight, verticalalignment='center', horizontalalignment='center', fontsize=legend_fontsize) elif legend_loc == 'right margin': legend = axs[ikey].legend( frameon=False, loc='center left', bbox_to_anchor=(1, 0.5), ncol=(1 if len(adata.obs[key].cat.categories) <= 14 else 2 if len(adata.obs[key].cat.categories) <= 30 else 3), fontsize=legend_fontsize) elif legend_loc != 'none': legend = axs[ikey].legend(frameon=False, loc=legend_loc, fontsize=legend_fontsize) if legend is not None: for handle in legend.legendHandles: handle.set_sizes([300.0]) utils.savefig_or_show('scatter' if basis is None else basis, show=show, save=save) if show == False: axs
def contains_op(cls, series: pd.Series) -> bool:
    if not pdt.is_categorical_dtype(series) and pdt.is_bool_dtype(series):
        return True
    return False
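# Quick check of the predicate above (hedged example): a plain boolean Series
# matches, while the same values stored as a categorical do not, since the
# categorical case is handled elsewhere.
import pandas as pd
import pandas.api.types as pdt

plain = pd.Series([True, False])
as_cat = plain.astype('category')
print(not pdt.is_categorical_dtype(plain) and pdt.is_bool_dtype(plain))    # True
print(not pdt.is_categorical_dtype(as_cat) and pdt.is_bool_dtype(as_cat))  # False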
def _transform_pandas_df( data: DataFrame, enable_categorical: bool, feature_names: FeatNamesT = None, feature_types: Optional[List[str]] = None, meta: Optional[str] = None, meta_type: Optional[str] = None, ) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]: import pandas as pd from pandas.api.types import is_sparse, is_categorical_dtype if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or (is_categorical_dtype(dtype) and enable_categorical) for dtype in data.dtypes): _invalid_dataframe_dtype(data) # handle feature names if feature_names is None and meta is None: if isinstance(data.columns, pd.MultiIndex): feature_names = [ " ".join([str(x) for x in i]) for i in data.columns ] elif isinstance(data.columns, (pd.Index, pd.RangeIndex)): feature_names = list(map(str, data.columns)) else: feature_names = data.columns.format() # handle feature types if feature_types is None and meta is None: feature_types = [] for i, dtype in enumerate(data.dtypes): if is_sparse(dtype): feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) elif is_categorical_dtype(dtype) and enable_categorical: feature_types.append(CAT_T) else: feature_types.append(_pandas_dtype_mapper[dtype.name]) # handle category codes. transformed = pd.DataFrame() # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented if enable_categorical and any( is_categorical_dtype(dtype) for dtype in data.dtypes): for i, dtype in enumerate(data.dtypes): if is_categorical_dtype(dtype): # pandas uses -1 as default missing value for categorical data transformed[data.columns[i]] = ( data[data.columns[i]].cat.codes.astype(np.float32).replace( -1.0, np.NaN)) else: transformed[data.columns[i]] = data[data.columns[i]] else: transformed = data if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") dtype = meta_type if meta_type else np.float32 arr = transformed.values if meta_type: arr = arr.astype(meta_type) return arr, feature_names, feature_types
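# Hedged illustration of the category-code transform above: pandas encodes a
# missing categorical entry as code -1, so the codes are cast to float32 and -1
# is mapped back to NaN before the matrix is handed to the booster.
import numpy as np
import pandas as pd

s = pd.Series(['low', 'high', None, 'low'], dtype='category')
codes = s.cat.codes.astype(np.float32).replace(-1.0, np.nan)
print(list(codes))   # [1.0, 0.0, nan, 1.0] -- codes follow the category order ['high', 'low']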
def query_by_values(self, instance_vals, variable_id=None, columns=None, time_last=None, training_window=None): """Query instances that have variable with given value Args: instance_vals (pd.Dataframe, pd.Series, list[str] or str) : Instance(s) to match. variable_id (str) : Variable to query on. If None, query on index. columns (list[str]) : Columns to return. Return all columns if None. time_last (pd.TimeStamp) : Query data up to and including this time. Only applies if entity has a time index. training_window (Timedelta, optional): Data older than time_last by more than this will be ignored Returns: pd.DataFrame : instances that match constraints """ instance_vals = self._vals_to_series(instance_vals, variable_id) training_window = _check_timedelta(training_window) if training_window is not None: assert (isinstance(training_window, Timedelta) and training_window.is_absolute()),\ "training window must be an absolute Timedelta" if instance_vals is None: df = self.df.copy() elif instance_vals.shape[0] == 0: df = self.df.head(0) elif variable_id is None or variable_id == self.index: df = self.df.reindex(instance_vals) df.dropna(subset=[self.index], inplace=True) else: df = self.df.merge(instance_vals.to_frame(variable_id), how="inner", on=variable_id) df = df.set_index(self.index, drop=False) # ensure filtered df has same categories as original # workaround for issue below # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538 if pdtypes.is_categorical_dtype(self.df[variable_id]): categories = pd.api.types.CategoricalDtype( categories=self.df[variable_id].cat.categories) df[variable_id] = df[variable_id].astype(categories) df = self._handle_time(df=df, time_last=time_last, training_window=training_window) if columns is not None: df = df[columns] return df
def regress_out( adata: AnnData, keys: Union[str, Sequence[str]], n_jobs: Optional[int] = None, copy: bool = False, ) -> Optional[AnnData]: """\ Regress out (mostly) unwanted sources of variation. Uses simple linear regression. This is inspired by Seurat's `regressOut` function in R [Satija15]. Note that this function tends to overcorrect in certain circumstances as described in :issue:`526`. Parameters ---------- adata The annotated data matrix. keys Keys for observation annotation on which to regress on. n_jobs Number of jobs for parallel computation. `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`. copy Determines whether a copy of `adata` is returned. Returns ------- Depending on `copy` returns or updates `adata` with the corrected data matrix. """ start = logg.info(f'regressing out {keys}') if issparse(adata.X): logg.info(' sparse input is densified and may ' 'lead to high memory use') adata = adata.copy() if copy else adata sanitize_anndata(adata) # TODO: This should throw an implicit modification warning if adata.is_view: adata._init_as_actual(adata.copy()) if isinstance(keys, str): keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() n_jobs = sett.n_jobs if n_jobs is None else n_jobs # regress on a single categorical variable variable_is_categorical = False if keys[0] in adata.obs_keys() and is_categorical_dtype( adata.obs[keys[0]]): if len(keys) > 1: raise ValueError('If providing categorical variable, ' 'only a single one is allowed. For this one ' 'we regress on the mean for each category.') logg.debug('... regressing on per-gene means within categories') regressors = np.zeros(adata.X.shape, dtype='float32') for category in adata.obs[keys[0]].cat.categories: mask = (category == adata.obs[keys[0]]).values for ix, x in enumerate(adata.X.T): regressors[mask, ix] = x[mask].mean() variable_is_categorical = True # regress on one or several ordinal variables else: # create data frame with selected keys (if given) if keys: regressors = adata.obs[keys] else: regressors = adata.obs.copy() # add column of ones at index 0 (first column) regressors.insert(0, 'ones', 1.0) len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int) n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int) tasks = [] # split the adata.X matrix by columns in chunks of size n_chunk # (the last chunk could be of smaller size than the others) chunk_list = np.array_split(adata.X, n_chunks, axis=1) if variable_is_categorical: regressors_chunk = np.array_split(regressors, n_chunks, axis=1) for idx, data_chunk in enumerate(chunk_list): # each task is a tuple of a data_chunk eg. (adata.X[:,0:100]) and # the regressors. This data will be passed to each of the jobs. if variable_is_categorical: regres = regressors_chunk[idx] else: regres = regressors tasks.append(tuple((data_chunk, regres, variable_is_categorical))) from joblib import Parallel, delayed # TODO: figure out how to test that this doesn't oversubscribe resources res = Parallel(n_jobs=n_jobs)(delayed(_regress_out_chunk)(task) for task in tasks) # res is a list of vectors (each corresponding to a regressed gene column). # The transpose is needed to get the matrix in the shape needed adata.X = np.vstack(res).T.astype(adata.X.dtype) logg.info(' finished', time=start) return adata if copy else None
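# Hedged toy version of the categorical branch above (X and batch are made up):
# when regressing on a single categorical key, the regressor for every
# observation is the per-category mean of each variable.
import numpy as np
import pandas as pd

X = np.arange(12, dtype='float32').reshape(6, 2)            # 6 observations x 2 variables
batch = pd.Series(['a', 'a', 'b', 'b', 'b', 'a'], dtype='category')

regressors = np.zeros(X.shape, dtype='float32')
for category in batch.cat.categories:
    mask = (batch == category).values
    regressors[mask] = X[mask].mean(axis=0)                 # per-category column means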
def concat(dfs, axis=0, join='outer', uniform=False): """Concatenate, handling some edge cases: - Unions categoricals between partitions - Ignores empty partitions Parameters ---------- dfs : list of DataFrame, Series, or Index axis : int or str, optional join : str, optional uniform : bool, optional Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to True if all arguments have the same columns and dtypes (but not necessarily categories). Default is False. """ if axis == 1: return pd.concat(dfs, axis=axis, join=join) if len(dfs) == 1: return dfs[0] # Support concatenating indices along axis 0 if isinstance(dfs[0], pd.Index): if isinstance(dfs[0], pd.CategoricalIndex): return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name) elif isinstance(dfs[0], pd.MultiIndex): first, rest = dfs[0], dfs[1:] if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels) for o in rest): arrays = [concat([_get_level_values(i, n) for i in dfs]) for n in range(first.nlevels)] return pd.MultiIndex.from_arrays(arrays, names=first.names) to_concat = (first.values, ) + tuple(k._values for k in rest) new_tuples = np.concatenate(to_concat) try: return pd.MultiIndex.from_tuples(new_tuples, names=first.names) except Exception: return pd.Index(new_tuples) return dfs[0].append(dfs[1:]) # Handle categorical index separately dfs0_index = dfs[0].index has_categoricalindex = ( isinstance(dfs0_index, pd.CategoricalIndex) or (isinstance(dfs0_index, pd.MultiIndex) and any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels))) if has_categoricalindex: dfs2 = [df.reset_index(drop=True) for df in dfs] ind = concat([df.index for df in dfs]) else: dfs2 = dfs ind = None # Concatenate the partitions together, handling categories as needed if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(isinstance(df, pd.DataFrame) for df in dfs2)): if uniform: dfs3 = dfs2 cat_mask = dfs2[0].dtypes == 'category' else: # When concatenating mixed dataframes and series on axis 1, Pandas # converts series to dataframes with a single column named 0, then # concatenates. 
dfs3 = [df if isinstance(df, pd.DataFrame) else df.to_frame().rename(columns={df.name: 0}) for df in dfs2] # pandas may raise a RuntimeWarning for comparing ints and strs with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T for df in dfs3], join=join).any() if cat_mask.any(): not_cat = cat_mask[~cat_mask].index out = pd.concat([df[df.columns.intersection(not_cat)] for df in dfs3], join=join) temp_ind = out.index for col in cat_mask.index.difference(not_cat): # Find an example of categoricals in this column for df in dfs3: sample = df.get(col) if sample is not None: break # Extract partitions, subbing in missing if needed parts = [] for df in dfs3: if col in df.columns: parts.append(df[col]) else: codes = np.full(len(df), -1, dtype='i8') data = pd.Categorical.from_codes(codes, sample.cat.categories, sample.cat.ordered) parts.append(data) out[col] = union_categoricals(parts) # Pandas resets index type on assignment if frame is empty # https://github.com/pandas-dev/pandas/issues/17101 if not len(temp_ind): out.index = temp_ind out = out.reindex(columns=cat_mask.index) else: # pandas may raise a RuntimeWarning for comparing ints and strs with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) out = pd.concat(dfs3, join=join) else: if is_categorical_dtype(dfs2[0].dtype): if ind is None: ind = concat([df.index for df in dfs2]) return pd.Series(union_categoricals(dfs2), index=ind, name=dfs2[0].name) out = pd.concat(dfs2, join=join) # Re-add the index if needed if ind is not None: out.index = ind return out
def _paga_graph(adata, ax, layout=None, layout_kwds={}, init_pos=None, solid_edges=None, dashed_edges=None, transitions=None, threshold=None, threshold_arrows=None, threshold_solid=None, threshold_dashed=None, root=0, colors=None, labels=None, fontsize=None, fontweight=None, text_kwds=None, node_size_scale=1, node_size_power=0.5, edge_width_scale=1, title=None, pos=None, cmap=None, frameon=True, min_edge_width=None, max_edge_width=None, export_to_gexf=False, cax=None, colorbar=None, use_raw=True, cb_kwds={}, single_component=False, arrowsize=30, random_state=0): node_labels = labels # rename for clarity if (node_labels is not None and isinstance(node_labels, str) and node_labels != adata.uns['paga']['groups']): raise ValueError( 'Provide a list of group labels for the PAGA groups {}, not {}.'. format(adata.uns['paga']['groups'], node_labels)) groups_key = adata.uns['paga']['groups'] if node_labels is None: node_labels = adata.obs[groups_key].cat.categories if (colors is None or colors == groups_key) and groups_key is not None: if (groups_key + '_colors' not in adata.uns or len(adata.obs[groups_key].cat.categories) != len( adata.uns[groups_key + '_colors'])): utils.add_colors_for_categorical_sample_annotation( adata, groups_key) colors = adata.uns[groups_key + '_colors'] for iname, name in enumerate(adata.obs[groups_key].cat.categories): if name in settings.categories_to_ignore: colors[iname] = 'grey' if isinstance(root, str): if root in node_labels: root = list(node_labels).index(root) else: raise ValueError( 'If `root` is a string, it needs to be one of {} not \'{}\'.'. format(node_labels.tolist(), root)) if isinstance(root, list) and root[0] in node_labels: root = [list(node_labels).index(r) for r in root] # define the objects adjacency_solid = adata.uns['paga'][solid_edges].copy() # set the the thresholds, either explicitly if threshold is not None: threshold_solid = threshold threshold_dashed = threshold # or to a default value else: if threshold_solid is None: threshold_solid = 0.01 # default threshold if threshold_dashed is None: threshold_dashed = 0.01 # default treshold if threshold_solid > 0: adjacency_solid.data[adjacency_solid.data < threshold_solid] = 0 adjacency_solid.eliminate_zeros() nx_g_solid = nx.Graph(adjacency_solid) if dashed_edges is not None: adjacency_dashed = adata.uns['paga'][dashed_edges].copy() if threshold_dashed > 0: adjacency_dashed.data[adjacency_dashed.data < threshold_dashed] = 0 adjacency_dashed.eliminate_zeros() nx_g_dashed = nx.Graph(adjacency_dashed) # uniform color if isinstance(colors, str) and is_color_like(colors): colors = [colors for c in range(len(node_labels))] # color degree of the graph if isinstance(colors, str) and colors.startswith('degree'): # see also tools.paga.paga_degrees if colors == 'degree_dashed': colors = [d for _, d in nx_g_dashed.degree(weight='weight')] elif colors == 'degree_solid': colors = [d for _, d in nx_g_solid.degree(weight='weight')] else: raise ValueError( '`degree` either "degree_dashed" or "degree_solid".') colors = (np.array(colors) - np.min(colors)) / (np.max(colors) - np.min(colors)) # plot gene expression var_names = adata.var_names if adata.raw is None else adata.raw.var_names if isinstance(colors, str) and colors in var_names: x_color = [] cats = adata.obs[groups_key].cat.categories for icat, cat in enumerate(cats): subset = (cat == adata.obs[groups_key]).values if adata.raw is not None and use_raw: adata_gene = adata.raw[:, colors] else: adata_gene = adata[:, colors] 
x_color.append(np.mean(adata_gene.X[subset])) colors = x_color # plot continuous annotation if (isinstance(colors, str) and colors in adata.obs and not is_categorical_dtype(adata.obs[colors])): x_color = [] cats = adata.obs[groups_key].cat.categories for icat, cat in enumerate(cats): subset = (cat == adata.obs[groups_key]).values x_color.append(adata.obs.loc[subset, colors].mean()) colors = x_color # plot categorical annotation if (isinstance(colors, str) and colors in adata.obs and is_categorical_dtype(adata.obs[colors])): from ... import utils as sc_utils asso_names, asso_matrix = sc_utils.compute_association_matrix_of_groups( adata, prediction=groups_key, reference=colors, normalization='reference') utils.add_colors_for_categorical_sample_annotation(adata, colors) asso_colors = sc_utils.get_associated_colors_of_groups( adata.uns[colors + '_colors'], asso_matrix) colors = asso_colors if len(colors) < len(node_labels): print(node_labels, colors) raise ValueError( '`color` list need to be at least as long as `groups`/`node_labels` list.' ) # count number of connected components n_components, labels = scipy.sparse.csgraph.connected_components( adjacency_solid) if n_components > 1 and not single_component: logg.msg( 'Graph has more than a single connected component. ' 'To restrict to this component, pass `single_component=True`.') if n_components > 1 and single_component: component_sizes = np.bincount(labels) largest_component = np.where( component_sizes == component_sizes.max())[0][0] adjacency_solid = adjacency_solid.tocsr()[labels == largest_component, :] adjacency_solid = adjacency_solid.tocsc()[:, labels == largest_component] colors = np.array(colors)[labels == largest_component] node_labels = np.array(node_labels)[labels == largest_component] logg.info( 'Restricting graph to largest connected component by dropping categories\n' '{}'.format(adata.obs[groups_key].cat.categories[ labels != largest_component].tolist())) nx_g_solid = nx.Graph(adjacency_solid) if dashed_edges is not None: raise ValueError( '`single_component` only if `dashed_edges` is `None`.') # node positions from adjacency_solid if pos is None: if layout is None: layout = 'fr' if layout == 'fa': try: from fa2 import ForceAtlas2 except: logg.warn( 'Package \'fa2\' is not installed, falling back to layout \'fr\'.' 
'To use the faster and better ForceAtlas2 layout, ' 'install package \'fa2\' (`pip install fa2`).') layout = 'fr' if layout == 'fa': np.random.seed(random_state) if init_pos is None: init_coords = np.random.random((adjacency_solid.shape[0], 2)) else: init_coords = init_pos.copy() forceatlas2 = ForceAtlas2( # Behavior alternatives outboundAttractionDistribution=False, # Dissuade hubs linLogMode=False, # NOT IMPLEMENTED adjustSizes=False, # Prevent overlap (NOT IMPLEMENTED) edgeWeightInfluence=1.0, # Performance jitterTolerance=1.0, # Tolerance barnesHutOptimize=True, barnesHutTheta=1.2, multiThreaded=False, # NOT IMPLEMENTED # Tuning scalingRatio=2.0, strongGravityMode=False, gravity=1.0, # Log verbose=False) if 'maxiter' in layout_kwds: iterations = layout_kwds['maxiter'] elif 'iterations' in layout_kwds: iterations = layout_kwds['iterations'] else: iterations = 500 pos_list = forceatlas2.forceatlas2(adjacency_solid, pos=init_coords, iterations=iterations) pos = {n: [p[0], -p[1]] for n, p in enumerate(pos_list)} elif layout == 'eq_tree': nx_g_tree = nx_g_solid if solid_edges == 'connectivities': adj_tree = adata.uns['paga']['connectivities_tree'] nx_g_tree = nx.Graph(adj_tree) pos = utils.hierarchy_pos(nx_g_tree, root) if len(pos) < adjacency_solid.shape[0]: raise ValueError('This is a forest and not a single tree. ' 'Try another `layout`, e.g., {\'fr\'}.') else: # igraph layouts from ... import utils as sc_utils g = sc_utils.get_igraph_from_adjacency(adjacency_solid) if 'rt' in layout: g_tree = g if solid_edges == 'connectivities': adj_tree = adata.uns['paga']['connectivities_tree'] g_tree = sc_utils.get_igraph_from_adjacency(adj_tree) pos_list = g_tree.layout( layout, root=root if isinstance(root, list) else [root]).coords elif layout == 'circle': pos_list = g.layout(layout).coords else: # I don't know why this is necessary np.random.seed(random_state) if init_pos is None: init_coords = np.random.random( (adjacency_solid.shape[0], 2)).tolist() else: init_pos = init_pos.copy() # this is a super-weird hack that is necessary as igraphs layout function # seems to do some strange stuff, here init_pos[:, 1] *= -1 init_coords = init_pos.tolist() try: pos_list = g.layout(layout, seed=init_coords, weights='weight', **layout_kwds).coords except: # hack for excepting attribute error for empty graphs... pos_list = g.layout(layout, seed=init_coords, **layout_kwds).coords pos = {n: [p[0], -p[1]] for n, p in enumerate(pos_list)} pos_array = np.array([pos[n] for count, n in enumerate(nx_g_solid)]) else: if isinstance(pos, str): if not pos.endswith('.gdf'): raise ValueError( 'Currently only supporting reading positions from .gdf files.' 
'Consider generating them using, for instance, Gephi.') s = '' # read the node definition from the file with open(pos) as f: f.readline() for line in f: if line.startswith('edgedef>'): break s += line from io import StringIO df = pd.read_csv(StringIO(s), header=-1) pos = df[[4, 5]].values pos_array = pos # convert to dictionary pos = {n: [p[0], p[1]] for n, p in enumerate(pos)} if len(pos) == 1: pos[0] = (0.5, 0.5) # edge widths base_edge_width = edge_width_scale * 5 * rcParams['lines.linewidth'] # draw dashed edges if dashed_edges is not None: widths = [x[-1]['weight'] for x in nx_g_dashed.edges(data=True)] widths = base_edge_width * np.array(widths) if max_edge_width is not None: widths = np.clip(widths, None, max_edge_width) nx.draw_networkx_edges(nx_g_dashed, pos, ax=ax, width=widths, edge_color='grey', style='dashed', alpha=0.5) # draw solid edges if transitions is None: widths = [x[-1]['weight'] for x in nx_g_solid.edges(data=True)] widths = base_edge_width * np.array(widths) if min_edge_width is not None or max_edge_width is not None: widths = np.clip(widths, min_edge_width, max_edge_width) nx.draw_networkx_edges(nx_g_solid, pos, ax=ax, width=widths, edge_color='black') # draw directed edges else: adjacency_transitions = adata.uns['paga'][transitions].copy() if threshold_arrows is None: threshold_arrows = 0.005 adjacency_transitions.data[ adjacency_transitions.data < threshold_arrows] = 0 adjacency_transitions.eliminate_zeros() g_dir = nx.DiGraph(adjacency_transitions.T) widths = [x[-1]['weight'] for x in g_dir.edges(data=True)] widths = 100 * base_edge_width * np.array(widths) if min_edge_width is not None or max_edge_width is not None: widths = np.clip(widths, min_edge_width, max_edge_width) nx.draw_networkx_edges(g_dir, pos, ax=ax, width=widths, edge_color='black', arrowsize=arrowsize) if export_to_gexf: if isinstance(colors[0], tuple): from matplotlib.colors import rgb2hex colors = [rgb2hex(c) for c in colors] for count, n in enumerate(nx_g_solid.nodes()): nx_g_solid.node[count]['label'] = str(node_labels[count]) nx_g_solid.node[count]['color'] = str(colors[count]) nx_g_solid.node[count]['viz'] = { 'position': { 'x': 1000 * pos[count][0], 'y': 1000 * pos[count][1], 'z': 0 } } filename = settings.writedir + 'paga_graph.gexf' logg.msg('exporting to {}'.format(filename), v=1) if settings.writedir != '' and not os.path.exists(settings.writedir): os.makedirs(settings.writedir) nx.write_gexf(nx_g_solid, settings.writedir + 'paga_graph.gexf') ax.set_frame_on(frameon) ax.set_xticks([]) ax.set_yticks([]) # groups sizes if (groups_key is not None and groups_key + '_sizes' in adata.uns): groups_sizes = adata.uns[groups_key + '_sizes'] else: groups_sizes = np.ones(len(node_labels)) base_scale_scatter = 2000 base_pie_size = (base_scale_scatter / (np.sqrt(adjacency_solid.shape[0]) + 10) * node_size_scale) median_group_size = np.median(groups_sizes) groups_sizes = base_pie_size * np.power(groups_sizes / median_group_size, node_size_power) # usual scatter plot if not isinstance(colors[0], dict): sct = ax.scatter(pos_array[:, 0], pos_array[:, 1], c=colors, edgecolors='face', s=groups_sizes, cmap=cmap) if fontsize is None: fontsize = rcParams['legend.fontsize'] for count, group in enumerate(node_labels): ax.text(pos_array[count, 0], pos_array[count, 1], group, verticalalignment='center', horizontalalignment='center', size=fontsize, fontweight=fontweight, **text_kwds) # else pie chart plot else: # start with this dummy plot... 
otherwise strange behavior sct = ax.scatter(pos_array[:, 0], pos_array[:, 1], c='white', edgecolors='face', s=groups_sizes, cmap=cmap) trans = ax.transData.transform bbox = ax.get_position().get_points() ax_x_min = bbox[0, 0] ax_x_max = bbox[1, 0] ax_y_min = bbox[0, 1] ax_y_max = bbox[1, 1] ax_len_x = ax_x_max - ax_x_min ax_len_y = ax_y_max - ax_y_min trans2 = ax.transAxes.inverted().transform pie_axs = [] for count, n in enumerate(nx_g_solid.nodes()): pie_size = groups_sizes[count] / base_scale_scatter x1, y1 = trans(pos[n]) # data coordinates xa, ya = trans2((x1, y1)) # axis coordinates xa = ax_x_min + (xa - pie_size / 2) * ax_len_x ya = ax_y_min + (ya - pie_size / 2) * ax_len_y # clip, the fruchterman layout sometimes places below figure if ya < 0: ya = 0 if xa < 0: xa = 0 pie_axs.append( pl.axes([xa, ya, pie_size * ax_len_x, pie_size * ax_len_y], frameon=False)) pie_axs[count].set_xticks([]) pie_axs[count].set_yticks([]) if not isinstance(colors[count], dict): raise ValueError( '{} is neither a dict of valid matplotlib colors ' 'nor a valid matplotlib color.'.format(colors[count])) color_single = colors[count].keys() fracs = [colors[count][c] for c in color_single] if sum(fracs) < 1: color_single = list(color_single) color_single.append('grey') fracs.append(1 - sum(fracs)) pie_axs[count].pie(fracs, colors=color_single) if node_labels is not None: for ia, a in enumerate(pie_axs): a.text(0.5, 0.5, node_labels[ia], verticalalignment='center', horizontalalignment='center', transform=a.transAxes, size=fontsize) return pos_array, sct
def get_numpy_type(dtype):
    if is_categorical_dtype(dtype):
        return 'category'
    else:
        return str(dtype)
def test_columns(self):
    ce = dpp.Categorizer(columns=["A"])
    trn = ce.fit_transform(raw)
    assert is_categorical_dtype(trn["A"])
    assert is_object_dtype(trn["B"])
def dendrogram( adata: AnnData, groupby: str, n_pcs: Optional[int] = None, use_rep: Optional[str] = None, var_names: Optional[Sequence[str]] = None, use_raw: Optional[bool] = None, cor_method: str = 'pearson', linkage_method: str = 'complete', optimal_ordering: bool = False, key_added: Optional[str] = None, inplace: bool = True, ) -> Optional[Dict[str, Any]]: """\ Computes a hierarchical clustering for the given `groupby` categories. By default, the PCA representation is used unless `.X` has less than 50 variables. Alternatively, a list of `var_names` (e.g. features) can be given. Average values of either `var_names` or components are used to compute a correlation matrix. The hierarchical clustering can be visualized using :func:`quanp.pl.dendrogram` or multiple other visualizations that can include a dendrogram: :func:`~quanp.pl.matrixplot`, :func:`~quanp.pl.heatmap`, :func:`~quanp.pl.dotplot`, and :func:`~quanp.pl.stacked_violin`. .. note:: The computation of the hierarchical clustering is based on predefined groups and not per subject. The correlation matrix is computed using by default pearson but other methods are available. Parameters ---------- adata Annotated data matrix {n_pcs} {use_rep} var_names List of var_names to use for computing the hierarchical clustering. If `var_names` is given, then `use_rep` and `n_pcs` is ignored. use_raw Only when `var_names` is not None. Use `raw` attribute of `adata` if present. cor_method correlation method to use. Options are 'pearson', 'kendall', and 'spearman' linkage_method linkage method to use. See :func:`scipy.cluster.hierarchy.linkage` for more information. optimal_ordering Same as the optimal_ordering argument of :func:`scipy.cluster.hierarchy.linkage` which reorders the linkage matrix so that the distance between successive leaves is minimal. key_added By default, the dendrogram information is added to `.uns[f'dendrogram_{{groupby}}']`. Notice that the `groupby` information is added to the dendrogram. inplace If `True`, adds dendrogram information to `adata.uns[key_added]`, else this function returns the information. Returns ------- If `inplace=False`, returns dendrogram information, else `adata.uns[key_added]` is updated with it. Examples -------- >>> import quanp as qp >>> adata = qp.datasets.pbmc68k_reduced() >>> qp.tl.dendrogram(adata, groupby='bulk_labels') >>> qp.pl.dendrogram(adata) >>> markers = ['featureA', 'featureB', 'featureC', 'featureD', 'featureE', 'featureF'] >>> qp.pl.dotplot(adata, markers, groupby='bulk_labels', dendrogram=True) """ if isinstance(groupby, str): # if not a list, turn into a list groupby = [groupby] for group in groupby: if group not in adata.obs_keys(): raise ValueError( 'groupby has to be a valid observation. ' f'Given value: {group}, valid observations: {adata.obs_keys()}' ) if not is_categorical_dtype(adata.obs[group]): raise ValueError( 'groupby has to be a categorical observation. 
' f'Given value: {group}, Column type: {adata.obs[group].dtype}' ) if var_names is None: rep_df = pd.DataFrame( _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs) ) categorical = adata.obs[groupby[0]] if len(groupby) > 1: for group in groupby[1:]: # create new category by merging the given groupby categories categorical = ( categorical.astype(str) + "_" + adata.obs[group].astype(str) ).astype('category') categorical.name = "_".join(groupby) rep_df.set_index(categorical, inplace=True) categories = rep_df.index.categories else: if use_raw is None and adata.raw is not None: use_raw = True feature_names = adata.raw.var_names if use_raw else adata.var_names from ..plotting._anndata import _prepare_dataframe categories, rep_df = _prepare_dataframe(adata, feature_names, groupby, use_raw) # aggregate values within categories using 'mean' mean_df = rep_df.groupby(level=0).mean() import scipy.cluster.hierarchy as sch corr_matrix = mean_df.T.corr(method=cor_method) z_var = sch.linkage( corr_matrix, method=linkage_method, optimal_ordering=optimal_ordering ) dendro_info = sch.dendrogram(z_var, labels=list(categories), no_plot=True) dat = dict( linkage=z_var, groupby=groupby, use_rep=use_rep, cor_method=cor_method, linkage_method=linkage_method, categories_ordered=dendro_info['ivl'], categories_idx_ordered=dendro_info['leaves'], dendrogram_info=dendro_info, correlation_matrix=corr_matrix.values, ) if inplace: if key_added is None: key_added = f'dendrogram_{groupby}' logg.info(f'Storing dendrogram info using `.uns[{key_added!r}]`') adata.uns[key_added] = dat else: return dat
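# Self-contained, hedged sketch of the core computation above (random data,
# made-up column names): average variables per category, correlate the
# per-category profiles, and cluster the correlation matrix hierarchically.
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch

rng = np.random.default_rng(0)
rep_df = pd.DataFrame(rng.normal(size=(60, 4)), columns=['f1', 'f2', 'f3', 'f4'])
rep_df.index = pd.Categorical(rng.choice(['A', 'B', 'C'], size=60))

mean_df = rep_df.groupby(level=0).mean()            # one row per category
corr = mean_df.T.corr(method='pearson')             # category-by-category correlation
z = sch.linkage(corr, method='complete')
info = sch.dendrogram(z, labels=list(corr.index), no_plot=True)
print(info['ivl'])                                   # categories in dendrogram leaf order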
def write_column(f, data, selement, compression=None): """ Write a single column of data to an open Parquet file Parameters ---------- f: open binary file data: pandas Series or numpy (1d) array selement: thrift SchemaElement produced by ``find_type`` compression: str, dict, or None if ``str``, must be one of the keys in ``compression.compress`` if ``dict``, must have key ``"type"`` which specifies the compression type to use, which must be one of the keys in ``compression.compress``, and may optionally have key ``"args`` which should be a dictionary of options to pass to the underlying compression engine. Returns ------- chunk: ColumnChunk structure """ has_nulls = selement.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL tot_rows = len(data) encoding = "PLAIN" if has_nulls: if is_categorical_dtype(data.dtype): num_nulls = (data.cat.codes == -1).sum() elif data.dtype.kind in ['i', 'b']: num_nulls = 0 else: num_nulls = len(data) - data.count() definition_data, data = make_definitions(data, num_nulls == 0) if data.dtype.kind == "O" and not is_categorical_dtype(data.dtype): try: if selement.type == parquet_thrift.Type.INT64: data = data.astype(int) elif selement.type == parquet_thrift.Type.BOOLEAN: data = data.astype(bool) except ValueError as e: t = parquet_thrift.Type._VALUES_TO_NAMES[selement.type] raise ValueError('Error converting column "%s" to primitive ' 'type %s. Original error: ' '%s' % (data.name, t, e)) else: definition_data = b"" num_nulls = 0 # No nested field handling (encode those as J/BSON) repetition_data = b"" cats = False name = data.name diff = 0 max, min = None, None start = f.tell() if is_categorical_dtype(data.dtype): dph = parquet_thrift.DictionaryPageHeader( num_values=len(data.cat.categories), encoding=parquet_thrift.Encoding.PLAIN) bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement) bdata += 8 * b'\x00' l0 = len(bdata) if compression: bdata = compress_data(bdata, compression) l1 = len(bdata) else: l1 = l0 diff += l0 - l1 ph = parquet_thrift.PageHeader( type=parquet_thrift.PageType.DICTIONARY_PAGE, uncompressed_page_size=l0, compressed_page_size=l1, dictionary_page_header=dph, crc=None) dict_start = f.tell() write_thrift(f, ph) f.write(bdata) try: if num_nulls == 0: max, min = data.values.max(), data.values.min() if selement.type == parquet_thrift.Type.BYTE_ARRAY: if selement.converted_type is not None: max = encode['PLAIN'](pd.Series([max]), selement)[4:] min = encode['PLAIN'](pd.Series([min]), selement)[4:] else: max = encode['PLAIN'](pd.Series([max]), selement) min = encode['PLAIN'](pd.Series([min]), selement) except TypeError: pass ncats = len(data.cat.categories) data = data.cat.codes cats = True encoding = "PLAIN_DICTIONARY" elif str(data.dtype) in ['int8', 'int16', 'uint8', 'uint16']: # encoding = "RLE" # disallow bitpacking for compatability data = data.astype('int32') bdata = definition_data + repetition_data + encode[encoding]( data, selement) bdata += 8 * b'\x00' try: if encoding != 'PLAIN_DICTIONARY' and num_nulls == 0: max, min = data.values.max(), data.values.min() if selement.type == parquet_thrift.Type.BYTE_ARRAY: if selement.converted_type is not None: max = encode['PLAIN'](pd.Series([max]), selement)[4:] min = encode['PLAIN'](pd.Series([min]), selement)[4:] else: max = encode['PLAIN'](pd.Series([max]), selement) min = encode['PLAIN'](pd.Series([min]), selement) except TypeError: pass dph = parquet_thrift.DataPageHeader( num_values=tot_rows, encoding=getattr(parquet_thrift.Encoding, encoding), 
definition_level_encoding=parquet_thrift.Encoding.RLE, repetition_level_encoding=parquet_thrift.Encoding.BIT_PACKED) l0 = len(bdata) if compression: bdata = compress_data(bdata, compression) l1 = len(bdata) else: l1 = l0 diff += l0 - l1 ph = parquet_thrift.PageHeader(type=parquet_thrift.PageType.DATA_PAGE, uncompressed_page_size=l0, compressed_page_size=l1, data_page_header=dph, crc=None) try: write_thrift(f, ph) except OverflowError as err: raise IOError('Overflow error while writing page; try using a smaller ' 'value for `row_group_offsets`. Original message: ' + str(err)) f.write(bdata) compressed_size = f.tell() - start uncompressed_size = compressed_size + diff offset = f.tell() s = parquet_thrift.Statistics(max=max, min=min, null_count=num_nulls) p = [parquet_thrift.PageEncodingStats( page_type=parquet_thrift.PageType.DATA_PAGE, encoding=parquet_thrift.Encoding.PLAIN, count=1)] if isinstance(compression, dict): algorithm = compression.get("type", None) else: algorithm = compression cmd = parquet_thrift.ColumnMetaData( type=selement.type, path_in_schema=[name], encodings=[parquet_thrift.Encoding.RLE, parquet_thrift.Encoding.BIT_PACKED, parquet_thrift.Encoding.PLAIN], codec=(getattr(parquet_thrift.CompressionCodec, algorithm.upper()) if algorithm else 0), num_values=tot_rows, statistics=s, data_page_offset=start, encoding_stats=p, key_value_metadata=[], total_uncompressed_size=uncompressed_size, total_compressed_size=compressed_size) if cats: p.append(parquet_thrift.PageEncodingStats( page_type=parquet_thrift.PageType.DICTIONARY_PAGE, encoding=parquet_thrift.Encoding.PLAIN, count=1)) cmd.dictionary_page_offset = dict_start cmd.key_value_metadata.append( parquet_thrift.KeyValue(key='num_categories', value=str(ncats))) cmd.key_value_metadata.append( parquet_thrift.KeyValue(key='numpy_dtype', value=str(data.dtype))) chunk = parquet_thrift.ColumnChunk(file_offset=offset, meta_data=cmd) write_thrift(f, chunk) return chunk
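The dictionary-encoding branch above boils down to storing the category labels once (the dictionary page) and the per-row integer codes (the data page). A small illustration of where pandas exposes those two pieces; this shows only the data layout, not the thrift framing:

import pandas as pd

s = pd.Series(['red', 'blue', 'red', None, 'green'], dtype='category')

labels = s.cat.categories   # unique labels: what goes into the dictionary page
codes = s.cat.codes         # per-row integer codes; -1 marks nulls, matching the num_nulls count above
print(labels.tolist())      # ['blue', 'green', 'red']
print(codes.tolist())       # [2, 0, 2, -1, 1]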
def infer_variable_types(self, variable_types, time_index, secondary_time_index): '''Infer variable types from dataframe Args: variable_types (dict[str -> dict[str -> type]]) : An entity's variable_types dict maps string variable ids to types (:class:`.Variable`) or (type, kwargs) to pass keyword arguments to the Variable. time_index (str or None): Name of time_index column secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns that each map to a list of columns that depend on that secondary time ''' link_relationships = [ r for r in self.entityset.relationships if r.parent_entity.id == self.id or r.child_entity.id == self.id ] link_vars = [ v.id for rel in link_relationships for v in [rel.parent_variable, rel.child_variable] if v.entity.id == self.id ] # TODO: set pk and pk types here inferred_types = {} df = self.df vids_to_assume_datetime = [time_index] if len(list(secondary_time_index.keys())): vids_to_assume_datetime.append( list(secondary_time_index.keys())[0]) inferred_type = vtypes.Unknown for variable in df.columns: if variable in variable_types: continue elif variable in vids_to_assume_datetime: if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Numeric elif df[variable].dtype == "object": if variable in link_vars: inferred_type = vtypes.Categorical elif len(df[variable]): if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: # heuristics to predict this some other than categorical sample = df[variable].sample( min(10000, df[variable].nunique())) avg_length = sample.str.len().mean() if avg_length > 50: inferred_type = vtypes.Text else: inferred_type = vtypes.Categorical elif df[variable].dtype == "bool": inferred_type = vtypes.Boolean elif pdtypes.is_categorical_dtype(df[variable].dtype): inferred_type = vtypes.Categorical elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime elif variable in link_vars: inferred_type = vtypes.Ordinal elif len(df[variable]): sample = df[variable] \ .sample(min(10000, df[variable].nunique(dropna=False))) unique = sample.unique() percent_unique = sample.size / len(unique) if percent_unique < .05: inferred_type = vtypes.Categorical else: inferred_type = vtypes.Numeric inferred_types[variable] = inferred_type return inferred_types
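A standalone sketch of the object-column heuristic used above: sample the column, then treat long free-form strings as text and short repeated labels as categorical. The 50-character threshold is the one from the snippet; the function name and defaults are invented for illustration:

import pandas as pd

def guess_object_kind(col, max_sample=10000, text_len_threshold=50):
    # look at the mean string length of a sample of non-null values
    non_null = col.dropna().astype(str)
    sample = non_null.sample(min(max_sample, len(non_null)), random_state=0)
    avg_length = sample.str.len().mean()
    return 'text' if avg_length > text_len_threshold else 'categorical'

print(guess_object_kind(pd.Series(['red', 'blue', 'red', 'green'])))         # categorical
print(guess_object_kind(pd.Series(['a long free-text comment ' * 10] * 3)))  # text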
def concat(dfs, axis=0, join='outer', uniform=False): """Concatenate, handling some edge cases: - Unions categoricals between partitions - Ignores empty partitions Parameters ---------- dfs : list of DataFrame, Series, or Index axis : int or str, optional join : str, optional uniform : bool, optional Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to True if all arguments have the same columns and dtypes (but not necessarily categories). Default is False. """ if axis == 1: return pd.concat(dfs, axis=axis, join=join) if len(dfs) == 1: return dfs[0] # Support concatenating indices along axis 0 if isinstance(dfs[0], pd.Index): if isinstance(dfs[0], pd.CategoricalIndex): return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name) return dfs[0].append(dfs[1:]) # Handle categorical index separately if isinstance(dfs[0].index, pd.CategoricalIndex): dfs2 = [df.reset_index(drop=True) for df in dfs] ind = concat([df.index for df in dfs]) else: dfs2 = dfs ind = None # Concatenate the partitions together, handling categories as needed if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(isinstance(df, pd.DataFrame) for df in dfs2)): if uniform: dfs3 = dfs2 cat_mask = dfs2[0].dtypes == 'category' else: # When concatenating mixed dataframes and series on axis 1, Pandas # converts series to dataframes with a single column named 0, then # concatenates. dfs3 = [df if isinstance(df, pd.DataFrame) else df.rename(0).to_frame() for df in dfs2] cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T for df in dfs3], join=join).any() if cat_mask.any(): not_cat = cat_mask[~cat_mask].index out = pd.concat([df[df.columns.intersection(not_cat)] for df in dfs3], join=join) for col in cat_mask.index.difference(not_cat): # Find an example of categoricals in this column for df in dfs3: sample = df.get(col) if sample is not None: break # Extract partitions, subbing in missing if needed parts = [] for df in dfs3: if col in df.columns: parts.append(df[col]) else: codes = np.full(len(df), -1, dtype='i8') data = pd.Categorical.from_codes(codes, sample.cat.categories, sample.cat.ordered) parts.append(data) out[col] = union_categoricals(parts) out = out.reindex_axis(cat_mask.index, axis=1) else: out = pd.concat(dfs3, join=join) else: if is_categorical_dtype(dfs2[0].dtype): if ind is None: ind = concat([df.index for df in dfs2]) return pd.Series(union_categoricals(dfs2), index=ind, name=dfs2[0].name) out = pd.concat(dfs2, join=join) # Re-add the index if needed if ind is not None: out.index = ind return out
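The reason for the category handling above: pandas falls back to object dtype when concatenating categoricals whose categories differ, so the helper unions the categories first. A minimal demonstration of that building block:

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Series(['x', 'y'], dtype='category')
b = pd.Series(['y', 'z'], dtype='category')

print(pd.concat([a, b]).dtype)          # object: categories differ, so pandas gives up

merged = pd.Series(union_categoricals([a, b]))
print(merged.dtype)                     # category
print(merged.cat.categories.tolist())   # ['x', 'y', 'z']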
def _calculate_agg_features(self, features, entity_frames): test_feature = features[0] entity = test_feature.entity child_entity = test_feature.base_features[0].entity assert entity.id in entity_frames and child_entity.id in entity_frames frame = entity_frames[entity.id] base_frame = entity_frames[child_entity.id] # Sometimes approximate features get computed in a previous filter frame # and put in the current one dynamically, # so there may be existing features here features = [f for f in features if f.get_name() not in frame.columns] if not len(features): return frame # handle where where = test_feature.where if where is not None and not base_frame.empty: base_frame = base_frame.loc[base_frame[where.get_name()]] # when no child data, just add all the features to frame with nan if base_frame.empty: for f in features: frame[f.get_name()] = np.nan else: relationship_path = self.entityset.find_backward_path( entity.id, child_entity.id) groupby_var = Relationship._get_link_variable_name( relationship_path) # if the use_previous property exists on this feature, include only the # instances from the child entity included in that Timedelta use_previous = test_feature.use_previous if use_previous and not base_frame.empty: # Filter by use_previous values time_last = self.time_last if use_previous.is_absolute(): time_first = time_last - use_previous ti = child_entity.time_index if ti is not None: base_frame = base_frame[base_frame[ti] >= time_first] else: n = use_previous.value def last_n(df): return df.iloc[-n:] base_frame = base_frame.groupby(groupby_var, observed=True, sort=False).apply(last_n) to_agg = {} agg_rename = {} to_apply = set() # apply multivariable and time-dependent features as we find them, and # save aggregable features for later for f in features: if _can_agg(f): variable_id = f.base_features[0].get_name() if variable_id not in to_agg: to_agg[variable_id] = [] func = f.get_function() funcname = func if callable(func): funcname = func.__name__ to_agg[variable_id].append(func) # this is used below to rename columns that pandas names for us agg_rename[u"{}-{}".format(variable_id, funcname)] = f.get_name() continue to_apply.add(f) # Apply the non-aggregable functions generate a new dataframe, and merge # it with the existing one if len(to_apply): wrap = agg_wrapper(to_apply, self.time_last) # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).apply(wrap) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # Apply the aggregate functions to generate a new dataframe, and merge # it with the existing one if len(to_agg): # groupby_var can be both the name of the index and a column, # to silence pandas warning about ambiguity we explicitly pass # the column (in actuality grouping by both index and group would # work) to_merge = base_frame.groupby(base_frame[groupby_var], observed=True, sort=False).agg(to_agg) # rename columns to the correct feature names to_merge.columns = [ agg_rename["-".join(x)] for x in to_merge.columns.ravel() ] to_merge = to_merge[list(agg_rename.values())] # workaround for pandas bug where categories are in the wrong order # see: https://github.com/pandas-dev/pandas/issues/22501 if pdtypes.is_categorical_dtype(frame.index): categories = pdtypes.CategoricalDtype( categories=frame.index.categories) 
to_merge.index = to_merge.index.astype(object).astype( categories) frame = pd.merge(left=frame, right=to_merge, left_index=True, right_index=True, how='left') # Handle default values # 1. handle non scalar default values iterfeats = [ f for f in features if hasattr(f.default_value, '__iter__') ] for f in iterfeats: nulls = pd.isnull(frame[f.get_name()]) for ni in nulls[nulls].index: frame.at[ni, f.get_name()] = f.default_value # 2. handle scalars default values fillna_dict = { f.get_name(): f.default_value for f in features if f not in iterfeats } frame.fillna(fillna_dict, inplace=True) # convert boolean dtypes to floats as appropriate # pandas behavior: https://github.com/pydata/pandas/issues/3752 for f in features: if (not f.expanding and f.variable_type == variable_types.Numeric and frame[f.get_name()].dtype.name in ['object', 'bool']): frame[f.get_name()] = frame[f.get_name()].astype(float) return frame
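The aggregation/rename dance above relies on pandas producing MultiIndex columns when .agg() is given a dict of column -> list of functions. A small, self-contained sketch of that pattern (the feature names on the right-hand side are invented):

import pandas as pd

df = pd.DataFrame({'customer_id': [1, 1, 2], 'amount': [10.0, 20.0, 5.0]})

# dict of column -> list of aggregations gives MultiIndex columns like ('amount', 'sum')
agg = df.groupby('customer_id').agg({'amount': ['sum', 'mean']})

# flatten to "variable-funcname" (mirroring the ravel() call above), then rename
agg.columns = ['-'.join(col) for col in agg.columns]
agg = agg.rename(columns={'amount-sum': 'SUM(amount)', 'amount-mean': 'MEAN(amount)'})
print(agg.columns.tolist())   # ['SUM(amount)', 'MEAN(amount)']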
def _is_discrete(s, force_nominal): return (is_categorical_dtype(s) or is_object_dtype(s) and (force_nominal or s.nunique() < s.size**.666))
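For instance, assuming the helper above and its pandas.api.types imports are in scope, a column with a few repeated labels counts as discrete while a column of mostly-unique strings does not:

import pandas as pd

labels = pd.Series(['a', 'b', 'a', 'b'] * 25)           # 100 values, 2 unique
ids = pd.Series(['user_%d' % i for i in range(100)])    # 100 values, all unique

print(_is_discrete(labels, force_nominal=False))  # True: 2 < 100 ** .666 (about 21.5)
print(_is_discrete(ids, force_nominal=False))     # False: not categorical, too many unique values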
# build a transformer pipeline based on sp data SPTransformer = CatTransformer.build_transformer_pipeline(sp_india_data, label=None) # transform the data before modeling #x = SPTransformer.fit(sp_india_data) prepared = SPTransformer.fit_transform(sp_india_data) from pandas.api.types import is_numeric_dtype, is_categorical_dtype, \ is_string_dtype cats = [] nums = [] for k, v in sp_india_data.items(): if is_categorical_dtype(v) or is_string_dtype(v): cats.append(k) for k, v in sp_india_data.items(): if is_numeric_dtype(v): nums.append(k) cols = list( SPTransformer.transformers_[0][1].named_steps['ohe'].get_feature_names( input_features=cats)) + nums clf = tree.DecisionTreeClassifier(max_depth=4) clf = clf.fit(prepared, sp_india_label) SPTransformer.transformers_[0][1].named_steps['ohe'].get_feature_names( input_features=cats)
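One caveat worth hedging: the 'ohe' step above is presumably a OneHotEncoder, and get_feature_names was deprecated in scikit-learn 1.0 and removed in later releases in favour of get_feature_names_out, so on newer versions the name-recovery step looks roughly like this (standalone example with invented column names):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

X = pd.DataFrame({'state': ['UP', 'MH', 'UP'], 'crop': ['rice', 'wheat', 'rice']})
ohe = OneHotEncoder(handle_unknown='ignore').fit(X)
print(list(ohe.get_feature_names_out(['state', 'crop'])))
# ['state_MH', 'state_UP', 'crop_rice', 'crop_wheat']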
def trajectory_tree(adata: AnnData, root_node: Optional[str] = None, method: Literal["euclid", "gauss", "paga"] = "euclid", dimension: Literal["pca", "diffmap"] = "pca", tree: Optional[nx.DiGraph] = None, groupby: str = "leiden"): """\ Calculate a trajectory tree. Specify AnnData and the root name in the trajectory. You can pass your metadata in adata.obs by passing a key in adata.obs to `groupby`. You can also define your own tree by drawing it as nx.DiGraph and pass it to `tree`. Parameters ---------- adata : AnnData The annotated data matrix. root_node : Optional[str] A given cluster is used as the root for the tree. If `None`, it calculates a most likely root of the tree, but this will not be reliable for the moment. Please set the root cluster, by default `None`. method : Literal["euclid", "gauss", "paga"] Method for calculating the tree, by default "euclid." dimension : Literal["pca", "diffmap"] The data for calculating the trajectory tree, by default "pca." tree : Optional[nx.DiGraph] If `None`, it calculates the trajectory tree. If nx.DiGraph is given, the tree is constructed from the nx.DiGraph, by default `None`. groupby : str Key for categorical in `adata.obs`. You can pass your metadata of clusters, by default "leiden." """ if method not in ["euclid", "paga", "gauss"]: raise ValueError( "Argument 'method' must be 'euclid','paga' or 'gauss'.") if dimension not in ['pca', 'diffmap']: raise ValueError("Argument 'dimension' must be 'pca' or 'diffmap'.") if not isinstance(adata, AnnData): raise ValueError("trajectory_tree() expects AnnData argument") else: if groupby not in adata.obs.keys(): raise ValueError("Did not find adata.obs[{}]".format(groupby)) if "X_{}".format(dimension) not in list(adata.obsm): raise ValueError( "Did not find 'X_{}' in adata.obsm. Run scanpy.tl to calculate first." .format(dimension)) # when adata.obs[groupby] is metadata and not categorized, it cause errors. # maybe there are better ways to set adata.obs, but one code below does jobs enough. if not is_categorical_dtype(adata.obs[groupby]): adata.obs[groupby] = adata.obs[groupby].astype('category') pp = Preprocessing() pp.trajectory_tree(adata, root_node=root_node, tree=tree, method=method, dimension=dimension, groupby=groupby)
def _is_discrete(s, force_nominal): return (is_categorical_dtype(s) or is_object_dtype(s) and (force_nominal or s.nunique() < s.size**.666))
def _assert_categorical_obs(adata: AnnData, key: str) -> None: if key not in adata.obs: raise KeyError(f"Cluster key `{key}` not found in `adata.obs`.") if not is_categorical_dtype(adata.obs[key]): raise TypeError(f"Expected `adata.obs[{key!r}]` to be `categorical`, found `{infer_dtype(adata.obs[key])}`.")
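A quick illustration of what that guard catches: a plain string column fails the check until it is cast to category (plain pandas stand-in for adata.obs):

import pandas as pd
from pandas.api.types import is_categorical_dtype, infer_dtype

obs = pd.DataFrame({'cluster': ['a', 'b', 'a']})
print(is_categorical_dtype(obs['cluster']), infer_dtype(obs['cluster']))  # False 'string'

obs['cluster'] = obs['cluster'].astype('category')
print(is_categorical_dtype(obs['cluster']))  # True: passes the assertion above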
def less_than_equal_to_scalar(vals): if (pdtypes.is_categorical_dtype(vals) and self.value not in vals.cat.categories): return np.nan return vals <= self.value
def greater_than_scalar(vals): if (pdtypes.is_categorical_dtype(vals) and self.value not in vals.cat.categories): return np.nan return vals > self.value
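These two primitives (and the <= variant above them) guard against a quirk of ordered categoricals: comparing against a scalar that is not one of the categories raises rather than returning False, so the guard short-circuits to NaN instead. A small illustration; in recent pandas the failure is a TypeError:

import pandas as pd

sizes = pd.Series(pd.Categorical(['S', 'M', 'L'], categories=['S', 'M', 'L'], ordered=True))

print((sizes > 'M').tolist())   # [False, False, True]: 'M' is a known category

try:
    sizes > 'XL'                # 'XL' is not among the categories
except TypeError as err:
    print('comparison raised:', err)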
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None, timezones=None): """ Create empty DataFrame to assign into In the simplest case, will return a Pandas dataframe of the given size, with columns of the given names and types. The second return value `views` is a dictionary of numpy arrays into which you can assign values that show up in the dataframe. For categorical columns, you get two views to assign into: if the column name is "col", you get both "col" (the category codes) and "col-catdef" (the category labels). For a single categorical index, you should use the `.set_categories` method of the appropriate "-catdef" columns, passing an Index of values ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)`` Multi-indexes work a lot like categoricals, even if the types of each index are not themselves categories, and will also have "-catdef" entries in the views. However, these will be Dummy instances, providing only a ``.set_categories`` method, to be used as above. Parameters ---------- types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples applies to non-categorical columns. If there are only categorical columns, an empty string or None will do. size: int Number of rows to allocate cats: dict {col: labels} Location and labels for categorical columns, e.g., {1: ['mary', 'mo']} will create column index 1 (inserted amongst the numerical columns) with two possible values. If labels is an integer, `{'col': 5}`, will generate temporary labels using range. If None, or column name is missing, will assume 16-bit integers (a reasonable default). cols: list of labels assigned column names, including categorical ones. index_types: list of str For one or more index columns, make them have this type. See general description, above, for caveats about multi-indexing. If None, the index will be the default RangeIndex. index_names: list of str Names of the index column(s), if using an index. timezones: dict {col: timezone_str} for timestamp type columns, apply this timezone to the pandas series; the numpy view will be UTC. Returns ------- - dataframe with correct shape and data-types - dict of numpy views, keyed by column name, of the columns of the dataframe. Assign into these. 
""" views = {} timezones = timezones or {} if isinstance(types, STR_TYPE): types = types.split(',') cols = cols if cols is not None else range(len(types)) def cat(col): if cats is None or col not in cats: return RangeIndex(0, 2**14) elif isinstance(cats[col], int): return RangeIndex(0, cats[col]) else: # explicit labels list return cats[col] df = OrderedDict() for t, col in zip(types, cols): if str(t) == 'category': df[six.text_type(col)] = Categorical([], categories=cat(col), fastpath=True) else: d = np.empty(0, dtype=t) if d.dtype.kind == "M" and six.text_type(col) in timezones: d = Series(d).dt.tz_localize(timezones[six.text_type(col)]) df[six.text_type(col)] = d df = DataFrame(df) if not index_types: index = RangeIndex(size) elif len(index_types) == 1: t, col = index_types[0], index_names[0] if col is None: raise ValueError('If using an index, must give an index name') if str(t) == 'category': c = Categorical([], categories=cat(col), fastpath=True) vals = np.zeros(size, dtype=c.codes.dtype) index = CategoricalIndex(c) index._data._codes = vals views[col] = vals views[col+'-catdef'] = index._data else: d = np.empty(size, dtype=t) index = Index(d) views[col] = index.values else: index = MultiIndex([[]], [[]]) # index = MultiIndex.from_arrays(indexes) index._levels = list() index._labels = list() for i, col in enumerate(index_names): index._levels.append(Index([None])) def set_cats(values, i=i, col=col, **kwargs): values.name = col if index._levels[i][0] is None: index._levels[i] = values elif not index._levels[i].equals(values): raise RuntimeError("Different dictionaries encountered" " while building categorical") x = Dummy() x._set_categories = set_cats d = np.zeros(size, dtype=int) index._labels.append(d) views[col] = d views[col+'-catdef'] = x axes = [df._data.axes[0], index] # allocate and create blocks blocks = [] for block in df._data.blocks: if block.is_categorical: categories = block.values.categories code = np.zeros(shape=size, dtype=block.values.codes.dtype) values = Categorical(values=code, categories=categories, fastpath=True) new_block = block.make_block_same_class(values=values) elif getattr(block.dtype, 'tz', None): new_shape = (size, ) values = np.empty(shape=new_shape, dtype="M8[ns]") new_block = block.make_block_same_class( values=values, dtype=block.values.dtype) else: new_shape = (block.values.shape[0], size) values = np.empty(shape=new_shape, dtype=block.values.dtype) new_block = block.make_block_same_class(values=values) blocks.append(new_block) # create block manager df = DataFrame(BlockManager(blocks, axes)) # create views for block in df._data.blocks: dtype = block.dtype inds = block.mgr_locs.indexer if isinstance(inds, slice): inds = list(range(inds.start, inds.stop, inds.step)) for i, ind in enumerate(inds): col = df.columns[ind] if is_categorical_dtype(dtype): views[col] = block.values._codes views[col+'-catdef'] = block.values elif getattr(block.dtype, 'tz', None): views[col] = np.asarray(block.values, dtype='M8[ns]') else: views[col] = block.values[i] if index_names: df.index.names = [ None if re.match(r'__index_level_\d+__', n) else n for n in index_names ] return df, views
def convert_col_dtype(col, int_to_category=True, force_fp32=True): """Convert datatypes for columns according to "sensible" rules for the tasks in this module: * integer types are reduced to smallest integer type without losing information, or to a categorical if that uses less memory (roughly) * float types are all made the same: either the type of the first element, or all are reduced to single precision * object types that contain strings are converted to categoricals * object types that contain numbers are converted according to the rules above to either floats, shortest-possible ints, or a categorical * bool types are forced to ``numpy.dtype('bool')`` Parameters ---------- col : pandas.Series Column int_to_category : bool Whether to convert integer types to categoricals in the case that this will save memory. force_fp32 : bool Force all floating-point data types to be single precision (fp32). If False, the type of the first element is used instead (for all values in the column). Returns ------- col : pandas.Series """ from pisa.utils.fileio import fsort categorical_dtype = CategoricalDtype() recognized_dtype = False original_dtype = col.dtype col_name = col.name if len(col) == 0: #pylint: disable=len-as-condition return col first_item = col.iloc[0] # Default: keep current dtype new_dtype = original_dtype if (is_categorical_dtype(original_dtype) or is_datetime64_any_dtype(original_dtype) or is_timedelta64_dtype(original_dtype) or is_timedelta64_ns_dtype(original_dtype)): recognized_dtype = True new_dtype = original_dtype elif is_object_dtype(original_dtype): if isinstance(first_item, basestring): recognized_dtype = True new_dtype = categorical_dtype # NOTE: Must check bool before int since bools look like ints (but not # vice versa) elif isinstance(first_item, BOOL_TYPES): recognized_dtype = True new_dtype = np.dtype('bool') elif isinstance(first_item, INT_TYPES + UINT_TYPES): recognized_dtype = True new_dtype = np.dtype('int') elif isinstance(first_item, FLOAT_TYPES): recognized_dtype = True new_dtype = np.dtype(type(first_item)) # Convert ints to either shortest int possible or categorical, # whichever is smaller (use int if same size) if new_dtype in INT_DTYPES + UINT_DTYPES: recognized_dtype = True # See how large an int would be necessary col_min, col_max = col.min(), col.max() found_int_dtype = False int_dtype = None for int_dtype in INT_DTYPES: exponent = 8*int_dtype.itemsize - 1 min_representable = -2 ** exponent max_representable = (2 ** exponent) - 1 if col_min >= min_representable and col_max <= max_representable: found_int_dtype = True break if not found_int_dtype: raise ValueError('Value(s) in column "%s" exceed %s bounds' % (col_name, int_dtype)) # Check if categorical is probably smaller than int dtype; note that # the below is not perfect (i.e. is not based on exact internal # representation of categoricals in Pandas...) 
but should get us pretty # close, so that at least order-of-magnitude efficiencies will be # found) if int_to_category: num_unique = len(col.unique()) category_bytes = int(np.ceil(np.log2(num_unique) / 8)) if category_bytes < int_dtype.itemsize: new_dtype = categorical_dtype else: new_dtype = int_dtype elif new_dtype in FLOAT_DTYPES: recognized_dtype = True if force_fp32: new_dtype = np.dtype('float32') else: new_dtype = np.dtype(type(first_item)) elif new_dtype in BOOL_DTYPES: recognized_dtype = True new_dtype = np.dtype('bool') if not recognized_dtype: wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"' ' and/or sub-type "%s"\n' % (col_name, original_dtype.name, type(first_item))) if is_dtype_equal(new_dtype, original_dtype): if isinstance(first_item, basestring): return col.cat.reorder_categories(fsort(col.cat.categories)) return col if is_categorical_dtype(new_dtype): new_col = col.astype('category') if isinstance(first_item, basestring): new_col.cat.reorder_categories(fsort(new_col.cat.categories), inplace=True) return new_col try: return col.astype(new_dtype) except ValueError: wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping' ' original dtype "%s"\n' % (col_name, new_dtype, original_dtype)) return col
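The two savings this function chases can be seen directly in pandas: strings stored as a categorical, and integers downcast to the smallest width that holds the data. A minimal, standalone demonstration:

import numpy as np
import pandas as pd

strings = pd.Series(np.random.choice(['cascade', 'absorption', 'decay'], size=100_000))
as_category = strings.astype('category')
print(strings.memory_usage(deep=True), '->', as_category.memory_usage(deep=True))

ints = pd.Series(np.arange(100_000), dtype='int64')
small = pd.to_numeric(ints, downcast='integer')   # picks the smallest integer dtype that fits
print(ints.dtype, '->', small.dtype)              # int64 -> int32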