Example #1
File: utils.py Project: fortizc/dask
 def equal_dtypes(a, b):
     if is_categorical_dtype(a) != is_categorical_dtype(b):
         return False
     if (a == '-' or b == '-'):
         return False
     if is_categorical_dtype(a) and is_categorical_dtype(b):
         # Pandas 0.21 CategoricalDtype compat
         if (PANDAS_VERSION >= '0.21.0' and
                 (UNKNOWN_CATEGORIES in a.categories or
                  UNKNOWN_CATEGORIES in b.categories)):
             return True
         return a == b
     return (a.kind in eq_types and b.kind in eq_types) or (a == b)
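A standalone sketch (plain pandas, not dask internals) of why categorical dtypes get special treatment here: two CategoricalDtypes only compare equal when their categories match, so a kind-based comparison alone cannot distinguish them.

import pandas as pd
from pandas.api.types import is_categorical_dtype

a = pd.Series(['x', 'y']).astype('category').dtype
b = pd.Series(['x', 'y', 'z']).astype('category').dtype

print(is_categorical_dtype(a), is_categorical_dtype(b))  # True True
print(a == b)          # False - the categories differ
print(a.kind, b.kind)  # 'O' 'O' - kind alone cannot tell them apart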
Example #2
def describe(data):
    '''
    Generate summary statistics for every variable.
    For each variable, produce the following fields:
        dtype:
        max / most frequent level:
        min / least frequent level:
        mean / middle-frequency level:
        missing rate:
        range / number of unique levels:
    '''

    data=pd.DataFrame(data)
    n_sample=len(data)
    var_type=type_of_var(data,copy=True)
    summary=pd.DataFrame(columns=data.columns,index=['dtype','max','min','mean','missing_pct','std/nunique'])
    for c in data.columns:
        missing_pct=1-data[c].count()/n_sample
        if var_type[c] == 'number':
            max_value,min_value,mean_value=data[c].max(),data[c].min(),data[c].mean()
            std_value=data[c].std()
            summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,std_value]
        elif var_type[c] == 'category' or is_categorical_dtype(data[c].dtype):
            tmp=data[c].value_counts()
        max_value,min_value=tmp.idxmax(),tmp.idxmin()
            mean_value_index=tmp[tmp==tmp.median()].index
            mean_value=mean_value_index[0] if len(mean_value_index)>0 else np.nan
            summary.loc[:,c]=[var_type[c],max_value,min_value,mean_value,missing_pct,len(tmp)]
        elif var_type[c] == 'datetime':
            max_value,min_value=data[c].max(),data[c].min()
            summary.loc[:,c]=[var_type[c],max_value,min_value,np.nan,missing_pct,np.nan]
        else:
            summary.loc[:,c]=[var_type[c],np.nan,np.nan,np.nan,missing_pct,np.nan]
    return summary
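The helper above relies on the project's own type_of_var (not shown); a minimal, self-contained sketch of the same per-column statistics using plain pandas:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'age': [21, 35, np.nan, 50],
    'city': pd.Series(['NY', 'NY', 'LA', None], dtype='category'),
})
n = len(df)

# numeric column: max / min / mean / missing rate / std
print(df['age'].max(), df['age'].min(), df['age'].mean(),
      1 - df['age'].count() / n, df['age'].std())

# categorical column: most / least frequent level, missing rate, number of levels
counts = df['city'].value_counts()
print(counts.idxmax(), counts.idxmin(), 1 - df['city'].count() / n, len(counts))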
Example #3
def get_var_type(col):
    """
    Return var_type (for KDEMultivariate) of the column

    Parameters
    ----------
    col : pandas.Series
        A dataframe column.

    Returns
    -------
    out : str
        One of ['c', 'o', 'u'].

    See Also
    --------
    The origin of the character codes is
    :class:`statsmodels.nonparametric.kernel_density.KDEMultivariate`.
    """
    if pdtypes.is_numeric_dtype(col):
        # continuous
        return 'c'
    elif pdtypes.is_categorical_dtype(col):
        # ordered or unordered
        return 'o' if col.cat.ordered else 'u'
    else:
        # unordered if unsure, e.g string columns that
        # are not categorical
        return 'u'
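A quick check of the three branches with plain pandas (using the same pdtypes alias for pandas.api.types):

import pandas as pd
import pandas.api.types as pdtypes

num = pd.Series([1.0, 2.5, 3.0])
cat = pd.Series(['a', 'b', 'a'], dtype='category')

print(pdtypes.is_numeric_dtype(num))       # True  -> 'c' (continuous)
print(pdtypes.is_categorical_dtype(cat))   # True
print(cat.cat.ordered)                     # False -> 'u' (unordered)
print(cat.cat.as_ordered().cat.ordered)    # True  -> 'o' (ordered)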
Example #4
File: utils.py Project: fortizc/dask
def clear_known_categories(x, cols=None, index=True):
    """Set categories to be unknown.

    Parameters
    ----------
    x : DataFrame, Series, Index
    cols : iterable, optional
        If x is a DataFrame, set only categoricals in these columns to unknown.
        By default, all categorical columns are set to unknown categoricals.
    index : bool, optional
        If True and x is a Series or DataFrame, also clear the known
        categories in the index.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        x = x.copy()
        if isinstance(x, pd.DataFrame):
            mask = x.dtypes == 'category'
            if cols is None:
                cols = mask[mask].index
            elif not mask.loc[cols].all():
                raise ValueError("Not all columns are categoricals")
            for c in cols:
                x[c].cat.set_categories([UNKNOWN_CATEGORIES], inplace=True)
        elif isinstance(x, pd.Series):
            if is_categorical_dtype(x.dtype):
                x.cat.set_categories([UNKNOWN_CATEGORIES], inplace=True)
        if index and isinstance(x.index, pd.CategoricalIndex):
            x.index = x.index.set_categories([UNKNOWN_CATEGORIES])
    elif isinstance(x, pd.CategoricalIndex):
        x = x.set_categories([UNKNOWN_CATEGORIES])
    return x
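The core pandas operation, sketched with a placeholder sentinel standing in for dask's UNKNOWN_CATEGORIES constant:

import pandas as pd

UNKNOWN = '__UNKNOWN_CATEGORIES__'  # placeholder; dask defines its own sentinel value

s = pd.Series(['a', 'b', 'a'], dtype='category')
cleared = s.cat.set_categories([UNKNOWN])

print(cleared.cat.categories.tolist())  # ['__UNKNOWN_CATEGORIES__']
print(cleared.isna().all())             # True - the old values fall outside the new categories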
Example #5
    def query_by_values(self, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.DataFrame, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.Timestamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df.copy()

        elif instance_vals.shape[0] == 0:
            df = self.df.head(0)

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        else:
            df = self.df.merge(instance_vals.to_frame(variable_id),
                               how="inner", on=variable_id)
            df = df.set_index(self.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(df=df,
                               time_last=time_last,
                               training_window=training_window)

        if columns is not None:
            df = df[columns]

        return df
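A small sketch of the categorical workaround in isolation: after an inner merge the key column can come back with a different dtype, so the original categories are re-applied (plain pandas, outside featuretools):

import pandas as pd

df = pd.DataFrame({'k': pd.Categorical(['a', 'b', 'c']), 'v': [1, 2, 3]})
vals = pd.Series(['a'], name='k').to_frame()

merged = df.merge(vals, how='inner', on='k')
# re-apply the full category set from the original frame
full = pd.api.types.CategoricalDtype(categories=df['k'].cat.categories)
merged['k'] = merged['k'].astype(full)
print(merged['k'].cat.categories.tolist())  # ['a', 'b', 'c']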
Example #6
File: utils.py Project: fortizc/dask
def shard_df_on_index(df, divisions):
    """ Shard a DataFrame by ranges on its index

    Examples
    --------

    >>> df = pd.DataFrame({'a': [0, 10, 20, 30, 40], 'b': [5, 4 ,3, 2, 1]})
    >>> df
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1

    >>> shards = list(shard_df_on_index(df, [2, 4]))
    >>> shards[0]
        a  b
    0   0  5
    1  10  4

    >>> shards[1]
        a  b
    2  20  3
    3  30  2

    >>> shards[2]
        a  b
    4  40  1

    >>> list(shard_df_on_index(df, []))[0]  # empty case
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1
    """

    if isinstance(divisions, Iterator):
        divisions = list(divisions)
    if not len(divisions):
        yield df
    else:
        divisions = np.array(divisions)
        df = df.sort_index()
        index = df.index
        if is_categorical_dtype(index):
            index = index.as_ordered()
        indices = index.searchsorted(divisions)
        yield df.iloc[:indices[0]]
        for i in range(len(indices) - 1):
            yield df.iloc[indices[i]: indices[i + 1]]
        yield df.iloc[indices[-1]:]
Example #7
File: utils.py Project: jwhendy/plotnine
def add_margins(df, vars, margins=True):
    """
    Add margins to a data frame.

    All margining variables will be converted to factors.

    Parameters
    ----------
    df : dataframe
        input data frame

    vars : list
        a list of two lists or tuples giving the
        variables in each dimension

    margins : bool | list
        variable names to compute margins for.
        True will compute all possible margins.
    """
    margin_vars = _margins(vars, margins)
    if not margin_vars:
        return df

    # create margin dataframes
    margin_dfs = [df]
    for vlst in margin_vars[1:]:
        dfx = df.copy()
        for v in vlst:
            dfx.loc[0:, v] = '(all)'
        margin_dfs.append(dfx)

    merged = pd.concat(margin_dfs, axis=0)
    merged.reset_index(drop=True, inplace=True)

    # All margin columns become categoricals. The margin indicator
    # (all) needs to be added as the last level of the categories.
    categories = {}
    for v in itertools.chain(*vars):
        col = df[v]
        if not pdtypes.is_categorical_dtype(df[v].dtype):
            col = pd.Categorical(df[v])
        categories[v] = col.categories
        if '(all)' not in categories[v]:
            categories[v] = categories[v].insert(
                len(categories[v]), '(all)')

    for v in merged.columns.intersection(set(categories)):
        merged[v] = merged[v].astype(
           pdtypes.CategoricalDtype(categories[v]))

    return merged
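The category bookkeeping on its own: '(all)' is appended as the last level so the margin marker sorts after the real levels (a standalone sketch, not plotnine's code):

import pandas as pd
import pandas.api.types as pdtypes

col = pd.Categorical(['x', 'y', 'x'])
categories = col.categories
if '(all)' not in categories:
    categories = categories.insert(len(categories), '(all)')

merged = pd.Series(['x', '(all)', 'y']).astype(pdtypes.CategoricalDtype(categories))
print(merged.cat.categories.tolist())  # ['x', 'y', '(all)']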
Example #8
def elements_and_dtype(elements, dtype, source=None):

    if source is None:
        prefix = ""
    else:
        prefix = "%s." % (source,)

    if elements is not None:
        st.check_strategy(elements, "%selements" % (prefix,))
    else:
        with check("dtype is not None"):
            if dtype is None:
                raise InvalidArgument(
                    (
                        "At least one of %(prefix)selements or %(prefix)sdtype "
                        "must be provided."
                    )
                    % {"prefix": prefix}
                )

    with check("is_categorical_dtype"):
        if is_categorical_dtype(dtype):
            raise InvalidArgument(
                "%sdtype is categorical, which is currently unsupported" % (prefix,)
            )

    dtype = try_convert(np.dtype, dtype, "dtype")

    if elements is None:
        elements = npst.from_dtype(dtype)
    elif dtype is not None:

        def convert_element(value):
            name = "draw(%selements)" % (prefix,)
            try:
                return np.array([value], dtype=dtype)[0]
            except TypeError:
                raise InvalidArgument(
                    "Cannot convert %s=%r of type %s to dtype %s"
                    % (name, value, type(value).__name__, dtype.str)
                )
            except ValueError:
                raise InvalidArgument(
                    "Cannot convert %s=%r to type %s" % (name, value, dtype.str)
                )

        elements = elements.map(convert_element)
    assert elements is not None

    return elements, dtype
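Why the categorical dtype is screened out before the np.dtype conversion: numpy cannot interpret a pandas CategoricalDtype, so the explicit check produces a clearer error. A standalone illustration:

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype

dtype = pd.CategoricalDtype(['a', 'b'])
print(is_categorical_dtype(dtype))  # True -> rejected up front

try:
    np.dtype(dtype)                 # this is what would otherwise fail
except TypeError as exc:
    print('np.dtype failed:', exc)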
Example #9
def elements_and_dtype(elements, dtype, source=None):

    if source is None:
        prefix = ''
    else:
        prefix = '%s.' % (source,)

    if elements is not None:
        st.check_strategy(elements, '%selements' % (prefix,))
    else:
        with check('dtype is not None'):
            if dtype is None:
                raise InvalidArgument((
                    'At least one of %(prefix)selements or %(prefix)sdtype '
                    'must be provided.') % {'prefix': prefix})

    with check('is_categorical_dtype'):
        if is_categorical_dtype(dtype):
            raise InvalidArgument(
                '%sdtype is categorical, which is currently unsupported' % (
                    prefix,
                ))

    dtype = st.try_convert(np.dtype, dtype, 'dtype')

    if elements is None:
        elements = npst.from_dtype(dtype)
    elif dtype is not None:
        def convert_element(value):
            name = 'draw(%selements)' % (prefix,)
            try:
                return np.array([value], dtype=dtype)[0]
            except TypeError:
                raise InvalidArgument(
                    'Cannot convert %s=%r of type %s to dtype %s' % (
                        name, value, type(value).__name__, dtype.str
                    )
                )
            except ValueError:
                raise InvalidArgument(
                    'Cannot convert %s=%r to type %s' % (
                        name, value, dtype.str,
                    )
                )
        elements = elements.map(convert_element)
    assert elements is not None

    return elements, dtype
Example #10
File: utils.py Project: limx0/dask
def _nonempty_series(s, idx):

    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        entry = s.cat.categories[0]
        data = pd.Categorical([entry, entry],
                              categories=s.cat.categories,
                              ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
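The categorical branch in isolation: a two-element stub Series that preserves the original categories and orderedness (plain pandas sketch):

import pandas as pd

s = pd.Series(['a', 'b', 'c'], dtype='category', name='col')
entry = s.cat.categories[0]
stub = pd.Series(
    pd.Categorical([entry, entry],
                   categories=s.cat.categories,
                   ordered=s.cat.ordered),
    name=s.name,
)
print(stub.tolist())                 # ['a', 'a']
print(stub.cat.categories.tolist())  # ['a', 'b', 'c']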
Example #11
def make_metadata(data, has_nulls=True, ignore_columns=[], fixed_text=None,
                  object_encoding=None, times='int64', index_cols=[]):
    if not data.columns.is_unique:
        raise ValueError('Cannot create parquet dataset with duplicate'
                         ' column names (%s)' % data.columns)
    pandas_metadata = {'index_columns': index_cols,
                       'columns': [], 'pandas_version': pd.__version__}
    root = parquet_thrift.SchemaElement(name='schema',
                                        num_children=0)

    meta = parquet_thrift.KeyValue()
    meta.key = 'pandas'
    fmd = parquet_thrift.FileMetaData(num_rows=len(data),
                                      schema=[root],
                                      version=1,
                                      created_by=created_by,
                                      row_groups=[],
                                      key_value_metadata=[meta])

    object_encoding = object_encoding or {}
    for column in data.columns:
        if column in ignore_columns:
            continue
        pandas_metadata['columns'].append(
            get_column_metadata(data[column], column))
        oencoding = (object_encoding if isinstance(object_encoding, STR_TYPE)
                     else object_encoding.get(column, None))
        fixed = None if fixed_text is None else fixed_text.get(column, None)
        if is_categorical_dtype(data[column].dtype):
            se, type = find_type(data[column].cat.categories,
                                 fixed_text=fixed, object_encoding=oencoding)
            se.name = column
        else:
            se, type = find_type(data[column], fixed_text=fixed,
                                 object_encoding=oencoding, times=times)
        col_has_nulls = has_nulls
        if has_nulls is None:
            se.repetition_type = data[column].dtype == "O"
        elif has_nulls is not True and has_nulls is not False:
            col_has_nulls = column in has_nulls
        if col_has_nulls:
            se.repetition_type = parquet_thrift.FieldRepetitionType.OPTIONAL
        fmd.schema.append(se)
        root.num_children += 1
    meta.value = json.dumps(pandas_metadata, sort_keys=True)
    return fmd
Example #12
def _nonempty_series(s, idx=None):
    # TODO: Use register dtypes with make_array_nonempty
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats,
                              ordered=s.cat.ordered)
    elif is_integer_na_dtype(dtype):
        data = pd.array([1, None], dtype=dtype)
    elif is_period_dtype(dtype):
        # pandas 0.24.0+ should infer this to be Series[Period[freq]]
        freq = dtype.freq
        data = [pd.Period('2000', freq), pd.Period('2001', freq)]
    elif is_sparse(dtype):
        # TODO: pandas <0.24
        # Pandas <= 0.23.4:
        if PANDAS_GT_0240:
            entry = _scalar_from_dtype(dtype.subtype)
        else:
            entry = _scalar_from_dtype(dtype.subtype)
        data = pd.SparseArray([entry, entry], dtype=dtype)
    elif is_interval_dtype(dtype):
        entry = _scalar_from_dtype(dtype.subtype)
        if PANDAS_GT_0240:
            data = pd.array([entry, entry], dtype=dtype)
        else:
            data = np.array([entry, entry], dtype=dtype)
    elif type(dtype) in make_array_nonempty._lookup:
        data = make_array_nonempty(dtype)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
Example #13
File: utils.py Project: jwhendy/plotnine
def _id_var(x, drop=False):
    """
    Assign ids to items in x. If two items
    are the same, they get the same id.

    Parameters
    ----------
    x : array-like
        items to associate ids with
    drop : bool
        Whether to drop unused factor levels
    """
    if len(x) == 0:
        return []

    categorical = pdtypes.is_categorical_dtype(x)

    if categorical:
        if drop:
            x = x.cat.remove_unused_categories()
            lst = list(x.cat.codes + 1)
        else:
            has_nan = any(np.isnan(i) for i in x if isinstance(i, float))
            if has_nan:
                # NaNs are -1, we give them the highest code
                nan_code = -1
                new_nan_code = np.max(x.cat.codes) + 1
                lst = [val if val != nan_code else new_nan_code for val in x.cat.codes]
            else:
                lst = list(x.cat.codes + 1)
    else:
        try:
            levels = np.sort(np.unique(x))
        except TypeError:
            # x probably has NANs
            levels = multitype_sort(set(x))

        lst = match(x, levels)
        lst = [item + 1 for item in lst]

    return lst
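The categorical fast path in a nutshell: category codes are 0-based, so adding 1 yields 1-based ids, with equal items receiving equal ids (standalone sketch):

import pandas as pd

x = pd.Series(['b', 'a', 'b', 'c'], dtype='category')
# categories sort to ['a', 'b', 'c'], so codes are a->0, b->1, c->2
print(list(x.cat.codes + 1))  # [2, 1, 2, 3]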
Example #14
File: utils.py Project: mrocklin/dask
def _nonempty_series(s, idx=None):
    if idx is None:
        idx = _nonempty_index(s.index)
    dtype = s.dtype
    if is_datetime64tz_dtype(dtype):
        entry = pd.Timestamp('1970-01-01', tz=dtype.tz)
        data = [entry, entry]
    elif is_categorical_dtype(dtype):
        if len(s.cat.categories):
            data = [s.cat.categories[0]] * 2
            cats = s.cat.categories
        else:
            data = _nonempty_index(s.cat.categories)
            cats = None
        data = pd.Categorical(data, categories=cats,
                              ordered=s.cat.ordered)
    else:
        entry = _scalar_from_dtype(dtype)
        data = np.array([entry, entry], dtype=dtype)

    return pd.Series(data, name=s.name, index=idx)
Example #15
def get_column_metadata(column, name):
    """Produce pandas column metadata block"""
    # from pyarrow.pandas_compat
    # https://github.com/apache/arrow/blob/master/python/pyarrow/pandas_compat.py
    inferred_dtype = infer_dtype(column)
    dtype = column.dtype

    if is_categorical_dtype(dtype):
        extra_metadata = {
            'num_categories': len(column.cat.categories),
            'ordered': column.cat.ordered,
        }
        dtype = column.cat.codes.dtype
    elif hasattr(dtype, 'tz'):
        extra_metadata = {'timezone': str(dtype.tz)}
    else:
        extra_metadata = None

    if not isinstance(name, six.string_types):
        raise TypeError(
            'Column name must be a string. Got column {} of type {}'.format(
                name, type(name).__name__
            )
        )

    return {
        'name': name,
        'pandas_type': {
            'string': 'bytes' if PY2 else 'unicode',
            'datetime64': (
                'datetimetz' if hasattr(dtype, 'tz')
                else 'datetime'
            ),
            'integer': str(dtype),
            'floating': str(dtype),
        }.get(inferred_dtype, inferred_dtype),
        'numpy_type': get_numpy_type(dtype),
        'metadata': extra_metadata,
    }
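The categorical branch by itself: the metadata records the number of levels and orderedness, while the stored dtype is that of the integer codes (standalone sketch):

import pandas as pd
from pandas.api.types import is_categorical_dtype

column = pd.Series(['a', 'b', 'a'], dtype='category')
if is_categorical_dtype(column.dtype):
    extra_metadata = {
        'num_categories': len(column.cat.categories),
        'ordered': column.cat.ordered,
    }
    dtype = column.cat.codes.dtype  # the integer codes are what gets stored
print(extra_metadata, dtype)  # {'num_categories': 2, 'ordered': False} int8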
Example #16
File: utils.py Project: fortizc/dask
def strip_unknown_categories(x):
    """Replace any unknown categoricals with empty categoricals.

    Useful for preventing ``UNKNOWN_CATEGORIES`` from leaking into results.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        x = x.copy()
        if isinstance(x, pd.DataFrame):
            cat_mask = x.dtypes == 'category'
            if cat_mask.any():
                cats = cat_mask[cat_mask].index
                for c in cats:
                    if not has_known_categories(x[c]):
                        x[c].cat.set_categories([], inplace=True)
        elif isinstance(x, pd.Series):
            if is_categorical_dtype(x.dtype) and not has_known_categories(x):
                x.cat.set_categories([], inplace=True)
        if (isinstance(x.index, pd.CategoricalIndex) and not
                has_known_categories(x.index)):
            x.index = x.index.set_categories([])
    elif isinstance(x, pd.CategoricalIndex) and not has_known_categories(x):
        x = x.set_categories([])
    return x
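What set_categories([]) does to a Series, sketched with plain pandas (has_known_categories is dask's own helper and is not reproduced here):

import pandas as pd

s = pd.Series(['a', 'b'], dtype='category')
emptied = s.cat.set_categories([])
print(emptied.cat.categories.tolist())  # []
print(emptied.isna().all())             # True - every value is now outside the category set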
Example #17
def dtype_detection(data,category_detection=True,StructureText_detection=True,\
datetime_to_category=True,criterion='sqrt',min_mean_counts=5,fix=False):
    '''Detect the data type of a single variable.
    The variable is classified into one of the following types:
    1. number: numeric
    2. category: categorical (factor)
    3. datetime: datetime
    4. text: free text
    5. text_st: structured text, e.g. IDs
    6. group_number: grouped continuous values

    parameter
    ---------
    data: pd.Series, one-dimensional only
    # if data is passed, the function may change its original dtype
    category_detection: bool, detect categorical variables based on nunique
    StructureText_detection: bool, detect structured text, e.g. a column whose values all contain a separator such as "-"
    datetime_to_category: whether to convert a datetime series to a categorical when its nunique is small
    criterion: string or int, optional (default="sqrt", i.e. the square root of the sample size)
        supported: 'sqrt': square root of the sample size; int: an absolute count; float in (0, 1): a fraction of the sample size
        when detecting categorical variables, a feature is judged categorical if its nunique is below criterion
    min_mean_counts: default 5; for a numeric column to be judged categorical, the mean frequency per level must be at least min_mean_counts
    fix: bool, whether to also return the data with corrected dtypes


    return:
    result: dict{
        'name': column name,
        'vtype': variable type,
        'ordered': whether the categorical is ordered,
        'categories': all category levels}

    '''

    assert len(data.shape)==1
    data=data.copy()
    data=pd.Series(data)
    dtype,name,n_sample=data.dtype,data.name,data.count()

    min_mean_counts=5
    if criterion=='sqrt':
        max_nuniques=np.sqrt(n_sample)
    elif isinstance(criterion,int):
        max_nuniques=criterion
    elif isinstance(criterion,float) and (0<criterion<1):
        max_nuniques=criterion
    else:
        max_nuniques=np.sqrt(n_sample)
    ordered=False
    categories=[]
    if is_numeric_dtype(dtype):
        vtype='number'
        ordered=False
        categories=[]
        # Correct mis-typed numeric data, e.g. turn 1.0, 2.0, 3.0 into 1, 2, 3
        if data.dropna().astype(np.int64).sum()==data.dropna().sum():
            data[data.notnull()]=data[data.notnull()].astype(np.int64)
        if category_detection:
            nunique=len(data.dropna().unique())
            mean_counts=data.value_counts().median()
            if nunique<max_nuniques and mean_counts>=min_mean_counts:
                data=data.astype('category')
                ordered=data.cat.ordered
                vtype='category'
                categories=list(data.dropna().cat.categories)
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_string_dtype(dtype):
        # Handle datetime-like strings
        tmp=data.map(lambda x: np.nan if '%s'%x == 'nan' else len('%s'%x))
        tmp=tmp.dropna().astype(np.int64)
        if not(any(data.dropna().map(is_number))) and 7<tmp.max()<20 and tmp.std()<0.1:
            try:
                data=pd.to_datetime(data)
            except :
                pass
        # Handle possible categorical types
        # (the branch depends on whether datetime_to_category is True)
        if datetime_to_category:
            if len(data.dropna().unique())<np.sqrt(n_sample):
                data=data.astype('category')
        else:
            nunique=len(data.dropna().unique())
            #print(data.dtype)
            if not(is_categorical_dtype(data.dtype)) and not(np.issubdtype(data.dtype,np.datetime64)) and nunique<max_nuniques:
                data=data.astype('category')

        # For non-categorical columns, convert percentage strings to floats, e.g. 21.12% -> 0.2112
        if is_string_dtype(data.dtype) and not(is_categorical_dtype(data.dtype)) and all(data.str.contains('%')):
            data=data.str.strip('%').astype(np.float64)/100

        if is_categorical_dtype(data.dtype):
            vtype='category'
            categories=list(data.cat.categories)
            ordered=data.cat.ordered
        # Datetime type
        elif np.issubdtype(data.dtype,np.datetime64):
            vtype='datetime'
        # Check whether it is structured text
        elif StructureText_detection and tmp.dropna().std()==0:
            # Not iterable, so not a string
            if not(isinstance(data.dropna().iloc[0],Iterable)):
                vtype='text'
            else:
                k=set(list(data.dropna().iloc[0]))
                for x in data:
                    if isinstance(x,str) and len(x)>0:
                        k&=set(list(x))
                if len(k)>0:
                    vtype='text_st'
                else:
                    vtype='text'
        elif is_numeric_dtype(data.dtype):
            vtype='number'
            ordered=False
            categories=[]
        else:
            vtype='text'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    elif is_datetime64_any_dtype(dtype):
        vtype='datetime'
        result={'name':name,'vtype':vtype,'ordered':ordered,'categories':categories}
    else:
        print('unknown dtype!')
        result=None

    if fix:
        return result,data
    else:
        return result
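Two of the string-handling steps above, in isolation: fixed-width strings parsed as dates, and percent strings converted to floats (standalone sketch):

import numpy as np
import pandas as pd

dates = pd.Series(['2021-01-01', '2021-02-15'])
print(pd.to_datetime(dates).dtype)  # datetime64[ns]

pct = pd.Series(['21.12%', '3.5%'])
print((pct.str.strip('%').astype(np.float64) / 100).tolist())  # [0.2112, 0.035]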
Example #18
def _get_color_values(adata,
                      value_to_plot,
                      groups=None,
                      palette=None,
                      use_raw=False):
    """
    Returns the value or color associated with each data point.
    For categorical data, the return value is a list of colors taken
    from the category palette or from the given `palette` value.

    For non-categorical data, the values themselves are returned.
    """

    ###
    # when plotting, the color of the dots is determined for each plot
    # the data is either categorical or continuous and the data could be in
    # 'obs' or in 'var'
    categorical = False
    if value_to_plot is None:
        color_vector = 'lightgray'
    # check if value to plot is in obs
    elif value_to_plot in adata.obs.columns:
        if is_categorical_dtype(adata.obs[value_to_plot]):
            categorical = True

            if palette:
                # use category colors based on the given palette
                _set_colors_for_categorical_obs(adata, value_to_plot, palette)
            else:
                if value_to_plot + '_colors' not in adata.uns or \
                    len(adata.uns[value_to_plot + '_colors']) < len(adata.obs[value_to_plot].cat.categories):
                    #  set a default palette in case that no colors or few colors are found
                    _set_default_colors_for_categorical_obs(
                        adata, value_to_plot)
                else:
                    # check that the colors in 'uns' are valid
                    _palette = []
                    for color in adata.uns[value_to_plot + '_colors']:
                        if not is_color_like(color):
                            # check if the color is a valid R color and translate it
                            # to a valid hex color value
                            if color in utils.additional_colors:
                                color = utils.additional_colors[color]
                            else:
                                logg.warn(
                                    "The following color value found in adata.uns['{}'] "
                                    " is not valid: '{}'. Default colors are used."
                                    .format(value_to_plot + '_colors', color))
                                _set_default_colors_for_categorical_obs(
                                    adata, value_to_plot)
                                _palette = None
                                break
                        _palette.append(color)
                    if _palette is not None:
                        adata.uns[value_to_plot + '_colors'] = _palette
            # for categorical data, colors should be
            # stored in adata.uns[value_to_plot + '_colors']
            # Obtain color vector by converting every category
            # into its respective color

            color_vector = [
                adata.uns[value_to_plot + '_colors'][x]
                for x in adata.obs[value_to_plot].cat.codes
            ]
            if groups is not None:
                if isinstance(groups, str):
                    groups = [groups]
                color_vector = np.array(color_vector, dtype='<U15')
                # set color to 'light gray' for all values
                # that are not in the groups
                color_vector[~adata.obs[value_to_plot].isin(groups
                                                            )] = "lightgray"
        else:
            color_vector = adata.obs[value_to_plot]

    # check if value to plot is in var
    elif use_raw is False and value_to_plot in adata.var_names:
        color_vector = adata[:, value_to_plot].X

    elif use_raw is True and value_to_plot in adata.raw.var_names:
        color_vector = adata.raw[:, value_to_plot].X
    else:
        raise ValueError(
            "The passed `color` {} is not a valid observation annotation "
            "or variable name. Valid observation annotation keys are: {}".
            format(value_to_plot, adata.obs.columns))

    return color_vector, categorical
Example #19
def paga_path(adata,
              nodes,
              keys,
              use_raw=True,
              annotations=['dpt_pseudotime'],
              color_map=None,
              color_maps_annotations={'dpt_pseudotime': 'Greys'},
              palette_groups=None,
              n_avg=1,
              groups_key=None,
              xlim=[None, None],
              title=None,
              left_margin=None,
              ytick_fontsize=None,
              title_fontsize=None,
              show_node_names=True,
              show_yticks=True,
              show_colorbar=True,
              legend_fontsize=None,
              legend_fontweight=None,
              normalize_to_zero_one=False,
              as_heatmap=True,
              return_data=False,
              show=None,
              save=None,
              ax=None):
    """Gene expression and annotation changes along paths in the abstracted graph.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        An annotated data matrix.
    nodes : list of group names or their category indices
        A path through nodes of the abstracted graph, that is, names or indices
        (within `.categories`) of groups that have been used to run PAGA.
    keys : list of str
        Either variables in `adata.var_names` or annotations in
        `adata.obs`. They are plotted using `color_map`.
    use_raw : `bool`, optional (default: `True`)
        Use `adata.raw` for retrieving gene expressions if it has been set.
    annotations : list of annotations, optional (default: ['dpt_pseudotime'])
        Plot these keys with `color_maps_annotations`. Need to be keys for
        `adata.obs`.
    color_map : color map for plotting keys or `None`, optional (default: `None`)
        Matplotlib colormap.
    color_maps_annotations : dict storing color maps or `None`, optional (default: {'dpt_pseudotime': 'Greys'})
        Color maps for plotting the annotations. Keys of the dictionary must
        appear in `annotations`.
    palette_groups : list of colors or `None`, optional (default: `None`)
        Usually, use the same `sc.pl.palettes...` as used for coloring the
        abstracted graph.
    n_avg : `int`, optional (default: 1)
        Number of data points to include in computation of running average.
    groups_key : `str`, optional (default: `None`)
        Key of the grouping used to run PAGA. If `None`, defaults to
        `adata.uns['paga']['groups']`.
    as_heatmap : `bool`, optional (default: `True`)
        Plot the timeseries as heatmap. If not plotting as heatmap,
        `annotations` have no effect.
    show_node_names : `bool`, optional (default: `True`)
        Plot the node names on the nodes bar.
    show_colorbar : `bool`, optional (default: `True`)
        Show the colorbar.
    show_yticks : `bool`, optional (default: `True`)
        Show the y ticks.
    normalize_to_zero_one : `bool`, optional (default: `False`)
        Shift and scale the running average to [0, 1] per gene.
    return_data : `bool`, optional (default: `False`)
        Return the timeseries data in addition to the axes if `True`.
    show : `bool`, optional (default: `None`)
         Show the plot, do not return axis.
    save : `bool` or `str`, optional (default: `None`)
        If `True` or a `str`, save the figure. A string is appended to the
        default filename. Infer the filetype if ending on {'.pdf', '.png', '.svg'}.
    ax : `matplotlib.Axes`
         A matplotlib axes object.

    Returns
    -------
    A `matplotlib.Axes`, if `ax` is `None`, else `None`. If `return_data`,
    return the timeseries data in addition to an axes.
    """
    ax_was_none = ax is None

    if groups_key is None:
        if 'groups' not in adata.uns['paga']:
            raise KeyError(
                'Pass the key of the grouping with which you ran PAGA, '
                'using the parameter `groups_key`.')
        groups_key = adata.uns['paga']['groups']
    groups_names = adata.obs[groups_key].cat.categories

    if palette_groups is None:
        utils.add_colors_for_categorical_sample_annotation(adata, groups_key)
        palette_groups = adata.uns[groups_key + '_colors']

    def moving_average(a):
        return sc_utils.moving_average(a, n_avg)

    ax = pl.gca() if ax is None else ax
    from matplotlib import transforms
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    X = []
    x_tick_locs = [0]
    x_tick_labels = []
    groups = []
    anno_dict = {anno: [] for anno in annotations}
    if isinstance(nodes[0], str):
        nodes_ints = []
        groups_names_set = set(groups_names)
        for node in nodes:
            if node not in groups_names_set:
                raise ValueError(
                    'Each node/group needs to be one of {} (`groups_key`=\'{}\') not \'{}\'.'
                    .format(groups_names.tolist(), groups_key, node))
            nodes_ints.append(groups_names.get_loc(node))
        nodes_strs = nodes
    else:
        nodes_ints = nodes
        nodes_strs = [groups_names[node] for node in nodes]

    adata_X = adata
    if use_raw and adata.raw is not None:
        adata_X = adata.raw

    for ikey, key in enumerate(keys):
        x = []
        for igroup, group in enumerate(nodes_ints):
            idcs = np.arange(adata.n_obs)[adata.obs[groups_key].values ==
                                          nodes_strs[igroup]]
            if len(idcs) == 0:
                raise ValueError(
                    'Did not find data points that match '
                    '`adata.obs[{}].values == str({})`.'
                    ' Check whether adata.obs[{}] actually contains what you expect.'
                    .format(groups_key, group, groups_key))
            idcs_group = np.argsort(adata.obs['dpt_pseudotime'].values[
                adata.obs[groups_key].values == nodes_strs[igroup]])
            idcs = idcs[idcs_group]
            if key in adata.obs_keys(): x += list(adata.obs[key].values[idcs])
            else: x += list(adata_X[:, key].X[idcs])
            if ikey == 0:
                groups += [group for i in range(len(idcs))]
                x_tick_locs.append(len(x))
                for anno in annotations:
                    series = adata.obs[anno]
                    if is_categorical_dtype(series): series = series.cat.codes
                    anno_dict[anno] += list(series.values[idcs])
        if n_avg > 1:
            old_len_x = len(x)
            x = moving_average(x)
            if ikey == 0:
                for key in annotations:
                    if not isinstance(anno_dict[key][0], str):
                        anno_dict[key] = moving_average(anno_dict[key])
        if normalize_to_zero_one:
            x -= np.min(x)
            x /= np.max(x)
        X.append(x)
        if not as_heatmap:
            ax.plot(x[xlim[0]:xlim[1]], label=key)
        if ikey == 0:
            for igroup, group in enumerate(nodes):
                if len(groups_names) > 0 and group not in groups_names:
                    label = groups_names[group]
                else:
                    label = group
                x_tick_labels.append(label)
    X = np.array(X)
    if as_heatmap:
        img = ax.imshow(X,
                        aspect='auto',
                        interpolation='nearest',
                        cmap=color_map)
        if show_yticks:
            ax.set_yticks(range(len(X)))
            ax.set_yticklabels(keys, fontsize=ytick_fontsize)
        else:
            ax.set_yticks([])
        ax.set_frame_on(False)
        ax.set_xticks([])
        ax.tick_params(axis='both', which='both', length=0)
        ax.grid(False)
        if show_colorbar:
            pl.colorbar(img, ax=ax)
        left_margin = 0.2 if left_margin is None else left_margin
        pl.subplots_adjust(left=left_margin)
    else:
        left_margin = 0.4 if left_margin is None else left_margin
        if len(keys) > 1:
            pl.legend(frameon=False,
                      loc='center left',
                      bbox_to_anchor=(-left_margin, 0.5),
                      fontsize=legend_fontsize)
    xlabel = groups_key
    if not as_heatmap:
        ax.set_xlabel(xlabel)
        pl.yticks([])
        if len(keys) == 1: pl.ylabel(keys[0] + ' (a.u.)')
    else:
        import matplotlib.colors
        # groups bar
        ax_bounds = ax.get_position().bounds
        groups_axis = pl.axes([
            ax_bounds[0], ax_bounds[1] - ax_bounds[3] / len(keys),
            ax_bounds[2], ax_bounds[3] / len(keys)
        ])
        groups = np.array(groups)[None, :]
        groups_axis.imshow(
            groups,
            aspect='auto',
            interpolation="nearest",
            cmap=matplotlib.colors.ListedColormap(
                # the following line doesn't work because of normalization
                # adata.uns['paga_groups_colors'])
                palette_groups[np.min(groups).astype(int):],
                N=int(np.max(groups) + 1 - np.min(groups))))
        if show_yticks:
            groups_axis.set_yticklabels(['', xlabel, ''],
                                        fontsize=ytick_fontsize)
        else:
            groups_axis.set_yticks([])
        groups_axis.set_frame_on(False)
        if show_node_names:
            ypos = (groups_axis.get_ylim()[1] + groups_axis.get_ylim()[0]) / 2
            x_tick_locs = sc_utils.moving_average(x_tick_locs, n=2)
            for ilabel, label in enumerate(x_tick_labels):
                groups_axis.text(x_tick_locs[ilabel],
                                 ypos,
                                 x_tick_labels[ilabel],
                                 fontdict={
                                     'horizontalalignment': 'center',
                                     'verticalalignment': 'center'
                                 })
        groups_axis.set_xticks([])
        groups_axis.grid(False)
        groups_axis.tick_params(axis='both', which='both', length=0)
        # further annotations
        y_shift = ax_bounds[3] / len(keys)
        for ianno, anno in enumerate(annotations):
            if ianno > 0: y_shift = ax_bounds[3] / len(keys) / 2
            anno_axis = pl.axes([
                ax_bounds[0], ax_bounds[1] - (ianno + 2) * y_shift,
                ax_bounds[2], y_shift
            ])
            arr = np.array(anno_dict[anno])[None, :]
            if anno not in color_maps_annotations:
                color_map_anno = ('Vega10' if is_categorical_dtype(
                    adata.obs[anno]) else 'Greys')
            else:
                color_map_anno = color_maps_annotations[anno]
            img = anno_axis.imshow(arr,
                                   aspect='auto',
                                   interpolation='nearest',
                                   cmap=color_map_anno)
            if show_yticks:
                anno_axis.set_yticklabels(['', anno, ''],
                                          fontsize=ytick_fontsize)
                anno_axis.tick_params(axis='both', which='both', length=0)
            else:
                anno_axis.set_yticks([])
            anno_axis.set_frame_on(False)
            anno_axis.set_xticks([])
            anno_axis.grid(False)
    if title is not None: ax.set_title(title, fontsize=title_fontsize)
    if show is None and not ax_was_none: show = False
    else: show = settings.autoshow if show is None else show
    utils.savefig_or_show('paga_path', show=show, save=save)
    if return_data:
        df = pd.DataFrame(data=X.T, columns=keys)
        df['groups'] = moving_average(
            groups)  # groups is without moving average, yet
        if 'dpt_pseudotime' in anno_dict:
            df['distance'] = anno_dict['dpt_pseudotime'].T
        return (ax, df) if ax_was_none and show == False else df
    else:
        return ax if ax_was_none and show == False else None
Example #20
    def test_get_col(self):
        self.assertIsInstance(self.explainer.get_col("Sex"), pd.Series)
        self.assertTrue(is_categorical_dtype(self.explainer.get_col("Sex")))

        self.assertIsInstance(self.explainer.get_col("Age"), pd.Series)
        self.assertTrue(is_numeric_dtype(self.explainer.get_col("Age")))
Example #21
File: data.py Project: hetong007/xgboost
def _transform_cudf_df(
    data,
    feature_names: FeatNamesT,
    feature_types: Optional[List[str]],
    enable_categorical: bool,
):
    try:
        from cudf.api.types import is_categorical_dtype
    except ImportError:
        from cudf.utils.dtypes import is_categorical_dtype

    if _is_cudf_ser(data):
        dtypes = [data.dtype]
    else:
        dtypes = data.dtypes

    if not all(dtype.name in _pandas_dtype_mapper or
               (is_categorical_dtype(dtype) and enable_categorical)
               for dtype in dtypes):
        _invalid_dataframe_dtype(data)

    # handle feature names
    if feature_names is None:
        if _is_cudf_ser(data):
            feature_names = [data.name]
        elif lazy_isinstance(data.columns, "cudf.core.multiindex",
                             "MultiIndex"):
            feature_names = [
                " ".join([str(x) for x in i]) for i in data.columns
            ]
        elif (lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
              or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
              # Unique to cuDF, no equivalent in pandas 1.3.3
              or lazy_isinstance(data.columns, "cudf.core.index",
                                 "Int32Index")):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    # handle feature types
    if feature_types is None:
        feature_types = []
        for dtype in dtypes:
            if is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    # handle categorical data
    cat_codes = []
    if _is_cudf_ser(data):
        # unlike pandas, cuDF uses NA for missing data.
        if is_categorical_dtype(data.dtype) and enable_categorical:
            codes = data.cat.codes
            cat_codes.append(codes)
    else:
        for col in data:
            if is_categorical_dtype(data[col].dtype) and enable_categorical:
                codes = data[col].cat.codes
                cat_codes.append(codes)

    return data, cat_codes, feature_names, feature_types
Example #22
def _get_color_values(adata,
                      value_to_plot,
                      groups=None,
                      palette=None,
                      use_raw=False,
                      gene_symbols=None,
                      layer=None) -> Tuple[Union[np.ndarray, str], bool]:
    """
    Returns the value or color associated with each data point.
    For categorical data, the return value is a list of colors taken
    from the category palette or from the given `palette` value.

    For non-categorical data, the values themselves are returned.

    Returns
    -------
    Tuple of values to plot, and boolean indicating whether they are categorical.
    """
    if value_to_plot is None:
        return "lightgray", False
    if (gene_symbols is not None and value_to_plot not in adata.obs.columns
            and value_to_plot not in adata.var_names):
        # We should probably just make an index for this, and share it over runs
        value_to_plot = adata.var.index[
            adata.var[gene_symbols] == value_to_plot][
                0]  # TODO: Throw helpful error if this doesn't work
    if use_raw and value_to_plot not in adata.obs.columns:
        values = adata.raw.obs_vector(value_to_plot)
    else:
        values = adata.obs_vector(value_to_plot, layer=layer)

    ###
    # when plotting, the color of the dots is determined for each plot
    # the data is either categorical or continuous and the data could be in
    # 'obs' or in 'var'
    if not is_categorical_dtype(values):
        return values, False
    else:  # is_categorical_dtype(values)
        color_key = f"{value_to_plot}_colors"
        if palette:
            _set_colors_for_categorical_obs(adata, value_to_plot, palette)
        elif color_key not in adata.uns or \
            len(adata.uns[color_key]) < len(values.categories):
            #  set a default palette in case that no colors or few colors are found
            _set_default_colors_for_categorical_obs(adata, value_to_plot)
        else:
            _palette = []
            for color in adata.uns[color_key]:
                if not is_color_like(color):
                    # check if the color is a valid R color and translate it
                    # to a valid hex color value
                    if color in _utils.additional_colors:
                        color = _utils.additional_colors[color]
                    else:
                        logg.warning(
                            f"The following color value found in adata.uns['{value_to_plot}_colors'] "
                            f"is not valid: '{color}'. Default colors are used."
                        )
                        _set_default_colors_for_categorical_obs(
                            adata, value_to_plot)
                        _palette = None
                        break
                _palette.append(color)
            if _palette is not None:
                adata.uns[color_key] = _palette
        color_vector = np.asarray(adata.uns[color_key])[values.codes]

        # Handle groups
        if groups is not None:
            if isinstance(groups, str):
                groups = [groups]
            color_vector = np.array(color_vector, dtype='<U15')
            # set color to 'light gray' for all values
            # that are not in the groups
            color_vector[~adata.obs[value_to_plot].isin(groups)] = "lightgray"
        return color_vector, True
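The color lookup itself is just indexing a palette with the category codes; a minimal sketch outside of AnnData:

import numpy as np
import pandas as pd

values = pd.Categorical(['T', 'B', 'T', 'NK'])           # categories sort to ['B', 'NK', 'T']
palette = np.asarray(['#1f77b4', '#ff7f0e', '#2ca02c'])  # one color per category

color_vector = palette[values.codes]
print(color_vector.tolist())  # ['#2ca02c', '#1f77b4', '#2ca02c', '#ff7f0e']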
Example #23
File: simple.py Project: idroz/scanpy
def regress_out(adata, keys, n_jobs=None, copy=False):
    """Regress out unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15].

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        The annotated data matrix.
    keys : `str` or list of `str`
        Keys for observation annotation on which to regress on.
    n_jobs : `int` or `None`, optional (default: `None`)
        Number of jobs for parallel computation. If `None`, the global `n_jobs` setting is used.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    AnnData, None
        Depending on `copy` returns or updates `adata` with the corrected data matrix.
    """
    logg.info('regressing out', keys, r=True)
    if issparse(adata.X):
        logg.info('    sparse input is densified and may '
                  'lead to high memory use')
    adata = adata.copy() if copy else adata
    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # regress on a single categorical variable
    sanitize_anndata(adata)
    variable_is_categorical = False
    if keys[0] in adata.obs_keys() and is_categorical_dtype(
            adata.obs[keys[0]]):
        if len(keys) > 1:
            raise ValueError('If providing categorical variable, '
                             'only a single one is allowed. For this one '
                             'we regress on the mean for each category.')
        logg.msg('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    tasks = []
    # split the adata.X matrix by columns in chunks of size n_chunk (the last chunk could be of smaller
    # size than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk eg. (adata.X[:,0:100]) and
        # the regressors. This data will be passed to each of the jobs.
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append(tuple((data_chunk, regres, variable_is_categorical)))

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing
        pool = multiprocessing.Pool(n_jobs)
        res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
        pool.close()

    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of vectors (each corresponding to a regressed gene column).
    # The transpose is needed to get the matrix in the shape needed
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info('    finished', t=True)
    return adata if copy else None
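The categorical regressor construction on its own: every cell gets the per-gene mean of its category (a small numpy/pandas sketch, not scanpy's API):

import numpy as np
import pandas as pd

X = np.array([[1., 2.], [3., 4.], [5., 6.]])            # cells x genes
labels = pd.Series(['a', 'a', 'b'], dtype='category')   # one category per cell

regressors = np.zeros_like(X)
for category in labels.cat.categories:
    mask = (labels == category).values
    regressors[mask] = X[mask].mean(axis=0)  # broadcast per-category gene means
print(regressors)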
Example #24
def de_analysis(
    data: Union[MultimodalData, UnimodalData, AnnData],
    cluster: str,
    condition: Optional[str] = None,
    subset: Optional[List[str]] = None,
    de_key: Optional[str] = "de_res",
    n_jobs: Optional[int] = -1,
    t: Optional[bool] = False,
    fisher: Optional[bool] = False,
    temp_folder: Optional[str] = None,
    verbose: Optional[bool] = True,
) -> None:
    """Perform Differential Expression (DE) Analysis on data.

    The analysis considers one cluster at one time, comparing gene expression levels on cells
    within the cluster with all the others using a number of statistical tools, and determining
    up-regulated genes and down-regulated genes of the cluster.

    Mann-Whitney U test and AUROC are calculated by default; Welch's t-test and Fisher's exact test are optional.

    The calculation of all the test statistics is designed for scalability, inspired by `Presto <https://github.com/immunogenomics/presto>`_.

    Parameters
    ----------
    data: ``MultimodalData``, ``UnimodalData``, or ``anndata.AnnData``
        Data matrix with rows for cells and columns for genes.

    cluster: ``str``
        Cluster labels used in DE analysis. Must exist in ``data.obs``.

    condition: ``str``, optional, default: ``None``
        Sample attribute used as condition in DE analysis. If ``None``, no condition is considered; otherwise, must exist in ``data.obs``.
        If ``condition`` is used, the DE analysis will be performed separately on the cells of each level of ``data.obs[condition]``, and the results are collected afterwards.

    subset: ``List[str]``, optional, default: ``None``
        Perform DE analysis on only a subset of cluster IDs. Cluster ID subset is specified as a list of strings, such as ``[clust_1,clust_3,clust_5]``, where all IDs must exist in ``data.obs[cluster]``.

    de_key: ``str``, optional, default: ``"de_res"``
        Key name of DE analysis results stored.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    t: ``bool``, optional, default: ``False``
        If ``True``, calculate Welch's t test.

    fisher: ``bool``, optional, default: ``False``
        If ``True``, calculate Fisher's exact test.

    temp_folder: ``str``, optional, default: ``None``
        Joblib temporary folder for memmapping numpy arrays.

    verbose: ``bool``, optional, default: ``True``
        If ``True``, show detailed intermediate output.

    Returns
    -------
    ``None``

    Update ``data.varm``:
        ``data.varm[de_key]``: DE analysis result.

    Examples
    --------
    >>> pg.de_analysis(data, cluster='spectral_leiden_labels')
    >>> pg.de_analysis(data, cluster='louvain_labels', condition='anno')
    """
    if cluster not in data.obs:
        raise ValueError("Cannot find cluster label!")
    cluster_labels = data.obs[cluster].values
    if not is_categorical_dtype(cluster_labels):
        from natsort import natsorted
        cluster_labels = pd.Categorical(cluster_labels, natsorted(np.unique(cluster_labels)))

    cond_labels = None
    if condition is not None:
        if condition not in data.obs:
            raise ValueError("Cannot find condition!")
        cond_labels = data.obs[condition].values
        if not is_categorical_dtype(cond_labels):
            from natsort import natsorted
            cond_labels = pd.Categorical(cond_labels, natsorted(np.unique(cond_labels)))
        if cond_labels.categories.size < 2:
            raise ValueError("Number of conditions must be at least 2!")

    X = data.X if isinstance(data.X, csr_matrix) else csr_matrix(data.X) # If dense matrix, force it to be a csr_matrix

    if subset is not None:
        # subset data for de analysis
        subset = np.array(subset)
        idx_s = np.isin(subset, cluster_labels.categories.values)
        if idx_s.sum() < subset.size:
            raise ValueError(
                "These cluster labels do not exist: " + ",".join(subset[~idx_s]) + "!"
            )

        idx = np.isin(cluster_labels, subset)
        cluster_labels = pd.Categorical(cluster_labels[idx], categories = subset)
        if cond_labels is not None:
            cond_labels = cond_labels[idx]
        X = X[idx]

    if condition is not None:
        #Eliminate NaN rows from calculation
        idx_na = cond_labels.isna()
        if idx_na.sum() > 0:
            logger.warning("Detected NaN values in condition. Cells with NaN values are excluded from DE analysis.")
            idx_not_na = ~idx_na
            X = X[idx_not_na]
            cluster_labels = cluster_labels[idx_not_na]
            cond_labels = cond_labels[idx_not_na]

    n_jobs = eff_n_jobs(n_jobs)
    gene_names = data.var_names.values

    if cond_labels is None:
        df = _de_test(X, cluster_labels, gene_names, n_jobs, t, fisher, temp_folder, verbose)
    else:
        df = _de_test_cond(X, cluster_labels, cond_labels, gene_names, n_jobs, t, fisher, temp_folder, verbose)

    data.varm[de_key] = df.to_records(index=False)

    logger.info("Differential expression analysis is finished.")
示例#25
0
文件: paga.py 项目: gamazeps/scanpy
def _paga_graph(
    adata,
    ax,
    solid_edges=None,
    dashed_edges=None,
    adjacency_solid=None,
    adjacency_dashed=None,
    transitions=None,
    threshold=None,
    root=0,
    colors=None,
    labels=None,
    fontsize=None,
    fontweight=None,
    fontoutline=None,
    text_kwds=None,
    node_size_scale=1.,
    node_size_power=0.5,
    edge_width_scale=1.,
    normalize_to_color='reference',
    title=None,
    pos=None,
    cmap=None,
    frameon=True,
    min_edge_width=None,
    max_edge_width=None,
    export_to_gexf=False,
    cax=None,
    colorbar=None,
    use_raw=True,
    cb_kwds=None,
    single_component=False,
    arrowsize=30,
):
    import networkx as nx
    if text_kwds is None: text_kwds = {}
    if cb_kwds is None: cb_kwds = {}

    node_labels = labels  # rename for clarity
    if (node_labels is not None and isinstance(node_labels, str)
            and node_labels != adata.uns['paga']['groups']):
        raise ValueError(
            'Provide a list of group labels for the PAGA groups {}, not {}.'.
            format(adata.uns['paga']['groups'], node_labels))
    groups_key = adata.uns['paga']['groups']
    if node_labels is None:
        node_labels = adata.obs[groups_key].cat.categories

    if (colors is None or colors == groups_key) and groups_key is not None:
        if (groups_key + '_colors' not in adata.uns
                or len(adata.obs[groups_key].cat.categories) != len(
                    adata.uns[groups_key + '_colors'])):
            utils.add_colors_for_categorical_sample_annotation(
                adata, groups_key)
        colors = adata.uns[groups_key + '_colors']
        for iname, name in enumerate(adata.obs[groups_key].cat.categories):
            if name in settings.categories_to_ignore: colors[iname] = 'grey'

    nx_g_solid = nx.Graph(adjacency_solid)
    if dashed_edges is not None:
        nx_g_dashed = nx.Graph(adjacency_dashed)

    # convert pos to array and dict
    if not isinstance(pos, (Path, str)):
        pos_array = pos
    else:
        pos = Path(pos)
        if pos.suffix != '.gdf':
            raise ValueError(
                'Currently only supporting reading positions from .gdf files. '
                'Consider generating them using, for instance, Gephi.')
        s = ''  # read the node definition from the file
        with pos.open() as f:
            f.readline()
            for line in f:
                if line.startswith('edgedef>'):
                    break
                s += line
        from io import StringIO
        df = pd.read_csv(StringIO(s), header=None)  # the .gdf node section has no header row
        pos_array = df[[4, 5]].values

    # convert to dictionary
    pos = {n: [p[0], p[1]] for n, p in enumerate(pos_array)}

    # uniform color
    if isinstance(colors, str) and is_color_like(colors):
        colors = [colors for c in range(len(node_labels))]

    # color degree of the graph
    if isinstance(colors, str) and colors.startswith('degree'):
        # see also tools.paga.paga_degrees
        if colors == 'degree_dashed':
            colors = [d for _, d in nx_g_dashed.degree(weight='weight')]
        elif colors == 'degree_solid':
            colors = [d for _, d in nx_g_solid.degree(weight='weight')]
        else:
            raise ValueError(
                '`colors` must be either "degree_dashed" or "degree_solid".')
        colors = (np.array(colors) - np.min(colors)) / (np.max(colors) -
                                                        np.min(colors))

    # plot gene expression
    var_names = adata.var_names if adata.raw is None else adata.raw.var_names
    if isinstance(colors, str) and colors in var_names:
        x_color = []
        cats = adata.obs[groups_key].cat.categories
        for icat, cat in enumerate(cats):
            subset = (cat == adata.obs[groups_key]).values
            if adata.raw is not None and use_raw:
                adata_gene = adata.raw[:, colors]
            else:
                adata_gene = adata[:, colors]
            x_color.append(np.mean(adata_gene.X[subset]))
        colors = x_color

    # plot continuous annotation
    if (isinstance(colors, str) and colors in adata.obs
            and not is_categorical_dtype(adata.obs[colors])):
        x_color = []
        cats = adata.obs[groups_key].cat.categories
        for icat, cat in enumerate(cats):
            subset = (cat == adata.obs[groups_key]).values
            x_color.append(adata.obs.loc[subset, colors].mean())
        colors = x_color

    # plot categorical annotation
    if (isinstance(colors, str) and colors in adata.obs
            and is_categorical_dtype(adata.obs[colors])):
        from ... import utils as sc_utils
        asso_names, asso_matrix = sc_utils.compute_association_matrix_of_groups(
            adata,
            prediction=groups_key,
            reference=colors,
            normalization='reference' if normalize_to_color else 'prediction')
        utils.add_colors_for_categorical_sample_annotation(adata, colors)
        asso_colors = sc_utils.get_associated_colors_of_groups(
            adata.uns[colors + '_colors'], asso_matrix)
        colors = asso_colors

    if len(colors) < len(node_labels):
        print(node_labels, colors)
        raise ValueError(
            '`color` list needs to be at least as long as the `groups`/`node_labels` list.'
        )

    # count number of connected components
    n_components, labels = scipy.sparse.csgraph.connected_components(
        adjacency_solid)
    if n_components > 1 and not single_component:
        logg.debug(
            'Graph has more than a single connected component. '
            'To restrict to this component, pass `single_component=True`.')
    if n_components > 1 and single_component:
        component_sizes = np.bincount(labels)
        largest_component = np.where(
            component_sizes == component_sizes.max())[0][0]
        adjacency_solid = adjacency_solid.tocsr()[labels ==
                                                  largest_component, :]
        adjacency_solid = adjacency_solid.tocsc()[:,
                                                  labels == largest_component]
        colors = np.array(colors)[labels == largest_component]
        node_labels = np.array(node_labels)[labels == largest_component]
        cats_dropped = adata.obs[groups_key].cat.categories[
            labels != largest_component].tolist()
        logg.info(
            'Restricting graph to largest connected component by dropping categories\n'
            f'{cats_dropped}')
        nx_g_solid = nx.Graph(adjacency_solid)
        if dashed_edges is not None:
            raise ValueError(
                '`single_component` only if `dashed_edges` is `None`.')

    # edge widths
    base_edge_width = edge_width_scale * 5 * rcParams['lines.linewidth']

    # draw dashed edges
    if dashed_edges is not None:
        widths = [x[-1]['weight'] for x in nx_g_dashed.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if max_edge_width is not None:
            widths = np.clip(widths, None, max_edge_width)
        nx.draw_networkx_edges(nx_g_dashed,
                               pos,
                               ax=ax,
                               width=widths,
                               edge_color='grey',
                               style='dashed',
                               alpha=0.5)

    # draw solid edges
    if transitions is None:
        widths = [x[-1]['weight'] for x in nx_g_solid.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nx.draw_networkx_edges(nx_g_solid,
                                   pos,
                                   ax=ax,
                                   width=widths,
                                   edge_color='black')
    # draw directed edges
    else:
        adjacency_transitions = adata.uns['paga'][transitions].copy()
        if threshold is None: threshold = 0.01
        adjacency_transitions.data[adjacency_transitions.data < threshold] = 0
        adjacency_transitions.eliminate_zeros()
        g_dir = nx.DiGraph(adjacency_transitions.T)
        widths = [x[-1]['weight'] for x in g_dir.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        nx.draw_networkx_edges(g_dir,
                               pos,
                               ax=ax,
                               width=widths,
                               edge_color='black',
                               arrowsize=arrowsize)

    if export_to_gexf:
        if isinstance(colors[0], tuple):
            from matplotlib.colors import rgb2hex
            colors = [rgb2hex(c) for c in colors]
        for count, n in enumerate(nx_g_solid.nodes()):
            nx_g_solid.nodes[count]['label'] = str(node_labels[count])
            nx_g_solid.nodes[count]['color'] = str(colors[count])
            nx_g_solid.nodes[count]['viz'] = {
                'position': {
                    'x': 1000 * pos[count][0],
                    'y': 1000 * pos[count][1],
                    'z': 0
                }
            }
        filename = settings.writedir / 'paga_graph.gexf'
        logg.warning(f'exporting to {filename}')
        settings.writedir.mkdir(parents=True, exist_ok=True)
        nx.write_gexf(nx_g_solid, filename)

    ax.set_frame_on(frameon)
    ax.set_xticks([])
    ax.set_yticks([])

    # groups sizes
    if groups_key is not None and groups_key + '_sizes' in adata.uns:
        groups_sizes = adata.uns[groups_key + '_sizes']
    else:
        groups_sizes = np.ones(len(node_labels))
    base_scale_scatter = 2000
    base_pie_size = (base_scale_scatter /
                     (np.sqrt(adjacency_solid.shape[0]) + 10) *
                     node_size_scale)
    median_group_size = np.median(groups_sizes)
    groups_sizes = base_pie_size * np.power(groups_sizes / median_group_size,
                                            node_size_power)

    if fontsize is None:
        fontsize = rcParams['legend.fontsize']
    if fontoutline is not None:
        text_kwds['path_effects'] = [
            patheffects.withStroke(linewidth=fontoutline, foreground='w')
        ]
    # usual scatter plot
    if not isinstance(colors[0], dict):
        n_groups = len(pos_array)
        sct = ax.scatter(pos_array[:, 0],
                         pos_array[:, 1],
                         c=colors[:n_groups],
                         edgecolors='face',
                         s=groups_sizes,
                         cmap=cmap)
        for count, group in enumerate(node_labels):
            ax.text(pos_array[count, 0],
                    pos_array[count, 1],
                    group,
                    verticalalignment='center',
                    horizontalalignment='center',
                    size=fontsize,
                    fontweight=fontweight,
                    **text_kwds)
    # else pie chart plot
    else:
        # start with this dummy plot... otherwise strange behavior
        sct = ax.scatter(pos_array[:, 0],
                         pos_array[:, 1],
                         c='white',
                         edgecolors='face',
                         s=groups_sizes,
                         cmap=cmap)
        trans = ax.transData.transform
        bbox = ax.get_position().get_points()
        ax_x_min = bbox[0, 0]
        ax_x_max = bbox[1, 0]
        ax_y_min = bbox[0, 1]
        ax_y_max = bbox[1, 1]
        ax_len_x = ax_x_max - ax_x_min
        ax_len_y = ax_y_max - ax_y_min
        trans2 = ax.transAxes.inverted().transform
        pie_axs = []
        for count, n in enumerate(nx_g_solid.nodes()):
            pie_size = groups_sizes[count] / base_scale_scatter
            x1, y1 = trans(pos[n])  # data coordinates
            xa, ya = trans2((x1, y1))  # axis coordinates
            xa = ax_x_min + (xa - pie_size / 2) * ax_len_x
            ya = ax_y_min + (ya - pie_size / 2) * ax_len_y
            # clip, the fruchterman layout sometimes places below figure
            if ya < 0: ya = 0
            if xa < 0: xa = 0
            pie_axs.append(
                pl.axes([xa, ya, pie_size * ax_len_x, pie_size * ax_len_y],
                        frameon=False))
            pie_axs[count].set_xticks([])
            pie_axs[count].set_yticks([])
            if not isinstance(colors[count], dict):
                raise ValueError(
                    '{} is neither a dict of valid matplotlib colors '
                    'nor a valid matplotlib color.'.format(colors[count]))
            color_single = colors[count].keys()
            fracs = [colors[count][c] for c in color_single]
            if sum(fracs) < 1:
                color_single = list(color_single)
                color_single.append('grey')
                fracs.append(1 - sum(fracs))
            pie_axs[count].pie(fracs, colors=color_single)
        if node_labels is not None:
            for ia, a in enumerate(pie_axs):
                a.text(0.5,
                       0.5,
                       node_labels[ia],
                       verticalalignment='center',
                       horizontalalignment='center',
                       transform=a.transAxes,
                       size=fontsize,
                       fontweight=fontweight,
                       **text_kwds)
    return sct
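
The degree-based coloring branch above is just a min-max rescaling of weighted node degrees. A self-contained sketch on a toy graph (not scanpy's API, only the normalization step):

import networkx as nx
import numpy as np

# Toy weighted graph; compute weighted degrees and rescale them to [0, 1],
# mirroring the 'degree_solid' branch of _paga_graph.
g = nx.Graph()
g.add_weighted_edges_from([(0, 1, 0.5), (1, 2, 2.0), (2, 0, 1.0)])
degrees = np.array([d for _, d in g.degree(weight='weight')])
colors = (degrees - degrees.min()) / (degrees.max() - degrees.min())
print(colors)
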
示例#26
0
def infer_variable_types(df, link_vars, variable_types, time_index,
                         secondary_time_index):
    '''Infer variable types from dataframe

    Args:
        df (DataFrame): Input DataFrame
        link_vars (list[]): Linked variables
        variable_types (dict[str -> dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids to types (:class:`.Variable`)
            or (type, kwargs) to pass keyword arguments to the Variable.
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
            that each map to a list of columns that depend on that secondary time
    '''
    # TODO: set pk and pk types here
    inferred_types = {}
    vids_to_assume_datetime = [time_index]
    if len(list(secondary_time_index.keys())):
        vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
    inferred_type = vtypes.Unknown
    for variable in df.columns:
        if variable in variable_types:
            continue

        elif variable in vids_to_assume_datetime:
            if col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Numeric

        elif variable in link_vars:
            inferred_type = vtypes.Categorical

        elif df[variable].dtype == "object":
            if not len(df[variable]):
                inferred_type = vtypes.Categorical
            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime
            else:
                inferred_type = vtypes.Categorical

                # heuristics to predict whether this is something other than categorical
                sample = df[variable].sample(min(10000, len(df[variable])))

                # catch cases where object dtype cannot be interpreted as a string
                try:
                    avg_length = sample.str.len().mean()
                    if avg_length > 50:
                        inferred_type = vtypes.Text
                except AttributeError:
                    pass

        elif df[variable].dtype == "bool":
            inferred_type = vtypes.Boolean

        elif pdtypes.is_categorical_dtype(df[variable].dtype):
            inferred_type = vtypes.Categorical

        elif pdtypes.is_numeric_dtype(df[variable].dtype):
            inferred_type = vtypes.Numeric

        elif col_is_datetime(df[variable]):
            inferred_type = vtypes.Datetime

        elif len(df[variable]):
            sample = df[variable] \
                .sample(min(10000, df[variable].nunique(dropna=False)))

            unique = sample.unique()
            percent_unique = sample.size / len(unique)

            if percent_unique < .05:
                inferred_type = vtypes.Categorical
            else:
                inferred_type = vtypes.Numeric

        inferred_types[variable] = inferred_type

    return inferred_types
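
The object-dtype branch above separates free text from categorical codes with an average-string-length heuristic (threshold of 50 characters, taken from the code). A standalone sketch of that check:

import pandas as pd

# Short string values -> Categorical, long ones -> Text.
col = pd.Series(["red", "green", "blue", "green"])
sample = col.sample(min(10000, len(col)))
avg_length = sample.str.len().mean()
inferred = "Text" if avg_length > 50 else "Categorical"
print(avg_length, inferred)  # 4.25 Categorical
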
示例#27
0
文件: methods.py 项目: vecchp/dask
def concat_pandas(dfs, axis=0, join='outer', uniform=False, filter_warning=True):
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join, **concat_kwargs)

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                    for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)
            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter('ignore', FutureWarning)
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join,
                                     **concat_kwargs).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            # this should be aligned, so no need to filter warning
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join, **concat_kwargs)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter("ignore", FutureWarning)
                out = pd.concat(dfs3, join=join, **concat_kwargs)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        with warnings.catch_warnings():
            if filter_warning:
                warnings.simplefilter('ignore', FutureWarning)
            out = pd.concat(dfs2, join=join, **concat_kwargs)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
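
The categorical handling above hinges on `union_categoricals`, which merges category sets without falling back to object dtype. A minimal standalone sketch:

import pandas as pd
from pandas.api.types import union_categoricals

# Two partitions with different category sets; the concatenated result stays
# categorical and carries the union of both category sets.
a = pd.Series(["x", "y"], dtype="category")
b = pd.Series(["y", "z"], dtype="category")
merged = pd.Series(union_categoricals([a, b]))
print(merged.dtype, list(merged.cat.categories))  # category ['x', 'y', 'z']
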
示例#28
0
def get_df(
    data,
    keys=None,
    layer=None,
    index=None,
    columns=None,
    sort_values=None,
    dropna="all",
    precision=None,
):
    """Get dataframe for a specified adata key.

    Return values for specified key
    (in obs, var, obsm, varm, obsp, varp, uns, or layers) as a dataframe.

    Arguments
    ---------
    data
        AnnData object or a numpy array to get values from.
    keys
        Keys from `.var_names`, `.obs_names`, `.var`, `.obs`,
        `.obsm`, `.varm`, `.obsp`, `.varp`, `.uns`, or `.layers`.
    layer
        Layer of `adata` to use as expression values.
    index
        List to set as index.
    columns
        List to set as column names.
    sort_values
        Whether to sort values by the first column (sort_values=True) or by a specified column.
    dropna
        Drop columns/rows that contain NaNs in all ('all') or in any entry ('any').
    precision
        Set precision for pandas dataframe.

    Returns
    -------
    A dataframe.
    """
    if precision is not None:
        pd.set_option("precision", precision)

    if isinstance(data, AnnData):
        keys, keys_split = (keys.split("*") if isinstance(keys, str)
                            and "*" in keys else (keys, None))
        keys, key_add = (keys.split("/") if isinstance(keys, str)
                         and "/" in keys else (keys, None))
        keys = [keys] if isinstance(keys, str) else keys
        key = keys[0]

        s_keys = ["obs", "var", "obsm", "varm", "uns", "layers"]
        d_keys = [
            data.obs.keys(),
            data.var.keys(),
            data.obsm.keys(),
            data.varm.keys(),
            data.uns.keys(),
            data.layers.keys(),
        ]

        if hasattr(data, "obsp") and hasattr(data, "varp"):
            s_keys.extend(["obsp", "varp"])
            d_keys.extend([data.obsp.keys(), data.varp.keys()])

        if keys is None:
            df = data.to_df()
        elif key in data.var_names:
            df = obs_df(data, keys, layer=layer)
        elif key in data.obs_names:
            df = var_df(data, keys, layer=layer)
        else:
            if keys_split is not None:
                keys = [
                    k for k in list(data.obs.keys()) + list(data.var.keys())
                    if key in k and keys_split in k
                ]
                key = keys[0]
            s_key = [s for (s, d_key) in zip(s_keys, d_keys) if key in d_key]
            if len(s_key) == 0:
                raise ValueError(
                    f"'{key}' not found in any of {', '.join(s_keys)}.")
            if len(s_key) > 1:
                logg.warn(
                    f"'{key}' found multiple times in {', '.join(s_key)}.")

            s_key = s_key[-1]
            df = getattr(data, s_key)[keys if len(keys) > 1 else key]
            if key_add is not None:
                df = df[key_add]
            if index is None:
                index = (data.var_names if s_key == "varm" else data.obs_names
                         if s_key in {"obsm", "layers"} else None)
                if index is None and s_key == "uns" and hasattr(df, "shape"):
                    key_cats = np.array([
                        key for key in data.obs.keys()
                        if is_categorical_dtype(data.obs[key])
                    ])
                    num_cats = [
                        len(data.obs[key].cat.categories) == df.shape[0]
                        for key in key_cats
                    ]
                    if np.sum(num_cats) == 1:
                        index = data.obs[key_cats[num_cats][0]].cat.categories
                        if (columns is None and len(df.shape) > 1
                                and df.shape[0] == df.shape[1]):
                            columns = index
            elif isinstance(index, str) and index in data.obs.keys():
                index = pd.Categorical(data.obs[index]).categories
            if columns is None and s_key == "layers":
                columns = data.var_names
            elif isinstance(columns, str) and columns in data.obs.keys():
                columns = pd.Categorical(data.obs[columns]).categories
    elif isinstance(data, pd.DataFrame):
        if isinstance(keys, str) and "*" in keys:
            keys, keys_split = keys.split("*")
            keys = [k for k in data.columns if keys in k and keys_split in k]
        df = data[keys] if keys is not None else data
    else:
        df = data

    if issparse(df):
        df = np.array(df.A)
    if columns is None and hasattr(df, "names"):
        columns = df.names

    df = pd.DataFrame(df, index=index, columns=columns)

    if dropna:
        df.replace("", np.nan, inplace=True)
        how = dropna if isinstance(dropna,
                                   str) else "any" if dropna is True else "all"
        df.dropna(how=how, axis=0, inplace=True)
        df.dropna(how=how, axis=1, inplace=True)

    if sort_values:
        sort_by = (sort_values if isinstance(sort_values, str)
                   and sort_values in df.columns else df.columns[0])
        df = df.sort_values(by=sort_by, ascending=False)

    if hasattr(data, "var_names"):
        if df.index[0] in data.var_names:
            df.var_names = df.index
        elif df.columns[0] in data.var_names:
            df.var_names = df.columns
    if hasattr(data, "obs_names"):
        if df.index[0] in data.obs_names:
            df.obs_names = df.index
        elif df.columns[0] in data.obs_names:
            df.obs_names = df.columns

    return df
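
A brief usage sketch of `get_df` (hypothetical: assumes an AnnData object `adata` with a UMAP embedding stored in `.obsm['X_umap']`):

# Returns the embedding as a DataFrame indexed by the observation (cell) names.
df = get_df(adata, keys="X_umap")
print(df.head())
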
示例#29
0
    def _calculate_agg_features(self, features, frame, df_trie,
                                progress_callback):
        test_feature = features[0]
        child_entity = test_feature.base_features[0].entity
        base_frame = df_trie.get_node(test_feature.relationship_path).value
        parent_merge_var = test_feature.relationship_path[0][
            1].parent_variable.id
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        fl = []
        for f in features:
            for ind in f.get_feature_names():
                if ind not in frame.columns:
                    fl.append(f)
                    break
        features = fl
        if not len(features):
            progress_callback(len(features) / float(self.num_features))
            return frame

        # handle where
        base_frame_empty = base_frame.empty if isinstance(
            base_frame, pd.DataFrame) else False
        where = test_feature.where
        if where is not None and not base_frame_empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        base_frame_empty = base_frame.empty if isinstance(
            base_frame, pd.DataFrame) else False
        if base_frame_empty:
            feature_values = []
            for f in features:
                feature_values.append(
                    (f, np.full(f.number_output_features, np.nan)))
                progress_callback(1 / float(self.num_features))
            frame = update_feature_columns(feature_values, frame)
        else:
            relationship_path = test_feature.relationship_path

            groupby_var = get_relationship_variable_id(relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child entity included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.has_no_observations():
                    time_first = time_last - use_previous
                    ti = child_entity.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.get_value('o')

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_var,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multivariable and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):

                    variable_id = f.base_features[0].get_name()
                    if variable_id not in to_agg:
                        to_agg[variable_id] = []
                    if isinstance(base_frame, dd.DataFrame):
                        func = f.get_dask_aggregation()
                    else:
                        func = f.get_function()

                    # for some reason, using the string count is significantly
                    # faster than any method a primitive can return
                    # https://stackoverflow.com/questions/55731149/use-a-function-instead-of-string-in-pandas-groupby-agg
                    if func == pd.Series.count:
                        func = "count"

                    funcname = func
                    if callable(func):
                        # if the same function is being applied to the same
                        # variable twice, wrap it in a partial to avoid
                        # duplicate functions
                        funcname = str(id(func))
                        if u"{}-{}".format(variable_id,
                                           funcname) in agg_rename:
                            func = partial(func)
                            funcname = str(id(func))

                        func.__name__ = funcname

                    if isinstance(func, dd.Aggregation):
                        # TODO: handle aggregation being applied to same variable twice
                        # (see above partial wrapping of functions)
                        funcname = func.__name__

                    to_agg[variable_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename[u"{}-{}".format(variable_id,
                                               funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

                progress_callback(len(to_apply) / float(self.num_features))

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                if isinstance(base_frame, dd.DataFrame):
                    to_merge = base_frame.groupby(groupby_var).agg(to_agg)

                else:
                    to_merge = base_frame.groupby(base_frame[groupby_var],
                                                  observed=True,
                                                  sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns.ravel()
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                if isinstance(frame, dd.DataFrame):
                    frame = frame.merge(to_merge,
                                        left_on=parent_merge_var,
                                        right_index=True,
                                        how='left')
                else:
                    frame = pd.merge(left=frame,
                                     right=to_merge,
                                     left_index=True,
                                     right_index=True,
                                     how='left')

                # determine number of features that were just merged
                progress_callback(
                    len(to_merge.columns) / float(self.num_features))

        # Handle default values
        fillna_dict = {}
        for f in features:
            feature_defaults = {
                name: f.default_value
                for name in f.get_feature_names()
            }
            fillna_dict.update(feature_defaults)

        frame = frame.fillna(fillna_dict)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (f.number_output_features == 1
                    and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
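
The aggregation step above names intermediate columns as "{variable}-{function}" and then renames them to the real feature names. A standalone pandas sketch of that pattern (toy column and feature names):

import pandas as pd

# Group child rows by a parent id, aggregate one column two ways, then flatten
# the MultiIndex columns into '{variable}-{function}' keys and rename them.
df = pd.DataFrame({"parent_id": [1, 1, 2], "amount": [10, 20, 5]})
to_agg = {"amount": ["sum", "count"]}
agg_rename = {"amount-sum": "SUM(amount)", "amount-count": "COUNT(amount)"}
out = df.groupby("parent_id").agg(to_agg)
out.columns = [agg_rename["-".join(col)] for col in out.columns]
print(out)
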
示例#30
0
文件: scale.py 项目: has2k1/mizani
    def train(cls, new_data, old=None, drop=False, na_rm=False):
        """
        Train a discrete scale

        Parameters
        ----------
        new_data : array_like
            New values
        old : array_like
            Old range. List of values known to the scale.
        drop : bool
            Whether to drop (not include) unused categories
        na_rm : bool
            If ``True``, remove missing values. Missing values
            are either ``NaN`` or ``None``.

        Returns
        -------
        out : list
            Values covered by the scale
        """
        if not len(new_data):
            return old

        if old is None:
            old = []

        # Get the missing values (NaN & Nones) locations and remove them
        nan_bool_idx = pd.isnull(new_data)
        has_na = np.any(nan_bool_idx)
        if not hasattr(new_data, 'dtype'):
            new_data = np.asarray(new_data)
        new_data = new_data[~nan_bool_idx]

        if new_data.dtype.kind not in DISCRETE_KINDS:
            raise TypeError(
                "Continuous value supplied to discrete scale")

        # Train i.e. get the new values
        if pdtypes.is_categorical_dtype(new_data):
            try:
                new = list(new_data.cat.categories)  # series
            except AttributeError:
                new = list(new_data.categories)      # plain categorical
            if drop:
                present = set(new_data.drop_duplicates())
                new = [i for i in new if i in present]
        else:
            try:
                new = np.unique(new_data)
                new.sort()
            except TypeError:
                # new_data probably has nans and other types
                new = list(set(new_data))
                new = multitype_sort(new)

        # Add nan if required
        if has_na and not na_rm:
            new = np.hstack([new, np.nan])

        # update old
        old_set = set(old)
        return list(old) + [i for i in new if (i not in old_set)]
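
A standalone sketch of what the categorical branch of `train` amounts to: the scale's range grows by whichever categories are not already in `old` (toy values, not mizani's public API):

import pandas as pd

old = ["a", "b"]
new_data = pd.Series(["b", "c", "a"], dtype="category")
new = list(new_data.cat.categories)                        # ['a', 'b', 'c']
old_set = set(old)
trained = list(old) + [i for i in new if i not in old_set]
print(trained)                                             # ['a', 'b', 'c']
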
示例#31
0
    def infer_variable_types(self, variable_types, time_index, secondary_time_index):
        '''Infer variable types from dataframe

        Args:
            variable_types (dict[str -> dict[str -> type]]) : An entity's
                variable_types dict maps string variable ids to types (:class:`.Variable`)
                or (type, kwargs) to pass keyword arguments to the Variable.
            time_index (str or None): Name of time_index column
            secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
                that each map to a list of columns that depend on that secondary time
        '''
        link_relationships = [r for r in self.entityset.relationships
                              if r.parent_entity.id == self.id or
                              r.child_entity.id == self.id]

        link_vars = [v.id for rel in link_relationships
                     for v in [rel.parent_variable, rel.child_variable]
                     if v.entity.id == self.id]

        # TODO: set pk and pk types here
        inferred_types = {}
        df = self.df
        vids_to_assume_datetime = [time_index]
        if len(list(secondary_time_index.keys())):
            vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
        inferred_type = vtypes.Unknown
        for variable in df.columns:
            if variable in variable_types:
                continue

            elif variable in vids_to_assume_datetime:
                if col_is_datetime(df[variable]):
                    inferred_type = vtypes.Datetime
                else:
                    inferred_type = vtypes.Numeric

            elif df[variable].dtype == "object":
                if variable in link_vars:
                    inferred_type = vtypes.Categorical
                elif len(df[variable]):
                    if col_is_datetime(df[variable]):
                        inferred_type = vtypes.Datetime
                    else:
                        # heuristics to predict whether this is something other than categorical
                        sample = df[variable].sample(min(10000, df[variable].nunique()))
                        avg_length = sample.str.len().mean()
                        if avg_length > 50:
                            inferred_type = vtypes.Text
                        else:
                            inferred_type = vtypes.Categorical

            elif df[variable].dtype == "bool":
                inferred_type = vtypes.Boolean

            elif pdtypes.is_categorical_dtype(df[variable].dtype):
                inferred_type = vtypes.Categorical

            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime

            elif variable in link_vars:
                inferred_type = vtypes.Ordinal

            elif len(df[variable]):
                sample = df[variable] \
                    .sample(min(10000, df[variable].nunique(dropna=False)))

                unique = sample.unique()
                percent_unique = sample.size / len(unique)

                if percent_unique < .05:
                    inferred_type = vtypes.Categorical
                else:
                    inferred_type = vtypes.Numeric

            inferred_types[variable] = inferred_type

        return inferred_types
示例#32
0
    def query_by_values(self, instance_vals, variable_id=None, columns=None,
                        time_last=None, training_window=None, include_cutoff_time=True):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.TimeStamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Window defining how much time before the cutoff time data
                can be used when calculating features. If None, all data before cutoff time is used.
            include_cutoff_time (bool):
                If True, data at cutoff time are included in calculating features

        Returns:
            pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
        """
        if not variable_id:
            variable_id = self.index

        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)

        if training_window is not None:
            assert training_window.has_no_observations(), "Training window cannot be in observations"

        if instance_vals is None:
            df = self.df.copy()

        elif isinstance(instance_vals, pd.Series) and instance_vals.empty:
            df = self.df.head(0)

        else:
            if is_instance(instance_vals, (dd, ks), 'Series'):
                df = self.df.merge(instance_vals.to_frame(), how="inner", on=variable_id)
            elif isinstance(instance_vals, pd.Series) and is_instance(self.df, ks, 'DataFrame'):
                df = self.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id)
            else:
                df = self.df[self.df[variable_id].isin(instance_vals)]

            if isinstance(self.df, pd.DataFrame):
                df = df.set_index(self.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(df=df,
                               time_last=time_last,
                               training_window=training_window,
                               include_cutoff_time=include_cutoff_time)

        if columns is not None:
            df = df[columns]

        return df
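
The category-preserving workaround above can be reproduced standalone: after boolean filtering, re-cast the column with the original CategoricalDtype so the filtered frame keeps exactly the same categories as the source frame:

import pandas as pd

full = pd.DataFrame({"color": pd.Categorical(["red", "blue", "red", "green"])})
subset = full[full["color"].isin(["red", "blue"])]
# Re-apply the original dtype so downstream merges/groupbys see identical categories.
original_dtype = pd.api.types.CategoricalDtype(categories=full["color"].cat.categories)
subset = subset.assign(color=subset["color"].astype(original_dtype))
print(subset["color"].cat.categories.tolist())  # ['blue', 'green', 'red']
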
示例#33
0
def scatter(adata,
            x=None,
            y=None,
            color='grey',
            use_raw=True,
            sort_order=True,
            alpha=None,
            basis=None,
            groups=None,
            components=None,
            projection='2d',
            legend_loc='right margin',
            legend_fontsize=None,
            legend_fontweight=None,
            color_map=None,
            palette=None,
            right_margin=None,
            left_margin=None,
            size=None,
            title=None,
            show=None,
            save=None,
            ax=None):
    """Scatter plot.

    Color with annotation of observations (`.obs`) or expression of genes
    (`.var_names`).

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        Annotated data matrix.
    x : `str` or `None`
        x coordinate.
    y : `str` or `None`
        y coordinate.
    color : string or list of strings, optional (default: `None`)
        Keys for observation/cell annotation `[\'ann1\', \'ann2\']`.
    use_raw : `bool`, optional (default: `True`)
        Use `raw` attribute of `adata` if present.
    sort_order : `bool`, optional (default: `True`)
        For continuous annotations used as color parameter, plot data points
        with higher values on top of others.
    basis : {'pca', 'tsne', 'umap', 'diffmap', 'draw_graph_fr', etc.}
        String that denotes a plotting tool that computed coordinates.
    groups : str, optional (default: all groups in color)
        Restrict categories in the observation annotation to a subset.
    components : `str` or list of `str`, optional (default: '1,2')
         String of the form '1,2' or ['1,2', '2,3'].
    projection : {'2d', '3d'}, optional (default: '2d')
         Projection of plot.
    legend_loc : `str`, optional (default: 'right margin')
         Location of legend, either 'on data', 'right margin' or valid keywords
         for `matplotlib.pyplot.legend
         <https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html>`_.
    legend_fontsize : `int` (default: `None`)
         Legend font size.
    legend_fontweight : `int` (default: `None`)
         Legend font weight.
    color_map : `str` (default: 'RdBu_r')
         String denoting matplotlib color map for continuous coloring.
    palette : list of `str` (default: `None`)
         Colors to use for plotting groups (categorical annotation).
    right_margin : `float` (default: 0.3)
         Adjust how far the plotting panel extends to the right.
    size : float (default: None)
         Point size. Observation-number dependent by default.
    title : `str` or list of `str`, optional (default: `None`)
         Provide title for panels either as `[\'title1\', ...]`.
    show : `bool`, optional (default: `None`)
         Show the plot.
    save : `bool` or `str`, optional (default: `None`)
        If `True` or a `str`, save the figure. A string is appended to the
        default filename. Infer the filetype if ending on \{'.pdf', '.png', '.svg'\}.
    ax : `matplotlib.Axes`
         A matplotlib axes object.

    Returns
    -------
    A list of `matplotlib.Axis` objects.
    """
    sanitize_anndata(adata)
    if legend_loc not in VALID_LEGENDLOCS:
        raise ValueError('Invalid `legend_loc`, need to be one of: {}.'.format(
            VALID_LEGENDLOCS))
    if components is None:
        components = '1,2' if '2d' in projection else '1,2,3'
    if isinstance(components, str): components = components.split(',')
    components = np.array(components).astype(int) - 1
    title = None if title is None else title.split(',') if isinstance(
        title, str) else title
    keys = ['grey'] if color is None else color.split(',') if isinstance(
        color, str) else color
    groups = None if groups is None else groups.split(',') if isinstance(
        groups, str) else groups
    highlights = adata.uns['highlights'] if 'highlights' in adata.uns else []
    if basis is not None:
        try:
            Y = adata.obsm['X_' + basis][:, components]
        except KeyError:
            raise KeyError(
                'compute coordinates using visualization tool {} first'.format(
                    basis))
    elif x is not None and y is not None:
        x_arr = adata._get_obs_array(x)
        y_arr = adata._get_obs_array(y)
        Y = np.c_[x_arr[:, None], y_arr[:, None]]
    else:
        raise ValueError(
            'Either provide keys for a `basis` or for `x` and `y`.')

    if size is None:
        n = Y.shape[0]
        size = 120000 / n

    if legend_loc == 'on data' and legend_fontsize is None:
        legend_fontsize = rcParams['legend.fontsize']
    elif legend_fontsize is None:
        legend_fontsize = rcParams['legend.fontsize']

    palette_was_none = False
    if palette is None: palette_was_none = True
    if isinstance(palette, list):
        if not is_color_like(palette[0]):
            palettes = palette
        else:
            palettes = [palette]
    else:
        palettes = [palette for i in range(len(keys))]
    for i, palette in enumerate(palettes):
        palettes[i] = utils.default_palette(palette)

    if basis is not None:
        component_name = ('DC' if basis == 'diffmap' else
                          basis.replace('draw_graph_', '').upper() if
                          'draw_graph' in basis else 'tSNE' if basis == 'tsne'
                          else 'UMAP' if basis == 'umap' else 'PC' if basis ==
                          'pca' else 'Spring' if basis == 'spring' else None)
    else:
        component_name = None
    axis_labels = (x, y) if component_name is None else None
    show_ticks = True if component_name is None else False

    # the actual color ids, e.g. 'grey' or '#109482'
    color_ids = [None if not is_color_like(key) else key for key in keys]
    categoricals = []
    colorbars = []
    for ikey, key in enumerate(keys):
        if color_ids[ikey] is not None:
            c = color_ids[ikey]
            continuous = True
            categorical = False
            colorbars.append(False)
        else:
            c = 'white'
            categorical = False
            continuous = False
            # test whether we have categorical or continuous annotation
            if key in adata.obs_keys():
                if is_categorical_dtype(adata.obs[key]):
                    categorical = True
                else:
                    continuous = True
                    c = adata.obs[key]
            # coloring according to gene expression
            elif (use_raw and adata.raw is not None
                  and key in adata.raw.var_names):
                c = adata.raw[:, key].X
                continuous = True
            elif key in adata.var_names:
                c = adata[:, key].X
                continuous = True
            else:
                raise ValueError(
                    '"' + key + '" is invalid!' +
                    ' specify valid observation annotation, one of ' +
                    str(adata.obs_keys()) + ' or a gene name ' +
                    str(adata.var_names))
            colorbars.append(True if continuous else False)
        if categorical: categoricals.append(ikey)
        color_ids[ikey] = c

    if right_margin is None and len(categoricals) > 0:
        if legend_loc == 'right margin': right_margin = 0.5
    if title is None and keys[0] is not None:
        title = [
            key.replace('_', ' ') if not is_color_like(key) else ''
            for key in keys
        ]

    axs = scatter_base(Y,
                       title=title,
                       alpha=alpha,
                       component_name=component_name,
                       axis_labels=axis_labels,
                       component_indexnames=components + 1,
                       projection=projection,
                       colors=color_ids,
                       highlights=highlights,
                       colorbars=colorbars,
                       right_margin=right_margin,
                       left_margin=left_margin,
                       sizes=[size for c in keys],
                       color_map=color_map,
                       show_ticks=show_ticks,
                       ax=ax)

    def add_centroid(centroids, name, Y, mask):
        Y_mask = Y[mask]
        if Y_mask.shape[0] == 0: return
        median = np.median(Y_mask, axis=0)
        i = np.argmin(np.sum(np.abs(Y_mask - median), axis=1))
        centroids[name] = Y_mask[i]

    for i, ikey in enumerate(categoricals):
        palette = palettes[i]
        key = keys[ikey]
        if (not key + '_colors' in adata.uns or not palette_was_none
                or len(adata.obs[key].cat.categories) != len(
                    adata.uns[key + '_colors'])):
            utils.add_colors_for_categorical_sample_annotation(
                adata, key, palette)
        # actually plot the groups
        mask_remaining = np.ones(Y.shape[0], dtype=bool)
        centroids = {}
        if groups is None:
            for iname, name in enumerate(adata.obs[key].cat.categories):
                if name not in settings.categories_to_ignore:
                    mask = scatter_group(axs[ikey],
                                         key,
                                         iname,
                                         adata,
                                         Y,
                                         projection,
                                         size=size,
                                         alpha=alpha)
                    mask_remaining[mask] = False
                    if legend_loc == 'on data':
                        add_centroid(centroids, name, Y, mask)
        else:
            for name in groups:
                if name not in set(adata.obs[key].cat.categories):
                    raise ValueError('"' + name + '" is invalid!' +
                                     ' specify valid name, one of ' +
                                     str(adata.obs[key].cat.categories))
                else:
                    iname = np.flatnonzero(
                        adata.obs[key].cat.categories.values == name)[0]
                    mask = scatter_group(axs[ikey],
                                         key,
                                         iname,
                                         adata,
                                         Y,
                                         projection,
                                         size=size,
                                         alpha=alpha)
                    if legend_loc == 'on data':
                        add_centroid(centroids, name, Y, mask)
                    mask_remaining[mask] = False
        if mask_remaining.sum() > 0:
            data = [Y[mask_remaining, 0], Y[mask_remaining, 1]]
            if projection == '3d': data.append(Y[mask_remaining, 2])
            axs[ikey].scatter(*data,
                              marker='.',
                              c='grey',
                              s=size,
                              edgecolors='none',
                              zorder=-1)
        legend = None
        if legend_loc == 'on data':
            for name, pos in centroids.items():
                axs[ikey].text(pos[0],
                               pos[1],
                               name,
                               weight=legend_fontweight,
                               verticalalignment='center',
                               horizontalalignment='center',
                               fontsize=legend_fontsize)
        elif legend_loc == 'right margin':
            legend = axs[ikey].legend(
                frameon=False,
                loc='center left',
                bbox_to_anchor=(1, 0.5),
                ncol=(1 if len(adata.obs[key].cat.categories) <= 14 else
                      2 if len(adata.obs[key].cat.categories) <= 30 else 3),
                fontsize=legend_fontsize)
        elif legend_loc != 'none':
            legend = axs[ikey].legend(frameon=False,
                                      loc=legend_loc,
                                      fontsize=legend_fontsize)
        if legend is not None:
            for handle in legend.legendHandles:
                handle.set_sizes([300.0])
    utils.savefig_or_show('scatter' if basis is None else basis,
                          show=show,
                          save=save)
    if show == False: return axs
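A note on the centroid selection at the top of this example: the group label for `legend_loc='on data'` is placed on the member point closest, in L1 distance, to the coordinate-wise median, so it always sits on an actual data point. A minimal NumPy sketch of that selection (the array values are illustrative):

import numpy as np

Y_mask = np.array([[0.0, 0.0], [1.0, 1.0], [10.0, 10.0]])   # coordinates of one group
median = np.median(Y_mask, axis=0)                           # per-axis median: [1., 1.]
i = np.argmin(np.sum(np.abs(Y_mask - median), axis=1))       # closest point in L1 distance
print(Y_mask[i])                                             # [1. 1.] -- an existing point, not the mean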
Example #34
0
    def contains_op(cls, series: pd.Series) -> bool:
        if not pdt.is_categorical_dtype(series) and pdt.is_bool_dtype(series):
            return True

        return False
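A minimal usage sketch of the check above; the wrapper name `looks_boolean` is illustrative and not part of the source:

import pandas as pd
from pandas.api import types as pdt

def looks_boolean(series: pd.Series) -> bool:
    # same condition as contains_op: a plain boolean dtype that is not categorical
    return not pdt.is_categorical_dtype(series) and pdt.is_bool_dtype(series)

print(looks_boolean(pd.Series([True, False])))                 # True
print(looks_boolean(pd.Series(["a", "b"], dtype="category")))  # False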
Example #35
0
File: data.py Project: hetong007/xgboost
def _transform_pandas_df(
    data: DataFrame,
    enable_categorical: bool,
    feature_names: FeatNamesT = None,
    feature_types: Optional[List[str]] = None,
    meta: Optional[str] = None,
    meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
    import pandas as pd
    from pandas.api.types import is_sparse, is_categorical_dtype

    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
               (is_categorical_dtype(dtype) and enable_categorical)
               for dtype in data.dtypes):
        _invalid_dataframe_dtype(data)

    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
            feature_names = [
                " ".join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    # handle feature types
    if feature_types is None and meta is None:
        feature_types = []
        for i, dtype in enumerate(data.dtypes):
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    # handle category codes.
    transformed = pd.DataFrame()
    # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented
    if enable_categorical and any(
            is_categorical_dtype(dtype) for dtype in data.dtypes):
        for i, dtype in enumerate(data.dtypes):
            if is_categorical_dtype(dtype):
                # pandas uses -1 as default missing value for categorical data
                transformed[data.columns[i]] = (
                    data[data.columns[i]].cat.codes.astype(np.float32).replace(
                        -1.0, np.NaN))
            else:
                transformed[data.columns[i]] = data[data.columns[i]]
    else:
        transformed = data

    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    dtype = meta_type if meta_type else np.float32
    arr = transformed.values
    if meta_type:
        arr = arr.astype(meta_type)
    return arr, feature_names, feature_types
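A pandas-only sketch of the category-code handling above, with the categories set explicitly for a deterministic result: codes are cast to float32 and the pandas missing-value code -1 is mapped to NaN, matching the replacement in `_transform_pandas_df`:

import numpy as np
import pandas as pd

color = pd.Series(pd.Categorical(["red", "green", None, "red"],
                                 categories=["red", "green"]))
codes = color.cat.codes.astype(np.float32).replace(-1.0, np.nan)
print(codes.tolist())  # [0.0, 1.0, nan, 0.0]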
Example #36
0
    def query_by_values(self,
                        instance_vals,
                        variable_id=None,
                        columns=None,
                        time_last=None,
                        training_window=None):
        """Query instances that have variable with given value

        Args:
            instance_vals (pd.DataFrame, pd.Series, list[str] or str) :
                Instance(s) to match.
            variable_id (str) : Variable to query on. If None, query on index.
            columns (list[str]) : Columns to return. Return all columns if None.
            time_last (pd.Timestamp) : Query data up to and including this
                time. Only applies if entity has a time index.
            training_window (Timedelta, optional):
                Data older than time_last by more than this will be ignored

        Returns:
            pd.DataFrame : instances that match constraints
        """
        instance_vals = self._vals_to_series(instance_vals, variable_id)

        training_window = _check_timedelta(training_window)
        if training_window is not None:
            assert (isinstance(training_window, Timedelta) and
                    training_window.is_absolute()),\
                "training window must be an absolute Timedelta"

        if instance_vals is None:
            df = self.df.copy()

        elif instance_vals.shape[0] == 0:
            df = self.df.head(0)

        elif variable_id is None or variable_id == self.index:
            df = self.df.reindex(instance_vals)
            df.dropna(subset=[self.index], inplace=True)

        else:
            df = self.df.merge(instance_vals.to_frame(variable_id),
                               how="inner",
                               on=variable_id)
            df = df.set_index(self.index, drop=False)

            # ensure filtered df has same categories as original
            # workaround for issue below
            # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
            if pdtypes.is_categorical_dtype(self.df[variable_id]):
                categories = pd.api.types.CategoricalDtype(
                    categories=self.df[variable_id].cat.categories)
                df[variable_id] = df[variable_id].astype(categories)

        df = self._handle_time(df=df,
                               time_last=time_last,
                               training_window=training_window)

        if columns is not None:
            df = df[columns]

        return df
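A small sketch of the categorical workaround used above: after the merge, the filtered column is cast back to a dtype carrying the full category set of the original column. The series below are illustrative stand-ins:

import pandas as pd

original = pd.Series(["a", "b", "c", "a"], dtype="category")
subset = pd.Series(["a", "a"], dtype="category")  # e.g. the result of the merge

# re-apply the original categories, mirroring the workaround above
full_dtype = pd.api.types.CategoricalDtype(categories=original.cat.categories)
subset = subset.astype(full_dtype)
print(subset.cat.categories.tolist())  # ['a', 'b', 'c']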
Example #37
0
def regress_out(
    adata: AnnData,
    keys: Union[str, Sequence[str]],
    n_jobs: Optional[int] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Regress out (mostly) unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]. Note that this function tends to overcorrect
    in certain circumstances as described in :issue:`526`.

    Parameters
    ----------
    adata
        The annotated data matrix.
    keys
        Keys for observation annotations to regress on.
    n_jobs
        Number of jobs for parallel computation.
        `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
    copy
        Determines whether a copy of `adata` is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data matrix.
    """
    start = logg.info(f'regressing out {keys}')
    if issparse(adata.X):
        logg.info('    sparse input is densified and may '
                  'lead to high memory use')
    adata = adata.copy() if copy else adata

    sanitize_anndata(adata)

    # TODO: This should throw an implicit modification warning
    if adata.is_view:
        adata._init_as_actual(adata.copy())

    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # regress on a single categorical variable
    variable_is_categorical = False
    if keys[0] in adata.obs_keys() and is_categorical_dtype(
            adata.obs[keys[0]]):
        if len(keys) > 1:
            raise ValueError('If providing categorical variable, '
                             'only a single one is allowed. For this one '
                             'we regress on the mean for each category.')
        logg.debug('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    tasks = []
    # split the adata.X matrix by columns in chunks of size n_chunk
    # (the last chunk could be of smaller size than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk, e.g. (adata.X[:, 0:100]), and
        # the regressors. This data will be passed to each of the jobs.
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append(tuple((data_chunk, regres, variable_is_categorical)))

    from joblib import Parallel, delayed

    # TODO: figure out how to test that this doesn't oversubscribe resources
    res = Parallel(n_jobs=n_jobs)(delayed(_regress_out_chunk)(task)
                                  for task in tasks)

    # res is a list of vectors (each corresponding to a regressed gene column).
    # The transpose is needed to get the matrix in the shape needed
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info('    finished', time=start)
    return adata if copy else None
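A minimal usage sketch, assuming scanpy is installed and that `.obs` contains numeric columns named `n_counts` and `percent_mito` (the case for the bundled `pbmc68k_reduced` dataset on recent scanpy versions):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# regress on two numeric covariates; pass a single categorical key instead
# to regress on per-category means as described above
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])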
Example #38
0
File: methods.py Project: fortizc/dask
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                    for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)
            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                out = pd.concat(dfs3, join=join)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
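The key behavior this `concat` adds over plain `pandas.concat` is unioning mismatched categories rather than falling back to object dtype. A small pandas sketch of that union:

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Series(["x", "y"], dtype="category")
b = pd.Series(["y", "z"], dtype="category")

merged = union_categoricals([a, b])   # keeps a categorical dtype
print(merged.categories.tolist())     # ['x', 'y', 'z'] (order of appearance)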
Example #39
0
def _paga_graph(adata,
                ax,
                layout=None,
                layout_kwds={},
                init_pos=None,
                solid_edges=None,
                dashed_edges=None,
                transitions=None,
                threshold=None,
                threshold_arrows=None,
                threshold_solid=None,
                threshold_dashed=None,
                root=0,
                colors=None,
                labels=None,
                fontsize=None,
                fontweight=None,
                text_kwds=None,
                node_size_scale=1,
                node_size_power=0.5,
                edge_width_scale=1,
                title=None,
                pos=None,
                cmap=None,
                frameon=True,
                min_edge_width=None,
                max_edge_width=None,
                export_to_gexf=False,
                cax=None,
                colorbar=None,
                use_raw=True,
                cb_kwds={},
                single_component=False,
                arrowsize=30,
                random_state=0):
    node_labels = labels  # rename for clarity
    if (node_labels is not None and isinstance(node_labels, str)
            and node_labels != adata.uns['paga']['groups']):
        raise ValueError(
            'Provide a list of group labels for the PAGA groups {}, not {}.'.
            format(adata.uns['paga']['groups'], node_labels))
    groups_key = adata.uns['paga']['groups']
    if node_labels is None:
        node_labels = adata.obs[groups_key].cat.categories

    if (colors is None or colors == groups_key) and groups_key is not None:
        if (groups_key + '_colors' not in adata.uns
                or len(adata.obs[groups_key].cat.categories) != len(
                    adata.uns[groups_key + '_colors'])):
            utils.add_colors_for_categorical_sample_annotation(
                adata, groups_key)
        colors = adata.uns[groups_key + '_colors']
        for iname, name in enumerate(adata.obs[groups_key].cat.categories):
            if name in settings.categories_to_ignore: colors[iname] = 'grey'

    if isinstance(root, str):
        if root in node_labels:
            root = list(node_labels).index(root)
        else:
            raise ValueError(
                'If `root` is a string, it needs to be one of {} not \'{}\'.'.
                format(node_labels.tolist(), root))
    if isinstance(root, list) and root[0] in node_labels:
        root = [list(node_labels).index(r) for r in root]

    # define the objects
    adjacency_solid = adata.uns['paga'][solid_edges].copy()
    # set the the thresholds, either explicitly
    if threshold is not None:
        threshold_solid = threshold
        threshold_dashed = threshold
    # or to a default value
    else:
        if threshold_solid is None:
            threshold_solid = 0.01  # default threshold
        if threshold_dashed is None:
            threshold_dashed = 0.01  # default threshold
    if threshold_solid > 0:
        adjacency_solid.data[adjacency_solid.data < threshold_solid] = 0
        adjacency_solid.eliminate_zeros()
    nx_g_solid = nx.Graph(adjacency_solid)
    if dashed_edges is not None:
        adjacency_dashed = adata.uns['paga'][dashed_edges].copy()
        if threshold_dashed > 0:
            adjacency_dashed.data[adjacency_dashed.data < threshold_dashed] = 0
            adjacency_dashed.eliminate_zeros()
        nx_g_dashed = nx.Graph(adjacency_dashed)

    # uniform color
    if isinstance(colors, str) and is_color_like(colors):
        colors = [colors for c in range(len(node_labels))]

    # color degree of the graph
    if isinstance(colors, str) and colors.startswith('degree'):
        # see also tools.paga.paga_degrees
        if colors == 'degree_dashed':
            colors = [d for _, d in nx_g_dashed.degree(weight='weight')]
        elif colors == 'degree_solid':
            colors = [d for _, d in nx_g_solid.degree(weight='weight')]
        else:
            raise ValueError(
                '`degree` must be either "degree_dashed" or "degree_solid".')
        colors = (np.array(colors) - np.min(colors)) / (np.max(colors) -
                                                        np.min(colors))

    # plot gene expression
    var_names = adata.var_names if adata.raw is None else adata.raw.var_names
    if isinstance(colors, str) and colors in var_names:
        x_color = []
        cats = adata.obs[groups_key].cat.categories
        for icat, cat in enumerate(cats):
            subset = (cat == adata.obs[groups_key]).values
            if adata.raw is not None and use_raw:
                adata_gene = adata.raw[:, colors]
            else:
                adata_gene = adata[:, colors]
            x_color.append(np.mean(adata_gene.X[subset]))
        colors = x_color

    # plot continuous annotation
    if (isinstance(colors, str) and colors in adata.obs
            and not is_categorical_dtype(adata.obs[colors])):
        x_color = []
        cats = adata.obs[groups_key].cat.categories
        for icat, cat in enumerate(cats):
            subset = (cat == adata.obs[groups_key]).values
            x_color.append(adata.obs.loc[subset, colors].mean())
        colors = x_color

    # plot categorical annotation
    if (isinstance(colors, str) and colors in adata.obs
            and is_categorical_dtype(adata.obs[colors])):
        from ... import utils as sc_utils
        asso_names, asso_matrix = sc_utils.compute_association_matrix_of_groups(
            adata,
            prediction=groups_key,
            reference=colors,
            normalization='reference')
        utils.add_colors_for_categorical_sample_annotation(adata, colors)
        asso_colors = sc_utils.get_associated_colors_of_groups(
            adata.uns[colors + '_colors'], asso_matrix)
        colors = asso_colors

    if len(colors) < len(node_labels):
        print(node_labels, colors)
        raise ValueError(
            '`color` list needs to be at least as long as `groups`/`node_labels` list.'
        )

    # count number of connected components
    n_components, labels = scipy.sparse.csgraph.connected_components(
        adjacency_solid)
    if n_components > 1 and not single_component:
        logg.msg(
            'Graph has more than a single connected component. '
            'To restrict to this component, pass `single_component=True`.')
    if n_components > 1 and single_component:
        component_sizes = np.bincount(labels)
        largest_component = np.where(
            component_sizes == component_sizes.max())[0][0]
        adjacency_solid = adjacency_solid.tocsr()[labels ==
                                                  largest_component, :]
        adjacency_solid = adjacency_solid.tocsc()[:,
                                                  labels == largest_component]
        colors = np.array(colors)[labels == largest_component]
        node_labels = np.array(node_labels)[labels == largest_component]
        logg.info(
            'Restricting graph to largest connected component by dropping categories\n'
            '{}'.format(adata.obs[groups_key].cat.categories[
                labels != largest_component].tolist()))
        nx_g_solid = nx.Graph(adjacency_solid)
        if dashed_edges is not None:
            raise ValueError(
                '`single_component` only if `dashed_edges` is `None`.')

    # node positions from adjacency_solid
    if pos is None:
        if layout is None:
            layout = 'fr'
        if layout == 'fa':
            try:
                from fa2 import ForceAtlas2
            except:
                logg.warn(
                    'Package \'fa2\' is not installed, falling back to layout \'fr\'. '
                    'To use the faster and better ForceAtlas2 layout, '
                    'install package \'fa2\' (`pip install fa2`).')
                layout = 'fr'
        if layout == 'fa':
            np.random.seed(random_state)
            if init_pos is None:
                init_coords = np.random.random((adjacency_solid.shape[0], 2))
            else:
                init_coords = init_pos.copy()
            forceatlas2 = ForceAtlas2(
                # Behavior alternatives
                outboundAttractionDistribution=False,  # Dissuade hubs
                linLogMode=False,  # NOT IMPLEMENTED
                adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                edgeWeightInfluence=1.0,
                # Performance
                jitterTolerance=1.0,  # Tolerance
                barnesHutOptimize=True,
                barnesHutTheta=1.2,
                multiThreaded=False,  # NOT IMPLEMENTED
                # Tuning
                scalingRatio=2.0,
                strongGravityMode=False,
                gravity=1.0,
                # Log
                verbose=False)
            if 'maxiter' in layout_kwds:
                iterations = layout_kwds['maxiter']
            elif 'iterations' in layout_kwds:
                iterations = layout_kwds['iterations']
            else:
                iterations = 500
            pos_list = forceatlas2.forceatlas2(adjacency_solid,
                                               pos=init_coords,
                                               iterations=iterations)
            pos = {n: [p[0], -p[1]] for n, p in enumerate(pos_list)}
        elif layout == 'eq_tree':
            nx_g_tree = nx_g_solid
            if solid_edges == 'connectivities':
                adj_tree = adata.uns['paga']['connectivities_tree']
                nx_g_tree = nx.Graph(adj_tree)
            pos = utils.hierarchy_pos(nx_g_tree, root)
            if len(pos) < adjacency_solid.shape[0]:
                raise ValueError('This is a forest and not a single tree. '
                                 'Try another `layout`, e.g., {\'fr\'}.')
        else:
            # igraph layouts
            from ... import utils as sc_utils
            g = sc_utils.get_igraph_from_adjacency(adjacency_solid)
            if 'rt' in layout:
                g_tree = g
                if solid_edges == 'connectivities':
                    adj_tree = adata.uns['paga']['connectivities_tree']
                    g_tree = sc_utils.get_igraph_from_adjacency(adj_tree)
                pos_list = g_tree.layout(
                    layout,
                    root=root if isinstance(root, list) else [root]).coords
            elif layout == 'circle':
                pos_list = g.layout(layout).coords
            else:
                # I don't know why this is necessary
                np.random.seed(random_state)
                if init_pos is None:
                    init_coords = np.random.random(
                        (adjacency_solid.shape[0], 2)).tolist()
                else:
                    init_pos = init_pos.copy()
                    # this is a super-weird hack that is necessary as igraphs layout function
                    # seems to do some strange stuff, here
                    init_pos[:, 1] *= -1
                    init_coords = init_pos.tolist()
                try:
                    pos_list = g.layout(layout,
                                        seed=init_coords,
                                        weights='weight',
                                        **layout_kwds).coords
                except:  # hack for excepting attribute error for empty graphs...
                    pos_list = g.layout(layout,
                                        seed=init_coords,
                                        **layout_kwds).coords
            pos = {n: [p[0], -p[1]] for n, p in enumerate(pos_list)}
        pos_array = np.array([pos[n] for count, n in enumerate(nx_g_solid)])
    else:
        if isinstance(pos, str):
            if not pos.endswith('.gdf'):
                raise ValueError(
                    'Currently only supporting reading positions from .gdf files. '
                    'Consider generating them using, for instance, Gephi.')
            s = ''  # read the node definition from the file
            with open(pos) as f:
                f.readline()
                for line in f:
                    if line.startswith('edgedef>'):
                        break
                    s += line
            from io import StringIO
            df = pd.read_csv(StringIO(s), header=-1)
            pos = df[[4, 5]].values
        pos_array = pos
        # convert to dictionary
        pos = {n: [p[0], p[1]] for n, p in enumerate(pos)}

    if len(pos) == 1: pos[0] = (0.5, 0.5)

    # edge widths
    base_edge_width = edge_width_scale * 5 * rcParams['lines.linewidth']

    # draw dashed edges
    if dashed_edges is not None:
        widths = [x[-1]['weight'] for x in nx_g_dashed.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if max_edge_width is not None:
            widths = np.clip(widths, None, max_edge_width)
        nx.draw_networkx_edges(nx_g_dashed,
                               pos,
                               ax=ax,
                               width=widths,
                               edge_color='grey',
                               style='dashed',
                               alpha=0.5)

    # draw solid edges
    if transitions is None:
        widths = [x[-1]['weight'] for x in nx_g_solid.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        nx.draw_networkx_edges(nx_g_solid,
                               pos,
                               ax=ax,
                               width=widths,
                               edge_color='black')
    # draw directed edges
    else:
        adjacency_transitions = adata.uns['paga'][transitions].copy()
        if threshold_arrows is None:
            threshold_arrows = 0.005
        adjacency_transitions.data[
            adjacency_transitions.data < threshold_arrows] = 0
        adjacency_transitions.eliminate_zeros()
        g_dir = nx.DiGraph(adjacency_transitions.T)
        widths = [x[-1]['weight'] for x in g_dir.edges(data=True)]
        widths = 100 * base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        nx.draw_networkx_edges(g_dir,
                               pos,
                               ax=ax,
                               width=widths,
                               edge_color='black',
                               arrowsize=arrowsize)

    if export_to_gexf:
        if isinstance(colors[0], tuple):
            from matplotlib.colors import rgb2hex
            colors = [rgb2hex(c) for c in colors]
        for count, n in enumerate(nx_g_solid.nodes()):
            nx_g_solid.node[count]['label'] = str(node_labels[count])
            nx_g_solid.node[count]['color'] = str(colors[count])
            nx_g_solid.node[count]['viz'] = {
                'position': {
                    'x': 1000 * pos[count][0],
                    'y': 1000 * pos[count][1],
                    'z': 0
                }
            }
        filename = settings.writedir + 'paga_graph.gexf'
        logg.msg('exporting to {}'.format(filename), v=1)
        if settings.writedir != '' and not os.path.exists(settings.writedir):
            os.makedirs(settings.writedir)
        nx.write_gexf(nx_g_solid, settings.writedir + 'paga_graph.gexf')

    ax.set_frame_on(frameon)
    ax.set_xticks([])
    ax.set_yticks([])

    # groups sizes
    if (groups_key is not None and groups_key + '_sizes' in adata.uns):
        groups_sizes = adata.uns[groups_key + '_sizes']
    else:
        groups_sizes = np.ones(len(node_labels))
    base_scale_scatter = 2000
    base_pie_size = (base_scale_scatter /
                     (np.sqrt(adjacency_solid.shape[0]) + 10) *
                     node_size_scale)
    median_group_size = np.median(groups_sizes)
    groups_sizes = base_pie_size * np.power(groups_sizes / median_group_size,
                                            node_size_power)

    # usual scatter plot
    if not isinstance(colors[0], dict):
        sct = ax.scatter(pos_array[:, 0],
                         pos_array[:, 1],
                         c=colors,
                         edgecolors='face',
                         s=groups_sizes,
                         cmap=cmap)
        if fontsize is None:
            fontsize = rcParams['legend.fontsize']
        for count, group in enumerate(node_labels):
            ax.text(pos_array[count, 0],
                    pos_array[count, 1],
                    group,
                    verticalalignment='center',
                    horizontalalignment='center',
                    size=fontsize,
                    fontweight=fontweight,
                    **text_kwds)
    # else pie chart plot
    else:
        # start with this dummy plot... otherwise strange behavior
        sct = ax.scatter(pos_array[:, 0],
                         pos_array[:, 1],
                         c='white',
                         edgecolors='face',
                         s=groups_sizes,
                         cmap=cmap)
        trans = ax.transData.transform
        bbox = ax.get_position().get_points()
        ax_x_min = bbox[0, 0]
        ax_x_max = bbox[1, 0]
        ax_y_min = bbox[0, 1]
        ax_y_max = bbox[1, 1]
        ax_len_x = ax_x_max - ax_x_min
        ax_len_y = ax_y_max - ax_y_min
        trans2 = ax.transAxes.inverted().transform
        pie_axs = []
        for count, n in enumerate(nx_g_solid.nodes()):
            pie_size = groups_sizes[count] / base_scale_scatter
            x1, y1 = trans(pos[n])  # data coordinates
            xa, ya = trans2((x1, y1))  # axis coordinates
            xa = ax_x_min + (xa - pie_size / 2) * ax_len_x
            ya = ax_y_min + (ya - pie_size / 2) * ax_len_y
            # clip, the fruchterman layout sometimes places below figure
            if ya < 0: ya = 0
            if xa < 0: xa = 0
            pie_axs.append(
                pl.axes([xa, ya, pie_size * ax_len_x, pie_size * ax_len_y],
                        frameon=False))
            pie_axs[count].set_xticks([])
            pie_axs[count].set_yticks([])
            if not isinstance(colors[count], dict):
                raise ValueError(
                    '{} is neither a dict of valid matplotlib colors '
                    'nor a valid matplotlib color.'.format(colors[count]))
            color_single = colors[count].keys()
            fracs = [colors[count][c] for c in color_single]
            if sum(fracs) < 1:
                color_single = list(color_single)
                color_single.append('grey')
                fracs.append(1 - sum(fracs))
            pie_axs[count].pie(fracs, colors=color_single)
        if node_labels is not None:
            for ia, a in enumerate(pie_axs):
                a.text(0.5,
                       0.5,
                       node_labels[ia],
                       verticalalignment='center',
                       horizontalalignment='center',
                       transform=a.transAxes,
                       size=fontsize)
    return pos_array, sct
Example #40
0
def get_numpy_type(dtype):
    if is_categorical_dtype(dtype):
        return 'category'
    else:
        return str(dtype)
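A small usage sketch, assuming `is_categorical_dtype` has been imported (e.g. from `pandas.api.types`):

import pandas as pd

s = pd.Series(["a", "b"], dtype="category")
print(get_numpy_type(s.dtype))                   # 'category'
print(get_numpy_type(pd.Series([1, 2]).dtype))   # 'int64'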
Example #41
0
 def test_columns(self):
     ce = dpp.Categorizer(columns=["A"])
     trn = ce.fit_transform(raw)
     assert is_categorical_dtype(trn["A"])
     assert is_object_dtype(trn["B"])
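The `raw` frame in this test is a fixture; a self-contained sketch of the same check, assuming `dask_ml` is installed:

import pandas as pd
from pandas.api.types import is_categorical_dtype, is_object_dtype
from dask_ml import preprocessing as dpp

raw = pd.DataFrame({"A": ["a", "b", "a"], "B": ["x", "y", "z"]})
trn = dpp.Categorizer(columns=["A"]).fit_transform(raw)
assert is_categorical_dtype(trn["A"])   # only the listed column is converted
assert is_object_dtype(trn["B"])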
Example #42
0
def dendrogram(
    adata: AnnData,
    groupby: str,
    n_pcs: Optional[int] = None,
    use_rep: Optional[str] = None,
    var_names: Optional[Sequence[str]] = None,
    use_raw: Optional[bool] = None,
    cor_method: str = 'pearson',
    linkage_method: str = 'complete',
    optimal_ordering: bool = False,
    key_added: Optional[str] = None,
    inplace: bool = True,
) -> Optional[Dict[str, Any]]:
    """\
    Computes a hierarchical clustering for the given `groupby` categories.

    By default, the PCA representation is used unless `.X`
    has fewer than 50 variables.

    Alternatively, a list of `var_names` (e.g. features) can be given.

    Average values of either `var_names` or components are used
    to compute a correlation matrix.

    The hierarchical clustering can be visualized using
    :func:`quanp.pl.dendrogram` or multiple other visualizations that can
    include a dendrogram: :func:`~quanp.pl.matrixplot`,
    :func:`~quanp.pl.heatmap`, :func:`~quanp.pl.dotplot`,
    and :func:`~quanp.pl.stacked_violin`.

    .. note::
        The computation of the hierarchical clustering is based on predefined
        groups and not per subject. The correlation matrix is computed using
        Pearson correlation by default, but other methods are available.

    Parameters
    ----------
    adata
        Annotated data matrix
    {n_pcs}
    {use_rep}
    var_names
        List of var_names to use for computing the hierarchical clustering.
        If `var_names` is given, then `use_rep` and `n_pcs` are ignored.
    use_raw
        Only when `var_names` is not None.
        Use `raw` attribute of `adata` if present.
    cor_method
        correlation method to use.
        Options are 'pearson', 'kendall', and 'spearman'
    linkage_method
        linkage method to use. See :func:`scipy.cluster.hierarchy.linkage`
        for more information.
    optimal_ordering
        Same as the optimal_ordering argument of :func:`scipy.cluster.hierarchy.linkage`
        which reorders the linkage matrix so that the distance between successive
        leaves is minimal.
    key_added
        By default, the dendrogram information is added to
        `.uns[f'dendrogram_{{groupby}}']`.
        Notice that the `groupby` information is added to the dendrogram.
    inplace
        If `True`, adds dendrogram information to `adata.uns[key_added]`,
        else this function returns the information.

    Returns
    -------
    If `inplace=False`, returns dendrogram information,
    else `adata.uns[key_added]` is updated with it.

    Examples
    --------
    >>> import quanp as qp
    >>> adata = qp.datasets.pbmc68k_reduced()
    >>> qp.tl.dendrogram(adata, groupby='bulk_labels')
    >>> qp.pl.dendrogram(adata)
    >>> markers = ['featureA', 'featureB', 'featureC', 'featureD', 'featureE', 'featureF']
    >>> qp.pl.dotplot(adata, markers, groupby='bulk_labels', dendrogram=True)
    """
    if isinstance(groupby, str):
        # if not a list, turn into a list
        groupby = [groupby]
    for group in groupby:
        if group not in adata.obs_keys():
            raise ValueError(
                'groupby has to be a valid observation. '
                f'Given value: {group}, valid observations: {adata.obs_keys()}'
            )
        if not is_categorical_dtype(adata.obs[group]):
            raise ValueError(
                'groupby has to be a categorical observation. '
                f'Given value: {group}, Column type: {adata.obs[group].dtype}'
            )

    if var_names is None:
        rep_df = pd.DataFrame(
            _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)
        )
        categorical = adata.obs[groupby[0]]
        if len(groupby) > 1:
            for group in groupby[1:]:
                # create new category by merging the given groupby categories
                categorical = (
                    categorical.astype(str) + "_" + adata.obs[group].astype(str)
                ).astype('category')
        categorical.name = "_".join(groupby)

        rep_df.set_index(categorical, inplace=True)
        categories = rep_df.index.categories
    else:
        if use_raw is None and adata.raw is not None:
            use_raw = True
        feature_names = adata.raw.var_names if use_raw else adata.var_names
        from ..plotting._anndata import _prepare_dataframe

        categories, rep_df = _prepare_dataframe(adata, feature_names, groupby, use_raw)

    # aggregate values within categories using 'mean'
    mean_df = rep_df.groupby(level=0).mean()

    import scipy.cluster.hierarchy as sch

    corr_matrix = mean_df.T.corr(method=cor_method)
    z_var = sch.linkage(
        corr_matrix, method=linkage_method, optimal_ordering=optimal_ordering
    )
    dendro_info = sch.dendrogram(z_var, labels=list(categories), no_plot=True)

    dat = dict(
        linkage=z_var,
        groupby=groupby,
        use_rep=use_rep,
        cor_method=cor_method,
        linkage_method=linkage_method,
        categories_ordered=dendro_info['ivl'],
        categories_idx_ordered=dendro_info['leaves'],
        dendrogram_info=dendro_info,
        correlation_matrix=corr_matrix.values,
    )

    if inplace:
        if key_added is None:
            key_added = f'dendrogram_{groupby}'
        logg.info(f'Storing dendrogram info using `.uns[{key_added!r}]`')
        adata.uns[key_added] = dat
    else:
        return dat
Example #43
0
def write_column(f, data, selement, compression=None):
    """
    Write a single column of data to an open Parquet file

    Parameters
    ----------
    f: open binary file
    data: pandas Series or numpy (1d) array
    selement: thrift SchemaElement
        produced by ``find_type``
    compression: str, dict, or None
        if ``str``, must be one of the keys in ``compression.compress``
        if ``dict``, must have key ``"type"`` which specifies the compression
        type to use, which must be one of the keys in ``compression.compress``,
        and may optionally have key ``"args"`` which should be a dictionary of
        options to pass to the underlying compression engine.

    Returns
    -------
    chunk: ColumnChunk structure

    """
    has_nulls = selement.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    tot_rows = len(data)
    encoding = "PLAIN"

    if has_nulls:
        if is_categorical_dtype(data.dtype):
            num_nulls = (data.cat.codes == -1).sum()
        elif data.dtype.kind in ['i', 'b']:
            num_nulls = 0
        else:
            num_nulls = len(data) - data.count()
        definition_data, data = make_definitions(data, num_nulls == 0)
        if data.dtype.kind == "O" and not is_categorical_dtype(data.dtype):
            try:
                if selement.type == parquet_thrift.Type.INT64:
                    data = data.astype(int)
                elif selement.type == parquet_thrift.Type.BOOLEAN:
                    data = data.astype(bool)
            except ValueError as e:
                t = parquet_thrift.Type._VALUES_TO_NAMES[selement.type]
                raise ValueError('Error converting column "%s" to primitive '
                                 'type %s. Original error: '
                                 '%s' % (data.name, t, e))

    else:
        definition_data = b""
        num_nulls = 0

    # No nested field handling (encode those as J/BSON)
    repetition_data = b""

    cats = False
    name = data.name
    diff = 0
    max, min = None, None
    start = f.tell()

    if is_categorical_dtype(data.dtype):
        dph = parquet_thrift.DictionaryPageHeader(
                num_values=len(data.cat.categories),
                encoding=parquet_thrift.Encoding.PLAIN)
        bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement)
        bdata += 8 * b'\x00'
        l0 = len(bdata)
        if compression:
            bdata = compress_data(bdata, compression)
            l1 = len(bdata)
        else:
            l1 = l0
        diff += l0 - l1
        ph = parquet_thrift.PageHeader(
                type=parquet_thrift.PageType.DICTIONARY_PAGE,
                uncompressed_page_size=l0, compressed_page_size=l1,
                dictionary_page_header=dph, crc=None)

        dict_start = f.tell()
        write_thrift(f, ph)
        f.write(bdata)
        try:
            if num_nulls == 0:
                max, min = data.values.max(), data.values.min()
                if selement.type == parquet_thrift.Type.BYTE_ARRAY:
                    if selement.converted_type is not None:
                        max = encode['PLAIN'](pd.Series([max]), selement)[4:]
                        min = encode['PLAIN'](pd.Series([min]), selement)[4:]
                else:
                    max = encode['PLAIN'](pd.Series([max]), selement)
                    min = encode['PLAIN'](pd.Series([min]), selement)
        except TypeError:
            pass
        ncats = len(data.cat.categories)
        data = data.cat.codes
        cats = True
        encoding = "PLAIN_DICTIONARY"
    elif str(data.dtype) in ['int8', 'int16', 'uint8', 'uint16']:
        # encoding = "RLE"
        # disallow bitpacking for compatibility
        data = data.astype('int32')

    bdata = definition_data + repetition_data + encode[encoding](
            data, selement)
    bdata += 8 * b'\x00'
    try:
        if encoding != 'PLAIN_DICTIONARY' and num_nulls == 0:
            max, min = data.values.max(), data.values.min()
            if selement.type == parquet_thrift.Type.BYTE_ARRAY:
                if selement.converted_type is not None:
                    max = encode['PLAIN'](pd.Series([max]), selement)[4:]
                    min = encode['PLAIN'](pd.Series([min]), selement)[4:]
            else:
                max = encode['PLAIN'](pd.Series([max]), selement)
                min = encode['PLAIN'](pd.Series([min]), selement)
    except TypeError:
        pass

    dph = parquet_thrift.DataPageHeader(
            num_values=tot_rows,
            encoding=getattr(parquet_thrift.Encoding, encoding),
            definition_level_encoding=parquet_thrift.Encoding.RLE,
            repetition_level_encoding=parquet_thrift.Encoding.BIT_PACKED)
    l0 = len(bdata)

    if compression:
        bdata = compress_data(bdata, compression)
        l1 = len(bdata)
    else:
        l1 = l0
    diff += l0 - l1

    ph = parquet_thrift.PageHeader(type=parquet_thrift.PageType.DATA_PAGE,
                                   uncompressed_page_size=l0,
                                   compressed_page_size=l1,
                                   data_page_header=dph, crc=None)
    try:
        write_thrift(f, ph)
    except OverflowError as err:
        raise IOError('Overflow error while writing page; try using a smaller '
                      'value for `row_group_offsets`. Original message: ' +
                      str(err))

    f.write(bdata)

    compressed_size = f.tell() - start
    uncompressed_size = compressed_size + diff

    offset = f.tell()
    s = parquet_thrift.Statistics(max=max, min=min, null_count=num_nulls)

    p = [parquet_thrift.PageEncodingStats(
            page_type=parquet_thrift.PageType.DATA_PAGE,
            encoding=parquet_thrift.Encoding.PLAIN, count=1)]

    if isinstance(compression, dict):
        algorithm = compression.get("type", None)
    else:
        algorithm = compression

    cmd = parquet_thrift.ColumnMetaData(
            type=selement.type, path_in_schema=[name],
            encodings=[parquet_thrift.Encoding.RLE,
                       parquet_thrift.Encoding.BIT_PACKED,
                       parquet_thrift.Encoding.PLAIN],
            codec=(getattr(parquet_thrift.CompressionCodec, algorithm.upper())
                   if algorithm else 0),
            num_values=tot_rows,
            statistics=s,
            data_page_offset=start,
            encoding_stats=p,
            key_value_metadata=[],
            total_uncompressed_size=uncompressed_size,
            total_compressed_size=compressed_size)
    if cats:
        p.append(parquet_thrift.PageEncodingStats(
                page_type=parquet_thrift.PageType.DICTIONARY_PAGE,
                encoding=parquet_thrift.Encoding.PLAIN, count=1))
        cmd.dictionary_page_offset = dict_start
        cmd.key_value_metadata.append(
            parquet_thrift.KeyValue(key='num_categories', value=str(ncats)))
        cmd.key_value_metadata.append(
            parquet_thrift.KeyValue(key='numpy_dtype', value=str(data.dtype)))
    chunk = parquet_thrift.ColumnChunk(file_offset=offset,
                                       meta_data=cmd)
    write_thrift(f, chunk)
    return chunk
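For categorical columns, the writer above emits the categories as a dictionary page and the integer codes as the data page, with code -1 counted as a null. A pandas-only sketch of that split (the column values are illustrative):

import pandas as pd

s = pd.Series(pd.Categorical(["red", "green", "red", None],
                             categories=["red", "green"]))
dictionary = s.cat.categories    # unique values -> the dictionary page
codes = s.cat.codes              # [0, 1, 0, -1]; -1 marks a missing value
num_nulls = (codes == -1).sum()  # mirrors the num_nulls computation above
print(list(dictionary), codes.tolist(), num_nulls)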
Example #44
0
    def infer_variable_types(self, variable_types, time_index,
                             secondary_time_index):
        '''Infer variable types from dataframe

        Args:
            variable_types (dict[str -> dict[str -> type]]) : An entity's
                variable_types dict maps string variable ids to types (:class:`.Variable`)
                or (type, kwargs) to pass keyword arguments to the Variable.
            time_index (str or None): Name of time_index column
            secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
                that each map to a list of columns that depend on that secondary time
        '''
        link_relationships = [
            r for r in self.entityset.relationships
            if r.parent_entity.id == self.id or r.child_entity.id == self.id
        ]

        link_vars = [
            v.id for rel in link_relationships
            for v in [rel.parent_variable, rel.child_variable]
            if v.entity.id == self.id
        ]

        # TODO: set pk and pk types here
        inferred_types = {}
        df = self.df
        vids_to_assume_datetime = [time_index]
        if len(list(secondary_time_index.keys())):
            vids_to_assume_datetime.append(
                list(secondary_time_index.keys())[0])
        inferred_type = vtypes.Unknown
        for variable in df.columns:
            if variable in variable_types:
                continue

            elif variable in vids_to_assume_datetime:
                if col_is_datetime(df[variable]):
                    inferred_type = vtypes.Datetime
                else:
                    inferred_type = vtypes.Numeric

            elif df[variable].dtype == "object":
                if variable in link_vars:
                    inferred_type = vtypes.Categorical
                elif len(df[variable]):
                    if col_is_datetime(df[variable]):
                        inferred_type = vtypes.Datetime
                    else:
                        # heuristic to predict whether this is something other than categorical
                        sample = df[variable].sample(
                            min(10000, df[variable].nunique()))
                        avg_length = sample.str.len().mean()
                        if avg_length > 50:
                            inferred_type = vtypes.Text
                        else:
                            inferred_type = vtypes.Categorical

            elif df[variable].dtype == "bool":
                inferred_type = vtypes.Boolean

            elif pdtypes.is_categorical_dtype(df[variable].dtype):
                inferred_type = vtypes.Categorical

            elif col_is_datetime(df[variable]):
                inferred_type = vtypes.Datetime

            elif variable in link_vars:
                inferred_type = vtypes.Ordinal

            elif len(df[variable]):
                sample = df[variable] \
                    .sample(min(10000, df[variable].nunique(dropna=False)))

                unique = sample.unique()
                percent_unique = sample.size / len(unique)

                if percent_unique < .05:
                    inferred_type = vtypes.Categorical
                else:
                    inferred_type = vtypes.Numeric

            inferred_types[variable] = inferred_type

        return inferred_types
Example #45
0
File: methods.py Project: wikiped/dask
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    if isinstance(dfs[0].index, pd.CategoricalIndex):
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.rename(0).to_frame() for df in dfs2]
            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                  for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            out = out.reindex_axis(cat_mask.index, axis=1)
        else:
            out = pd.concat(dfs3, join=join)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
Example #46
0
    def _calculate_agg_features(self, features, entity_frames):
        test_feature = features[0]
        entity = test_feature.entity
        child_entity = test_feature.base_features[0].entity

        assert entity.id in entity_frames and child_entity.id in entity_frames

        frame = entity_frames[entity.id]
        base_frame = entity_frames[child_entity.id]
        # Sometimes approximate features get computed in a previous filter frame
        # and put in the current one dynamically,
        # so there may be existing features here
        features = [f for f in features if f.get_name() not in frame.columns]
        if not len(features):
            return frame

        # handle where
        where = test_feature.where
        if where is not None and not base_frame.empty:
            base_frame = base_frame.loc[base_frame[where.get_name()]]

        # when no child data, just add all the features to frame with nan
        if base_frame.empty:
            for f in features:
                frame[f.get_name()] = np.nan
        else:
            relationship_path = self.entityset.find_backward_path(
                entity.id, child_entity.id)

            groupby_var = Relationship._get_link_variable_name(
                relationship_path)

            # if the use_previous property exists on this feature, include only the
            # instances from the child entity included in that Timedelta
            use_previous = test_feature.use_previous
            if use_previous and not base_frame.empty:
                # Filter by use_previous values
                time_last = self.time_last
                if use_previous.is_absolute():
                    time_first = time_last - use_previous
                    ti = child_entity.time_index
                    if ti is not None:
                        base_frame = base_frame[base_frame[ti] >= time_first]
                else:
                    n = use_previous.value

                    def last_n(df):
                        return df.iloc[-n:]

                    base_frame = base_frame.groupby(groupby_var,
                                                    observed=True,
                                                    sort=False).apply(last_n)

            to_agg = {}
            agg_rename = {}
            to_apply = set()
            # apply multivariable and time-dependent features as we find them, and
            # save aggregable features for later
            for f in features:
                if _can_agg(f):
                    variable_id = f.base_features[0].get_name()

                    if variable_id not in to_agg:
                        to_agg[variable_id] = []

                    func = f.get_function()
                    funcname = func
                    if callable(func):
                        funcname = func.__name__

                    to_agg[variable_id].append(func)
                    # this is used below to rename columns that pandas names for us
                    agg_rename[u"{}-{}".format(variable_id,
                                               funcname)] = f.get_name()
                    continue

                to_apply.add(f)

            # Apply the non-aggregable functions to generate a new dataframe,
            # and merge it with the existing one
            if len(to_apply):
                wrap = agg_wrapper(to_apply, self.time_last)
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).apply(wrap)
                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

            # Apply the aggregate functions to generate a new dataframe, and merge
            # it with the existing one
            if len(to_agg):
                # groupby_var can be both the name of the index and a column,
                # to silence pandas warning about ambiguity we explicitly pass
                # the column (in actuality grouping by both index and group would
                # work)
                to_merge = base_frame.groupby(base_frame[groupby_var],
                                              observed=True,
                                              sort=False).agg(to_agg)
                # rename columns to the correct feature names
                to_merge.columns = [
                    agg_rename["-".join(x)] for x in to_merge.columns.ravel()
                ]
                to_merge = to_merge[list(agg_rename.values())]

                # workaround for pandas bug where categories are in the wrong order
                # see: https://github.com/pandas-dev/pandas/issues/22501
                if pdtypes.is_categorical_dtype(frame.index):
                    categories = pdtypes.CategoricalDtype(
                        categories=frame.index.categories)
                    to_merge.index = to_merge.index.astype(object).astype(
                        categories)

                frame = pd.merge(left=frame,
                                 right=to_merge,
                                 left_index=True,
                                 right_index=True,
                                 how='left')

        # Handle default values
        # 1. handle non-scalar default values
        iterfeats = [
            f for f in features if hasattr(f.default_value, '__iter__')
        ]
        for f in iterfeats:
            nulls = pd.isnull(frame[f.get_name()])
            for ni in nulls[nulls].index:
                frame.at[ni, f.get_name()] = f.default_value

        # 2. handle scalar default values
        fillna_dict = {
            f.get_name(): f.default_value
            for f in features if f not in iterfeats
        }
        frame.fillna(fillna_dict, inplace=True)

        # convert boolean dtypes to floats as appropriate
        # pandas behavior: https://github.com/pydata/pandas/issues/3752
        for f in features:
            if (not f.expanding and f.variable_type == variable_types.Numeric
                    and frame[f.get_name()].dtype.name in ['object', 'bool']):
                frame[f.get_name()] = frame[f.get_name()].astype(float)

        return frame
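The to_agg / agg_rename bookkeeping above reduces to a standard pandas pattern: aggregate with a dict of function lists, then flatten the resulting MultiIndex columns back into feature names. A minimal sketch of just that step, with hypothetical toy data and feature names:

import pandas as pd

base_frame = pd.DataFrame({'customer_id': [1, 1, 2],
                           'amount': [10.0, 20.0, 5.0]})

to_agg = {'amount': ['sum', 'mean']}
agg_rename = {'amount-sum': 'SUM(transactions.amount)',
              'amount-mean': 'MEAN(transactions.amount)'}

to_merge = base_frame.groupby('customer_id', sort=False).agg(to_agg)
# agg() with a dict of lists yields MultiIndex columns such as ('amount', 'sum');
# join each tuple with '-' and look the proper feature name up, as above.
to_merge.columns = [agg_rename['-'.join(c)] for c in to_merge.columns.ravel()]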
示例#47
0
def get_numpy_type(dtype):
    if is_categorical_dtype(dtype):
        return 'category'
    else:
        return str(dtype)
示例#48
0
def _is_discrete(s, force_nominal):
    return (is_categorical_dtype(s) or is_object_dtype(s) and
            (force_nominal or s.nunique() < s.size**.666))
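A quick check of this heuristic on toy columns (hypothetical data, assuming the _is_discrete definition above): a low-cardinality string column counts as discrete, while a high-cardinality one only does when force_nominal is set.

import pandas as pd

labels = pd.Series(['red', 'green', 'red', 'blue'] * 25)   # 3 unique out of 100
ids = pd.Series([f'id{i}' for i in range(100)])            # 100 unique out of 100

assert _is_discrete(labels, force_nominal=False)   # 3 < 100 ** .666 ≈ 21.5
assert not _is_discrete(ids, force_nominal=False)  # 100 >= 21.5
assert _is_discrete(ids, force_nominal=True)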
示例#49
0
# build a transformer pipeline based on sp data
SPTransformer = CatTransformer.build_transformer_pipeline(sp_india_data,
                                                          label=None)

# transform the data before modeling
#x = SPTransformer.fit(sp_india_data)
prepared = SPTransformer.fit_transform(sp_india_data)

from pandas.api.types import is_numeric_dtype, is_categorical_dtype, \
     is_string_dtype

cats = []
nums = []
for k, v in sp_india_data.items():
    if is_categorical_dtype(v) or is_string_dtype(v):
        cats.append(k)
    elif is_numeric_dtype(v):
        nums.append(k)

cols = list(
    SPTransformer.transformers_[0][1].named_steps['ohe'].get_feature_names(
        input_features=cats)) + nums

clf = tree.DecisionTreeClassifier(max_depth=4)
clf = clf.fit(prepared, sp_india_label)

SPTransformer.transformers_[0][1].named_steps['ohe'].get_feature_names(
    input_features=cats)
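The snippet depends on project-specific objects (CatTransformer, sp_india_data, sp_india_label); below is a self-contained sketch of the same pattern with toy data. Note that in recent scikit-learn get_feature_names() was renamed get_feature_names_out().

import pandas as pd
from pandas.api.types import is_numeric_dtype, is_categorical_dtype, is_string_dtype
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'state': ['UP', 'MH', 'UP'], 'year': [2001, 2002, 2003]})
cats = [c for c in df if is_categorical_dtype(df[c]) or is_string_dtype(df[c])]
nums = [c for c in df if is_numeric_dtype(df[c])]

# One-hot encode the categorical columns, pass numeric columns through.
ct = ColumnTransformer([('ohe', OneHotEncoder(handle_unknown='ignore'), cats)],
                       remainder='passthrough')
prepared = ct.fit_transform(df)

# Recover the expanded column names of the transformed matrix.
cols = list(ct.named_transformers_['ohe'].get_feature_names_out(cats)) + nums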
示例#50
0
def trajectory_tree(adata: AnnData,
                    root_node: Optional[str] = None,
                    method: Literal["euclid", "gauss", "paga"] = "euclid",
                    dimension: Literal["pca", "diffmap"] = "pca",
                    tree: Optional[nx.DiGraph] = None,
                    groupby: str = "leiden"):
    """\
    Calculate a trajectory tree.

    Specify AnnData and the root name in the trajectory.

    You can use your own cluster metadata stored in adata.obs
    by passing its key in adata.obs to `groupby`.

    You can also define your own tree by drawing it as nx.DiGraph
    and pass it to `tree`.

    Parameters
    ----------
    adata : AnnData
        The annotated data matrix.
    root_node : Optional[str]
        The cluster used as the root of the tree.
        If `None`, the most likely root is estimated, but that estimate is
        not yet reliable, so setting the root cluster explicitly is
        recommended. By default `None`.
    method : Literal["euclid", "gauss", "paga"]
        Method for calculating the tree, by default "euclid".
    dimension : Literal["pca", "diffmap"]
        The representation used to calculate the trajectory tree,
        by default "pca".
    tree : Optional[nx.DiGraph]
        If `None`, the trajectory tree is calculated. If an nx.DiGraph is
        given, the tree is constructed from it, by default `None`.
    groupby : str
        Key for a categorical column in `adata.obs`. You can pass your own
        cluster metadata here, by default "leiden".
    """
    if method not in ["euclid", "paga", "gauss"]:
        raise ValueError(
            "Argument 'method' must be 'euclid','paga' or 'gauss'.")
    if dimension not in ['pca', 'diffmap']:
        raise ValueError("Argument 'dimension' must be 'pca' or 'diffmap'.")

    if not isinstance(adata, AnnData):
        raise ValueError("trajectory_tree() expects AnnData argument")
    else:
        if groupby not in adata.obs.keys():
            raise ValueError("Did not find adata.obs[{}]".format(groupby))
        if "X_{}".format(dimension) not in list(adata.obsm):
            raise ValueError(
                "Did not find 'X_{}' in adata.obsm. Run scanpy.tl to calculate first."
                .format(dimension))

    # When adata.obs[groupby] holds metadata that is not categorical, later
    # steps raise errors, so cast it to a categorical dtype here.
    if not is_categorical_dtype(adata.obs[groupby]):
        adata.obs[groupby] = adata.obs[groupby].astype('category')

    pp = Preprocessing()
    pp.trajectory_tree(adata,
                       root_node=root_node,
                       tree=tree,
                       method=method,
                       dimension=dimension,
                       groupby=groupby)
示例#51
0
 def _is_discrete(s):
     return (is_categorical_dtype(s) or
             is_object_dtype(s) and (force_nominal or
                                     s.nunique() < s.size**.666))
示例#52
0
def _assert_categorical_obs(adata: AnnData, key: str) -> None:
    if key not in adata.obs:
        raise KeyError(f"Cluster key `{key}` not found in `adata.obs`.")

    if not is_categorical_dtype(adata.obs[key]):
        raise TypeError(f"Expected `adata.obs[{key!r}]` to be `categorical`, found `{infer_dtype(adata.obs[key])}`.")
示例#53
0
 def less_than_equal_to_scalar(vals):
     if (pdtypes.is_categorical_dtype(vals)
             and self.value not in vals.cat.categories):
         return np.nan
     return vals <= self.value
示例#54
0
 def greater_than_scalar(vals):
     if (pdtypes.is_categorical_dtype(vals)
             and self.value not in vals.cat.categories):
         return np.nan
     return vals > self.value
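The category-membership check in these two closures sidesteps the TypeError pandas raises when an ordered categorical is compared against a scalar that is not one of its categories. A small standalone sketch of that behaviour (toy data):

import numpy as np
import pandas as pd

vals = pd.Series(pd.Categorical(['low', 'high', 'low'],
                                categories=['low', 'high'], ordered=True))

ok = vals > 'low'        # 'low' is a known category, so this works
try:
    vals > 'medium'      # 'medium' is not a category
except TypeError:
    result = np.nan      # the closures above return NaN instead of raising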
示例#55
0
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None,
          timezones=None):
    """
    Create empty DataFrame to assign into

    In the simplest case, will return a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value `views`
    is a dictionary of numpy arrays into which you can assign values that
    show up in the dataframe.

    For categorical columns, you get two views to assign into: if the
    column name is "col", you get both "col" (the category codes) and
    "col-catdef" (the category labels).

    For a single categorical index, use the `.set_categories` method of the
    appropriate "-catdef" entry, passing an Index of values:

    ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)``

    Multi-indexes work a lot like categoricals, even if the types of each
    index are not themselves categories, and will also have "-catdef" entries
    in the views. However, these will be Dummy instances, providing only a
    ``.set_categories`` method, to be used as above.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples;
        applies to non-categorical columns. If there are only categorical
        columns, an empty string or None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, e.g. `{'col': 5}`,
        temporary labels will be generated using range. If None, or a column
        name is missing, 16-bit integer codes are assumed (a reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    index_types: list of str
        For one or more index columns, make them have this type. See general
        description, above, for caveats about multi-indexing. If None, the
        index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of numpy views, keyed by column name, into the columns of the
        dataframe. Assign into these.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[six.text_type(col)] = Categorical([], categories=cat(col),
                                                 fastpath=True)
        else:
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
            df[six.text_type(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            index._data._codes = vals
            views[col] = vals
            views[col+'-catdef'] = index._data
        else:
            d = np.empty(size, dtype=t)
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        for i, col in enumerate(index_names):
            index._levels.append(Index([None]))

            def set_cats(values, i=i, col=col, **kwargs):
                values.name = col
                if index._levels[i][0] is None:
                    index._levels[i] = values
                elif not index._levels[i].equals(values):
                    raise RuntimeError("Different dictionaries encountered"
                                       " while building categorical")

            x = Dummy()
            x._set_categories = set_cats

            d = np.zeros(size, dtype=int)
            index._labels.append(d)
            views[col] = d
            views[col+'-catdef'] = x

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape, dtype="M8[ns]")
            new_block = block.make_block_same_class(
                    values=values, dtype=block.values.dtype)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col+'-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = np.asarray(block.values, dtype='M8[ns]')
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
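The two-view scheme the docstring describes (category codes in "col", labels in "col-catdef") corresponds to this plain-pandas idiom; a minimal sketch, independent of the helper above:

import numpy as np
import pandas as pd

codes = np.zeros(5, dtype='i1')        # the "col" view: write category codes here
codes[:] = [0, 1, 1, 0, 1]
labels = pd.Index(['mary', 'mo'])      # the "col-catdef" side: the category labels
col = pd.Categorical.from_codes(codes, categories=labels)
df = pd.DataFrame({'col': col})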
示例#56
0
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col
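Hypothetical usage of the helper on a whole frame (assuming convert_col_dtype and a DataFrame df are in scope): DataFrame.apply feeds each column in as a Series, so the per-column rules run across the board.

df = df.apply(convert_col_dtype, int_to_category=True, force_fp32=True)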