Exemplo n.º 1
0
    def na_op(x, y):
        """
        Evaluate ``op(x, y)``, retrying on the non-null entries on TypeError.

        ``expressions.evaluate`` is tried first.  When it raises ``TypeError``
        ``x`` (and ``y``, if it is an ndarray or Series) is flattened, ``op``
        is applied only at the positions that are not null, the skipped
        positions are set to NaN through ``com._maybe_upcast_putmask`` and the
        result is reshaped to ``x.shape``.  The outcome of either path is
        passed through ``com._fill_zeros`` before being returned.
        """
        try:
            result = expressions.evaluate(
                op, str_rep, x, y, raise_on_error=True, **eval_kwargs)
        except TypeError:
            x_flat = x.ravel()
            if not isinstance(y, (np.ndarray, pd.Series)):
                # y is used as-is: only nulls in x are masked out
                result = np.empty(x.size, dtype=x.dtype)
                mask = notnull(x_flat)
                x_flat = x_flat[mask]
                if np.prod(x_flat.shape):
                    result[mask] = op(x_flat, y)
            else:
                # array-like y: operate where both operands are present
                common_dtype = np.find_common_type([x.dtype, y.dtype], [])
                result = np.empty(x.size, dtype=common_dtype)
                y_flat = y.ravel()
                mask = notnull(x_flat) & notnull(y_flat)
                x_flat = x_flat[mask]
                y_flat = y_flat[mask]
                if np.prod(x_flat.shape) and np.prod(y_flat.shape):
                    result[mask] = op(x_flat, y_flat)

            # positions excluded by the mask become NaN
            result, _ = com._maybe_upcast_putmask(result, ~mask, np.nan)
            result = result.reshape(x.shape)

        return com._fill_zeros(result, x, y, name, fill_zeros)
Exemplo n.º 2
0
	def df_lambda_replace(self, func_dict, df_list=None, inplace=False):
		"""
		Replace values column-wise in ``self.ALL`` using a dict of functions.

		For every key of ``func_dict`` the column ``('X', key)`` is addressed
		and the mapped function is applied to its non-null entries.  The
		result is cast to the dtype stored under key ``1`` of ``func_dict``;
		if that attempt fails for any ordinary reason (for instance the key is
		missing) a message is printed and the result is cast to float instead.

		Parameters
		----------
		func_dict : dict
			Maps a column name to the function applied to that column.
		df_list : list, optional
			Accepted for interface compatibility; not used by this method.
		inplace : bool, default False
			When equal to False, ``self`` is returned; otherwise nothing is
			returned.  ``self.ALL`` is modified in either case.

		NOTE: NaN, or missing values will be avoided.
		"""
		df = self.ALL
		for key in func_dict:
			idx = ('X', key)
			try:
				df.loc[com.notnull(df[idx]), idx] = \
					df.loc[com.notnull(df[idx]), idx].\
					map(func_dict[key]).astype(func_dict[1])
			except Exception:
				# Only ordinary errors trigger the float fallback; a bare
				# ``except`` would also swallow KeyboardInterrupt/SystemExit.
				# The call form of print works on Python 2 and Python 3.
				print('No type specified in func_dict so float is assumed.')
				df.loc[com.notnull(df[idx]), idx] = \
					df.loc[com.notnull(df[idx]), idx].\
					map(func_dict[key]).astype(float)
		if inplace == False:
			return self
Exemplo n.º 3
0
    def na_op(x, y):
        """
        Evaluate ``op(x, y)``, retrying on the non-null entries on TypeError.

        The fast path is ``expressions.evaluate``.  If it raises ``TypeError``
        the operation is retried on flattened data restricted to non-null
        positions; masked-out positions are set to NaN and the result is
        reshaped to ``x.shape``.  ``com._fill_zeros`` post-processes the
        result of either path.

        Raises
        ------
        TypeError
            In the fallback path, when ``y`` is neither an ndarray nor a
            Series and ``x`` has no ``size`` attribute.
        """
        try:
            result = expressions.evaluate(
                op, str_rep, x, y, raise_on_error=True, **eval_kwargs)
        except TypeError:
            # NOTE(review): ravel() is called before the hasattr(x, 'size')
            # check below, so x must already support ravel() to get that far.
            xrav = x.ravel()
            if isinstance(y, (np.ndarray, pd.Series)):
                # array-like y: operate only where both operands are non-null
                dtype = np.find_common_type([x.dtype, y.dtype], [])
                result = np.empty(x.size, dtype=dtype)
                yrav = y.ravel()
                mask = notnull(xrav) & notnull(yrav)
                xrav = xrav[mask]
                yrav = yrav[mask]
                # skip the call entirely when nothing survives the mask
                if np.prod(xrav.shape) and np.prod(yrav.shape):
                    result[mask] = op(xrav, yrav)
            elif hasattr(x,'size'):
                # y is passed to op unchanged: only nulls in x are masked
                result = np.empty(x.size, dtype=x.dtype)
                mask = notnull(xrav)
                xrav = xrav[mask]
                if np.prod(xrav.shape):
                    result[mask] = op(xrav, y)
            else:
                raise TypeError("cannot perform operation {op} between objects "
                                "of type {x} and {y}".format(op=name,x=type(x),y=type(y)))

            # fill the positions excluded by the mask with NaN
            result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan)
            result = result.reshape(x.shape)

        result = com._fill_zeros(result, x, y, name, fill_zeros)

        return result
Exemplo n.º 4
0
    def relabel(key):
        """
        Return the ``'<x>x<y>'`` label for ``key``.

        The position of ``key`` is looked up in ``index_map``; each of the two
        labels at that position is rendered as an int, or as the literal
        'NULL' when it is null.
        """
        position = index_map[key]
        x_value, y_value = xlabels[position], ylabels[position]

        def _render(label):
            return int(label) if com.notnull(label) else 'NULL'

        x_part = _render(x_value)
        y_part = _render(y_value)
        return '%sx%s' % (x_part, y_part)
Exemplo n.º 5
0
def nancov(a, b):
    """
    Compute the covariance of ``a`` and ``b`` over pairwise non-null entries.

    Parameters
    ----------
    a, b : ndarrays of equal length

    Returns
    -------
    cov : float
        Off-diagonal element of ``np.cov(a, b)``; NaN when no position has
        both values present.

    Raises
    ------
    AssertionError
        If the operands differ in length.
    """
    # Raise explicitly: a bare ``assert`` is stripped under ``python -O``.
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    # Check emptiness after masking so that all-null input yields NaN
    # directly instead of handing empty arrays to np.cov.
    if len(a) == 0:
        return np.nan

    return np.cov(a, b)[0, 1]
Exemplo n.º 6
0
def test_notnull():
    """
    Check ``notnull`` on scalars, arrays and Series under both settings of
    the ``mode.use_inf_as_null`` option.
    """
    assert notnull(1.)
    assert not notnull(None)
    # np.nan is the canonical spelling; the np.NaN alias is gone in NumPy 2.0
    assert not notnull(np.nan)

    # infinities count as regular values when the option is off
    with cf.option_context("mode.use_inf_as_null", False):
        assert notnull(np.inf)
        assert notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.all()

    # ... and as nulls when it is on
    with cf.option_context("mode.use_inf_as_null", True):
        assert not notnull(np.inf)
        assert not notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.sum() == 2

    with cf.option_context("mode.use_inf_as_null", False):
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isnull(s), Series)
            # this test is about notnull, so check its return type as well
            assert isinstance(notnull(s), Series)
Exemplo n.º 7
0
def test_isnull_datetime():
    """Check isnull/notnull on datetime scalars and on an index holding iNaT."""
    assert not isnull(datetime.now())
    assert notnull(datetime.now())

    dates = date_range('1/1/1990', periods=20)
    assert notnull(dates).all()

    # overwrite the first entry with iNaT and rebuild the index from it
    raw = np.asarray(dates)
    raw[0] = iNaT
    null_flags = isnull(DatetimeIndex(raw))
    assert null_flags[0]
    assert not null_flags[1:].any()
Exemplo n.º 8
0
def nancov(a, b):
    """
    Covariance of ``a`` and ``b`` restricted to positions where both are
    non-null.

    Returns NaN when nothing is left after dropping nulls.  Raises
    AssertionError when the operands differ in length.
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    both_present = notnull(a) & notnull(b)
    if not both_present.all():
        a, b = a[both_present], b[both_present]

    return np.nan if len(a) == 0 else np.cov(a, b)[0, 1]
Exemplo n.º 9
0
def nancorr(a, b, method='pearson'):
    """
    Correlate ``a`` and ``b`` over the positions where both are non-null.

    Parameters
    ----------
    a, b : ndarrays of equal length
    method : passed to ``get_corr_func`` to select the correlation function

    Returns
    -------
    Result of the selected correlation function on the filtered data, or NaN
    when no position has both values present.

    Raises
    ------
    AssertionError
        If the operands differ in length.
    """
    # Raise explicitly: a bare ``assert`` is stripped under ``python -O``.
    if len(a) != len(b):
        raise AssertionError('Operands to nancorr must have same size')

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    # Check emptiness after masking so that all-null input yields NaN
    # instead of reaching the correlation function with empty arrays.
    if len(a) == 0:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)
Exemplo n.º 10
0
def test_isnull_datetime():
    """Check isnull/notnull on datetime scalars and on an index holding iNaT."""
    assert not isnull(datetime.now())
    assert notnull(datetime.now())

    dates = date_range("1/1/1990", periods=20)
    assert notnull(dates).all()

    import pandas.lib as lib

    # overwrite the first entry with iNaT and rebuild the index from it
    raw = np.asarray(dates)
    raw[0] = lib.iNaT
    null_flags = isnull(DatetimeIndex(raw))
    assert null_flags[0]
    assert not null_flags[1:].any()
Exemplo n.º 11
0
def value_counts(values, sort=True, ascending=False):
    """
    Count how often each non-null value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the resulting counts
    ascending : boolean, default False
        Sort in ascending order; otherwise the sorted result is reversed

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    if not com.is_integer_dtype(values.dtype):
        # generic path: tally the non-null entries one by one
        tallies = defaultdict(int)
        for item in values[com.notnull(values)]:
            tallies[item] += 1
        result = Series(tallies)
    else:
        # integer input goes through the dedicated int64 counting routine
        int_values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(int_values)
        result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Exemplo n.º 12
0
def _bucketpanel_cat(series, xcat, ycat):
    """
    Bucket ``series`` by the combination of two categorizations.

    The label arrays obtained from ``_intern`` for ``xcat`` and ``ycat`` are
    merged into one numeric key per element, ``bucketcat`` is applied with
    that key, and the columns of its result are renamed to strings of the
    form ``'(x, y)'``.
    """
    xlabels, xmapping = _intern(xcat)
    ylabels, ymapping = _intern(ycat)

    # Merge both label arrays into a single key by scaling the x labels with
    # a power of ten derived from the largest y label.
    # NOTE(review): presumably meant to keep (x, y) pairs distinct -- confirm
    # for the case where ylabels.max() is itself a power of ten.
    shift = 10 ** (np.ceil(np.log10(ylabels.max())))
    labels = xlabels * shift + ylabels

    # sort the combined keys and carry the x/y labels along
    sorter = labels.argsort()
    sorted_labels = labels.take(sorter)
    sorted_xlabels = xlabels.take(sorter)
    sorted_ylabels = ylabels.take(sorter)

    unique_labels = np.unique(labels)
    unique_labels = unique_labels[com.notnull(unique_labels)]

    # first occurrence of each unique key in the sorted keys gives the
    # x/y labels that belong to it
    locs = sorted_labels.searchsorted(unique_labels)
    xkeys = sorted_xlabels.take(locs)
    ykeys = sorted_ylabels.take(locs)

    stringified = ['(%s, %s)' % arg
                   for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))]

    result = bucketcat(series, labels)
    # NOTE(review): assumes bucketcat returns its columns in the same order
    # as unique_labels -- confirm against bucketcat.
    result.columns = stringified

    return result
Exemplo n.º 13
0
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    innermost level of the resulting hierarchical index

    Parameters
    ----------
    frame : DataFrame
    level : int, default -1
        Forwarded to ``_stack_multi_columns`` when the columns are a
        MultiIndex; not used otherwise.
    dropna : boolean, default True
        Drop null entries from the result.

    Returns
    -------
    stacked : Series
    """
    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        # forward the caller's dropna instead of always forcing True
        return _stack_multi_columns(frame, level=level, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        # extend the existing row levels with one more level for the columns
        new_levels = list(frame.index.levels)
        new_levels.append(frame.columns)

        # each row label is repeated once per column; column positions cycle
        new_labels = [lab.repeat(K) for lab in frame.index.labels]
        new_labels.append(np.tile(np.arange(K), N).ravel())

        new_names = list(frame.index.names)
        new_names.append("columns")
        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names)
    else:
        ilabels = np.arange(N).repeat(K)
        clabels = np.tile(np.arange(K), N).ravel()
        new_index = MultiIndex(levels=[frame.index, frame.columns],
                               labels=[ilabels, clabels])

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]
    return Series(new_values, index=new_index)
Exemplo n.º 14
0
def nancov(a, b, min_periods=None):
    """
    Covariance of ``a`` and ``b`` restricted to positions where both are
    non-null.

    Returns NaN when fewer than ``min_periods`` (1 if not given) positions
    remain after dropping nulls.  Raises AssertionError when the operands
    differ in length.
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    required = 1 if min_periods is None else min_periods

    both_present = notnull(a) & notnull(b)
    if not both_present.all():
        a, b = a[both_present], b[both_present]

    if len(a) < required:
        return np.nan
    return np.cov(a, b)[0, 1]
Exemplo n.º 15
0
def make_sparse(arr, kind='block', fill_value=nan):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
        Entries equal to this value (or null, when it is null) are dropped

    Returns
    -------
    (sparse_values, index) : (ndarray, SparseIndex)

    Raises
    ------
    TypeError
        If the sanitized input has more than one dimension.
    """
    arr = _sanitize_values(arr)
    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    # True marks the entries that are kept
    mask = com.notnull(arr) if com.isnull(fill_value) else arr != fill_value

    n = len(arr)
    if n == mask.size:
        indices = np.arange(n, dtype=np.int32)[mask]
    else:
        # the arr is a SparseArray
        indices = mask.sp_index.indices

    index = _make_index(n, indices, kind)
    return arr[mask], index
Exemplo n.º 16
0
def value_counts(values, sort=True, ascending=False):
    """
    Count how often each non-null value occurs.

    Integer input is counted by ``lib.value_count_int64``; anything else is
    tallied element by element after dropping nulls.  With ``sort`` the
    counts are sorted, and reversed unless ``ascending`` is set.

    Returns
    -------
    value_counts : Series
    """
    from collections import defaultdict

    if com.is_integer_dtype(values.dtype):
        keys, counts = lib.value_count_int64(com._ensure_int64(values))
        result = Series(counts, index=keys)
    else:
        occurrences = defaultdict(int)
        non_null = values[com.notnull(values)]
        for v in non_null:
            occurrences[v] += 1
        result = Series(occurrences)

    if not sort:
        return result

    result.sort()
    return result if ascending else result[::-1]
Exemplo n.º 17
0
 def median(self):
     """
     Return the median of the non-null entries of ``self.values``, as
     computed by ``tseries.median``.
     """
     values = self.values
     present = notnull(values)
     return tseries.median(values[present])
Exemplo n.º 18
0
    def _format_strings(self):
        """
        Return the formatted string for every element of ``self.values``.

        Non-null floats go through the float format (``self.float_format``,
        else the ``print_config`` one, else a ``%g`` format built from
        ``print_config.precision``).  Everything else is rendered through the
        element formatter and prefixed with one space; nulls become
        ``self.na_rep`` (or 'None' for ``None``) when ``na_rep`` is set.
        """
        float_format = self.float_format
        if float_format is None:
            float_format = print_config.float_format
            if float_format is None:
                fmt_str = '%% .%dg' % print_config.precision
                float_format = lambda x: fmt_str % x

        if self.formatter is None:
            formatter = com.pprint_thing
        else:
            formatter = self.formatter

        def _format(x):
            if self.na_rep is None or not lib.checknull(x):
                # object dtype
                return '%s' % formatter(x)
            return 'None' if x is None else self.na_rep

        vals = self.values
        is_float = lib.map_infer(vals, com.is_float) & notnull(vals)

        # every non-float entry gets the same leading space, whether or not
        # any float is present
        return [float_format(v) if is_float[i] else ' %s' % _format(v)
                for i, v in enumerate(vals)]
Exemplo n.º 19
0
    def dropna(self, axis=0, how='any'):
        """
        Drop 2D from panel, holding passed axis constant

        Parameters
        ----------
        axis : int, default 0
            Axis to hold constant. E.g. axis=1 will drop major_axis entries
            having a certain amount of NA data
        how : {'all', 'any'}, default 'any'
            'any': drop an entry when one or more of its values are NA.
            'all': drop it only when all of its values are NA.

        Returns
        -------
        dropped : Panel
        """
        axis = self._get_axis_number(axis)
        values = self.values

        # count the non-null cells of each entry along ``axis`` by summing
        # out every other axis, highest axis number first
        counts = com.notnull(values)
        other_axes = set(range(self._AXIS_LEN)) - set([axis])
        for ax in sorted(other_axes, reverse=True):
            counts = counts.sum(ax)

        # number of cells in one entry along ``axis``
        slice_size = np.prod(values.shape[:axis] + values.shape[axis + 1:])

        keep = (counts > 0) if how == 'all' else (counts == slice_size)
        return self.reindex_axis(self._get_axis(axis)[keep], axis=axis)
Exemplo n.º 20
0
def bucketcat(series, cats):
    """
    Split a Series into one DataFrame column per distinct category label

    Parameters
    ----------
    series : Series
        Anything else is wrapped in a Series with a 0..n-1 index
    cats : Series or same-length array
        category of each element; null categories are ignored

    Returns
    -------
    DataFrame
        One column per non-null unique category, holding the elements of
        ``series`` that belong to it
    """
    if not isinstance(series, Series):
        series = Series(series, index=np.arange(len(series)))

    cats = np.asarray(cats)

    labels = np.unique(cats)
    labels = labels[com.notnull(labels)]

    # group by
    grouped = dict((label, series[cats == label]) for label in labels)

    return DataFrame(grouped, columns=labels)
Exemplo n.º 21
0
    def na_op(x, y):
        """
        Evaluate ``op(x, y)``, retrying on the non-null entries on TypeError.

        ``expressions.evaluate`` is tried first.  When it raises ``TypeError``
        ``op`` is applied only where ``x`` (and ``y``, if it is an array or
        Series) is not null and the remaining positions are set to ``pa.NA``.
        The result of either path is passed through ``com._fill_zeros``.
        """
        try:
            result = expressions.evaluate(op, str_rep, x, y,
                                          raise_on_error=True, **eval_kwargs)
        except TypeError:
            result = pa.empty(len(x), dtype=x.dtype)
            if isinstance(y, (pa.Array, pd.Series)):
                mask = notnull(x) & notnull(y)
                result[mask] = op(x[mask], y[mask])
            else:
                mask = notnull(x)
                result[mask] = op(x[mask], y)

            # Invert the boolean mask with ``~``: unary minus on boolean
            # arrays is rejected by current NumPy.
            result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA)

        result = com._fill_zeros(result, y, fill_zeros)
        return result
Exemplo n.º 22
0
 def cumprod(self, axis=0, dtype=None, out=None):
     """
     Cumulative product over the non-null entries.

     Works on a copy; null positions are left as they are.  ``axis``,
     ``dtype`` and ``out`` are accepted but not used.
     """
     result = self.copy()
     valid = notnull(result)
     raw = result.view(ndarray)
     result[valid] = np.cumprod(raw[valid])
     return result
Exemplo n.º 23
0
 def f(arr):
     """
     Median of ``arr``: nulls are dropped first when ``skipna`` is set,
     otherwise any null makes the result NaN.
     """
     present = common.notnull(arr)
     if skipna:
         return _tseries.median(arr[present])
     if present.all():
         return _tseries.median(arr)
     return np.nan
Exemplo n.º 24
0
def nancorr(a, b, method='pearson'):
    """
    Correlate ``a`` and ``b`` (ndarrays) over the positions where both are
    non-null, using the function ``get_corr_func`` returns for ``method``.

    Returns NaN when nothing is left after dropping nulls.  Raises
    AssertionError when the operands differ in length.
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancorr must have same size')

    both_present = notnull(a) & notnull(b)
    if not both_present.all():
        a, b = a[both_present], b[both_present]

    if len(a) == 0:
        return np.nan

    corr_func = get_corr_func(method)
    return corr_func(a, b)
Exemplo n.º 25
0
    def count(self):
        """
        Return the number of non-null entries in ``self.values``.

        Returns
        -------
        nobs : int
        """
        present = notnull(self.values)
        return present.sum()
Exemplo n.º 26
0
    def na_op(x, y):
        """
        Evaluate ``op(x, y)``, retrying on the non-null entries on TypeError.

        ``expressions.evaluate`` is tried first.  When it raises ``TypeError``
        ``op`` is applied only where ``x`` (and ``y``, if it is an array or
        Series) is not null and the remaining positions are set to ``pa.NA``.
        The result of either path is passed through ``com._fill_zeros``.
        """
        try:
            result = expressions.evaluate(op, str_rep, x, y,
                                          raise_on_error=True, **eval_kwargs)
        except TypeError:
            if not isinstance(y, (pa.Array, pd.Series)):
                # y is passed to op unchanged: only nulls in x are masked
                result = pa.empty(len(x), dtype=x.dtype)
                mask = notnull(x)
                result[mask] = op(x[mask], y)
            else:
                # array-like y: operate where both operands are present
                common_dtype = np.find_common_type([x.dtype, y.dtype], [])
                result = np.empty(x.size, dtype=common_dtype)
                mask = notnull(x) & notnull(y)
                result[mask] = op(x[mask], y[mask])

            result, _ = com._maybe_upcast_putmask(result, ~mask, pa.NA)

        return com._fill_zeros(result, x, y, name, fill_zeros)
Exemplo n.º 27
0
 def _get_workbook_rowdicts(self):
     '''
     Return one dict per row of the parsed workbook sheet, keyed by column,
     with null-valued entries left out.
     '''
     # 'records' is passed positionally because the keyword was renamed
     # from ``outtype`` to ``orient`` across pandas releases.
     rows = self.workbook.parse().to_dict('records')
     # dict.items() works on Python 2 and Python 3, unlike iteritems()
     return [dict((k, v) for k, v in row.items() if notnull(v))
             for row in rows]
Exemplo n.º 28
0
 def cumsum(self, axis=0, dtype=None, out=None):
     """
     Cumulative sum over the non-null entries.

     Works on a copy that is cast to the dtype of the running sum; null
     positions are left as they are.  ``axis``, ``dtype`` and ``out`` are
     accepted but not used.
     """
     output = self.copy()
     valid = notnull(output)
     running = np.cumsum(output.view(ndarray)[valid])
     output = output.astype(running.dtype)
     output[valid] = running
     return output
Exemplo n.º 29
0
    def _reindex_columns(self, columns, copy, level, fill_value):
        """
        Build a SparseDataFrame restricted to ``columns``.

        Only the existing columns that appear in ``columns`` are carried
        over; the frame keeps ``self.index`` and ``self.default_fill_value``.
        ``copy`` is accepted but not used.

        Raises
        ------
        TypeError
            If ``level`` is given.
        NotImplementedError
            If ``fill_value`` is not null.
        """
        if level is not None:
            # a specific exception type instead of a bare Exception; callers
            # catching Exception still catch it
            raise TypeError("Reindex by level not supported for sparse")

        if com.notnull(fill_value):
            raise NotImplementedError(
                "Reindexing sparse columns with a fill_value is not supported")

        # TODO: fill value handling
        sdict = dict((k, v) for k, v in self.iteritems() if k in columns)
        return SparseDataFrame(sdict, index=self.index, columns=columns,
                               default_fill_value=self.default_fill_value)
Exemplo n.º 30
0
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    innermost level of the resulting hierarchical index

    Parameters
    ----------
    frame : DataFrame
    level : int or level name, default -1
        Column level to stack; resolved through
        ``frame.columns._get_level_number``.
    dropna : boolean, default True
        Drop null entries from the result.

    Returns
    -------
    stacked : Series

    Raises
    ------
    ValueError
        If the columns are a MultiIndex and ``level`` refers to a name that
        is not unique among the column level names.
    """

    def factorize(index):
        # Return (levels, labels) for ``index``: a unique index is its own
        # level set with positional labels, otherwise go through Categorical.
        if index.is_unique:
            return index, np.arange(len(index))
        cat = Categorical(index, ordered=True)
        return cat.categories, cat.codes

    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        if frame.columns._reference_duplicate_name(level):
            msg = ("Ambiguous reference to {0}. The column "
                   "names are not unique.".format(level))
            raise ValueError(msg)

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        # extend the existing row levels/labels with one level for the columns
        new_levels = list(frame.index.levels)
        new_labels = [lab.repeat(K) for lab in frame.index.labels]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_labels.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names, verify_integrity=False)
    else:
        # plain index and plain columns: build a two-level index from both
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        labels = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, labels=labels,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]
    return Series(new_values, index=new_index)
Exemplo n.º 31
0
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    innermost level of the resulting hierarchical index

    Parameters
    ----------
    frame : DataFrame
    level : int or level name, default -1
        Column level to stack; negative integers count from the last level.
    dropna : boolean, default True
        Drop null entries from the result.

    Returns
    -------
    stacked : Series
    """
    N, K = frame.shape
    if isinstance(level, int) and level < 0:
        level += frame.columns.nlevels

    level = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        # forward the caller's dropna instead of always forcing True
        return _stack_multi_columns(frame, level=level, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        # extend the existing row levels with one more level for the columns
        new_levels = list(frame.index.levels)
        new_levels.append(frame.columns)

        # each row label is repeated once per column; column positions cycle
        new_labels = [lab.repeat(K) for lab in frame.index.labels]
        new_labels.append(np.tile(np.arange(K), N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names)
    else:
        ilabels = np.arange(N).repeat(K)
        clabels = np.tile(np.arange(K), N).ravel()
        new_index = MultiIndex(levels=[frame.index, frame.columns],
                               labels=[ilabels, clabels],
                               names=[frame.index.name, frame.columns.name])

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]
    return Series(new_values, index=new_index)
Exemplo n.º 32
0
    def dropna(self, axis=0, how='any', inplace=False, **kwargs):
        """
        Drop 2D from panel, holding passed axis constant

        Parameters
        ----------
        axis : int, default 0
            Axis to hold constant. E.g. axis=1 will drop major_axis entries
            having a certain amount of NA data
        how : {'all', 'any'}, default 'any'
            'any': drop an entry when one or more of its values are NA.
            'all': drop it only when all of its values are NA.
        inplace : bool, default False
            If True, do operation inplace and return None.

        Returns
        -------
        dropped : Panel
        """
        axis = self._get_axis_number(axis)

        values = self.values
        n_valid = com.notnull(values)

        # sum out every axis but ``axis``, highest axis number first, which
        # leaves the count of non-null cells per entry along ``axis``
        collapse = [ax for ax in range(self._AXIS_LEN) if ax != axis]
        for ax in reversed(collapse):
            n_valid = n_valid.sum(ax)

        # number of cells in one entry along ``axis``
        per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:])

        if how == 'all':
            keep = n_valid > 0
        else:
            keep = n_valid == per_slice

        result = self.reindex_axis(self._get_axis(axis)[keep], axis=axis)
        if not inplace:
            return result
        self._update_inplace(result)
Exemplo n.º 33
0
    def _format_strings(self, use_unicode=False):
        """
        Return the formatted string for every element of ``self.values``.

        Non-null floats go through the float format (``self.float_format``,
        else the ``print_config`` one, else a ``%g`` format built from
        ``print_config.precision``).  Everything else is rendered through
        ``self.formatter`` -- defaulting to ``_stringify`` with the configured
        encoding when ``use_unicode`` is set, else ``str`` -- and prefixed
        with one space.  Nulls become ``self.na_rep`` (or 'None' for ``None``)
        when ``na_rep`` is set.
        """
        float_format = self.float_format
        if float_format is None:
            float_format = print_config.float_format
            if float_format is None:
                fmt_str = '%% .%dg' % print_config.precision
                float_format = lambda x: fmt_str % x

        if self.formatter is not None:
            formatter = self.formatter
        elif use_unicode:
            def formatter(x):
                return _stringify(x, print_config.encoding)
        else:
            formatter = str

        def _format(x):
            if self.na_rep is None or not lib.checknull(x):
                # object dtype
                return '%s' % formatter(x)
            if x is None:
                return 'None'
            return self.na_rep

        vals = self.values
        is_float = lib.map_infer(vals, com.is_float) & notnull(vals)

        # every non-float entry gets the same leading space, whether or not
        # any float is present
        fmt_values = []
        for i, v in enumerate(vals):
            if is_float[i]:
                fmt_values.append(float_format(v))
            else:
                fmt_values.append(' %s' % _format(v))
        return fmt_values
Exemplo n.º 34
0
    def to_frame(self, filter_observations=True):
        """
        Transform wide format into long (stacked) format as DataFrame

        Parameters
        ----------
        filter_observations : boolean, default True
            Drop (major, minor) pairs without a complete set of observations
            across all the items

        Returns
        -------
        y : DataFrame
            Columns are the items; the index is a (major, minor) MultiIndex
        """
        _, N, K = self.shape

        if not filter_observations:
            # keep all N * K pairs
            selector = slice(None, None)
        else:
            # keep the pairs that are non-null in every item
            complete = com.notnull(self.values).all(axis=0)
            selector = complete.ravel()

        data = dict((item, self[item].values.ravel()[selector])
                    for item in self.items)

        # major positions are repeated K times each, minor positions cycle
        major_labels = np.arange(N).repeat(K)[selector]
        minor_labels = np.tile(np.arange(K), N)[selector]

        index = MultiIndex(levels=[self.major_axis, self.minor_axis],
                           labels=[major_labels, minor_labels],
                           names=['major', 'minor'])

        return DataFrame(data, index=index, columns=self.items)
Exemplo n.º 35
0
def _attempt_YYYYMMDD(arg, coerce):
    """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
        arg is a passed in as an object dtype, but could really be ints/strings with nan-like/or floats (e.g. with nan)

        Three interpretations are tried in turn: int-like values, floats
        containing NaN, and values containing NaT-like strings.  Returns the
        parsed result of the first that succeeds, or None when all fail. """
    def calc(carg):
        # calculate the actual result
        carg = carg.astype(object)
        return tslib.array_to_datetime(lib.try_parse_year_month_day(
            carg / 10000, carg / 100 % 100, carg % 100),
                                       coerce=coerce)

    def calc_with_mask(carg, mask):
        # parse only the entries selected by mask; the rest become NaT
        result = np.empty(carg.shape, dtype='M8[ns]')
        iresult = result.view('i8')
        iresult[~mask] = tslib.iNaT
        result[mask] = calc(carg[mask].astype(np.float64).astype(
            np.int64)).astype('M8[ns]')
        return result

    # Each attempt is best-effort, but only ordinary errors are suppressed:
    # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.

    # try intlike / strings that are ints
    try:
        return calc(arg.astype(np.int64))
    except Exception:
        pass

    # a float with actual np.nan
    try:
        carg = arg.astype(np.float64)
        return calc_with_mask(carg, com.notnull(carg))
    except Exception:
        pass

    # string with NaN-like
    try:
        mask = ~lib.ismember(arg, tslib._nat_strings)
        return calc_with_mask(arg, mask)
    except Exception:
        pass

    return None
Exemplo n.º 36
0
    def _reindex_columns(self, columns, copy, level, fill_value,
                         limit=None, takeable=False):
        """
        Build a SparseDataFrame restricted to ``columns``.

        Only the existing columns that appear in ``columns`` are carried
        over; the frame keeps ``self.index`` and ``self._default_fill_value``.
        ``copy`` and ``takeable`` are accepted but not used.

        Raises
        ------
        TypeError
            If ``level`` is given.
        NotImplementedError
            If ``fill_value`` is not null or ``limit`` is truthy.
        """
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')
        if com.notnull(fill_value):
            raise NotImplementedError
        if limit:
            raise NotImplementedError

        # TODO: fill value handling
        kept = {}
        for key, column in compat.iteritems(self):
            if key in columns:
                kept[key] = column

        return SparseDataFrame(kept, index=self.index, columns=columns,
                               default_fill_value=self._default_fill_value)
Exemplo n.º 37
0
def value_counts(values, sort=True, ascending=False):
    """
    Count how often each non-null value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
        Converted with ``np.asarray`` first
    sort : boolean, default True
        Sort the resulting counts
    ascending : boolean, default False
        Sort in ascending order; otherwise the sorted result is reversed

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    data = np.asarray(values)

    if com.is_integer_dtype(data.dtype):
        # integer input goes through the dedicated int64 counting routine
        keys, counts = lib.value_count_int64(com._ensure_int64(data))
        result = Series(counts, index=keys)
    else:
        # generic path: tally the non-null entries one by one
        tallies = defaultdict(int)
        for entry in data[com.notnull(data)]:
            tallies[entry] += 1
        result = Series(tallies)

    if not sort:
        return result

    result.sort()
    if ascending:
        return result
    return result[::-1]
Exemplo n.º 38
0
    def _format_strings(self):
        """
        Return the formatted string for every element of ``self.values``.

        Non-null floats go through the float format (``self.float_format``,
        else the "display.float_format" option, else a ``%g`` format built
        from the "display.precision" option).  Everything else is rendered
        through ``self.formatter`` -- defaulting to ``com.pprint_thing`` with
        tab, carriage return and newline escaped -- and prefixed with one
        space.  Nulls become ``self.na_rep`` (or 'None' for ``None``) when
        ``na_rep`` is set.
        """
        float_format = self.float_format
        if float_format is None:
            float_format = get_option("display.float_format")
            if float_format is None:
                fmt_str = '%% .%dg' % get_option("display.precision")
                float_format = lambda x: fmt_str % x

        if self.formatter is None:
            def formatter(x):
                return com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))
        else:
            formatter = self.formatter

        def _format(x):
            is_na = self.na_rep is not None and lib.checknull(x)
            if not is_na:
                # object dtype
                return '%s' % formatter(x)
            return 'None' if x is None else self.na_rep

        vals = self.values
        is_float = lib.map_infer(vals, com.is_float) & notnull(vals)

        # every non-float entry gets the same leading space, whether or not
        # any float is present
        fmt_values = []
        for flag, v in zip(is_float, vals):
            fmt_values.append(float_format(v) if flag else ' %s' % _format(v))
        return fmt_values
Exemplo n.º 39
0
    def asOf(self, date):
        """
        Return the value at ``date``, or the last good (non-NaN) value
        before it when that value is NaN.

        If there is no good value, NaN is returned.

        Parameters
        ----------
        date : datetime or similar value
            Strings are converted with ``datetools.to_datetime``

        Notes
        -----
        Dates are assumed to be sorted

        Returns
        -------
        value or NaN
        """
        if isinstance(date, basestring):
            date = datetools.to_datetime(date)

        value = self.get(date)
        if not isnull(value):
            return value

        # search only among the dates that carry a non-null value
        good_dates = self.index[notnull(self)]
        pos = good_dates.searchsorted(date)
        if pos > 0:
            return self.get(good_dates[pos - 1])
        return NaN
Exemplo n.º 40
0
 def _format_with(self, fmt_str):
     """
     Format every entry of ``self.values`` with ``fmt_str`` (nulls become
     ``self.na_rep``) and pass the list through ``_trim_zeros``.
     """
     def _render(x):
         return fmt_str % x if notnull(x) else self.na_rep

     formatted = [_render(x) for x in self.values]
     return _trim_zeros(formatted, self.na_rep)
Exemplo n.º 41
0
    def to_frame(self, filter_observations=True):
        """
        Transform wide format into long (stacked) format as DataFrame whose
        columns are the Panel's items and whose index is a MultiIndex formed
        of the Panel's major and minor axes.

        Parameters
        ----------
        filter_observations : boolean, default True
            Drop (major, minor) pairs without a complete set of observations
            across all the items

        Returns
        -------
        y : DataFrame
        """
        _, N, K = self.shape

        if filter_observations:
            # shaped like the return DataFrame
            mask = com.notnull(self.values).all(axis=0)
            # size = mask.sum()
            selector = mask.ravel()
        else:
            # size = N * K
            selector = slice(None, None)

        # one flattened, filtered column per item
        data = {}
        for item in self.items:
            data[item] = self[item].values.ravel()[selector]

        def construct_multi_parts(idx, n_repeat, n_shuffle=1):
            # Labels/levels/names for an axis that is itself a MultiIndex;
            # the expansion is delegated to idx.to_hierarchical.
            axis_idx = idx.to_hierarchical(n_repeat, n_shuffle)
            labels = [x[selector] for x in axis_idx.labels]
            levels = axis_idx.levels
            names = axis_idx.names
            return labels, levels, names

        def construct_index_parts(idx, major=True):
            # Labels/levels/names for a flat axis.  Major positions are
            # repeated K times each; minor positions cycle N times.
            levels = [idx]
            if major:
                labels = [np.arange(N).repeat(K)[selector]]
                names = idx.name or 'major'
            else:
                labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
                labels = [labels.ravel()[selector]]
                names = idx.name or 'minor'
            names = [names]
            return labels, levels, names

        if isinstance(self.major_axis, MultiIndex):
            major_labels, major_levels, major_names = construct_multi_parts(
                self.major_axis, n_repeat=K)
        else:
            major_labels, major_levels, major_names = construct_index_parts(
                self.major_axis)

        if isinstance(self.minor_axis, MultiIndex):
            minor_labels, minor_levels, minor_names = construct_multi_parts(
                self.minor_axis, n_repeat=N, n_shuffle=K)
        else:
            minor_labels, minor_levels, minor_names = construct_index_parts(
                self.minor_axis, major=False)

        # major parts come first, minor parts after them
        levels = major_levels + minor_levels
        labels = major_labels + minor_labels
        names = major_names + minor_names

        index = MultiIndex(levels=levels, labels=labels,
                           names=names, verify_integrity=False)

        return DataFrame(data, index=index, columns=self.items)
Exemplo n.º 42
0
def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict or sequence of (new_name, list_of_columns) pairs
        {new_name : list_of_columns}. Every list must have the same length;
        the i-th column of each list ends up in the i-th stacked chunk.
    dropna : boolean, default True
        Drop every stacked row in which any of the new columns is null.
    label : object, optional
        Accepted but not referenced by the implementation.

    Examples
    --------
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'],
                           'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2007
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame
        Identifier columns (those not named in ``groups``) followed by the
        new stacked columns.

    Raises
    ------
    ValueError
        If the column lists in ``groups`` do not all have the same length.
    """
    if isinstance(groups, dict):
        # Materialise as lists: on Python 3 the dict views cannot be indexed
        # (``values[0]`` below) and keys/values must stay aligned.
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.diff(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    # Stack the source columns of each group end to end.
    for target, names in zip(keys, values):
        mdata[target] = com._concat_compat([data[col].values for col in names])
        pivot_cols.append(target)

    # Identifier columns are repeated once per stacked chunk.
    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            # ``items`` exists on both Python 2 and 3, unlike ``iteritems``.
            mdata = dict((k, v[mask]) for k, v in mdata.items())

    return DataFrame(mdata, columns=id_cols + pivot_cols)
Exemplo n.º 43
0
def _guess_datetime_format_for_array(arr, **kwargs):
    """
    Guess a datetime format from the first non-null element of ``arr``.

    Extra keyword arguments are forwarded to ``_guess_datetime_format``.
    Returns None when ``arr`` contains no non-null element.
    """
    valid_positions = com.notnull(arr).nonzero()[0]
    if not len(valid_positions):
        return None
    first_valid = arr[valid_positions[0]]
    return _guess_datetime_format(first_valid, **kwargs)
Exemplo n.º 44
0
def test_notnull():
    """A plain float is not null; None, NaN and both infinities are."""
    assert notnull(1.)
    for missing in (None, np.NaN, np.inf, -np.inf):
        assert not notnull(missing)
Exemplo n.º 45
0
def remove_na(arr):
    """
    Select the entries of ``arr`` that ``notnull`` reports as present.

    The result may be empty.
    """
    keep = notnull(arr)
    return arr[keep]
Exemplo n.º 46
0
def test_isnull_datetime():
    """A concrete datetime value must never be reported as missing."""
    stamp = datetime.now()
    assert notnull(stamp)
    assert not isnull(stamp)
Exemplo n.º 47
0
def scatter_matrix_all(frame,
                       alpha=0.5,
                       figsize=None,
                       grid=False,
                       diagonal='hist',
                       marker='.',
                       density_kwds=None,
                       hist_kwds=None,
                       range_padding=0.05,
                       **kwds):
    """
    Draw a matrix of pairwise plots over all columns of ``frame``.

    Columns returned by ``frame._get_numeric_data()`` are treated as
    numerical, every other column as categorical.  With ``a`` the row
    variable and ``b`` the column variable, each cell shows:

    * diagonal, numerical: histogram or kernel density estimate
    * diagonal, categorical: bar chart of the value counts
    * two numerical, below the diagonal: scatter plot plus a lowess
      curve drawn in red
    * two numerical, above the diagonal: the coefficient from
      ``DataFrame.corr()`` written as text
    * two categorical, below the diagonal: mosaic plot
    * one of each, below the diagonal: box plots of the numerical
      variable grouped by the categorical one

    Remaining cells above the diagonal are left empty.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        transparency passed to the scatter plots
    figsize : (float, float), optional
        passed to ``plt.subplots``
    grid : bool, optional
        accepted but not used by this function
    diagonal : {'hist', 'kde', 'density'}
        plot drawn on the diagonal for numerical columns
    marker : str, optional
        marker type for the scatter plots, default '.'
    density_kwds : dict, optional
        passed to the line plot of the density estimate
    hist_kwds : dict, optional
        passed to ``hist``
    range_padding : float, optional
        relative extension of the axis range with respect to
        (max - min), default 0.05
    kwds : other plotting keyword arguments
        passed to ``scatter``

    Returns
    -------
    fig : the matplotlib figure holding the n x n grid of axes
    """

    import matplotlib.pyplot as plt
    from matplotlib.artist import setp
    import pandas.core.common as com
    from pandas.compat import range, lrange, zip
    from statsmodels.nonparametric.smoothers_lowess import lowess
    import numpy as np

    df = frame
    num_cols = frame._get_numeric_data().columns.values
    n = df.columns.size
    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=figsize, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = com.notnull(df)
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Padded (min, max) per column; for categorical columns the range of
    # the value counts is stored, numerical ones use their non-null values.
    boundaries_list = []
    for a in df.columns:
        if a in num_cols:
            values = df[a].values[mask[a].values]
        else:
            values = df[a].value_counts()
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            ax = axes[i, j]

            if i == j:
                if a in num_cols:  # numerical variable
                    values = df[a].values[mask[a].values]
                    # Deal with the diagonal by drawing a histogram there.
                    if diagonal == 'hist':
                        ax.hist(values, **hist_kwds)
                    elif diagonal in ('kde', 'density'):
                        from scipy.stats import gaussian_kde
                        y = values
                        gkde = gaussian_kde(y)
                        ind = np.linspace(y.min(), y.max(), 1000)
                        ax.plot(ind, gkde.evaluate(ind), **density_kwds)
                    ax.set_xlim(boundaries_list[i])
                else:  # categorical variable
                    values = df[a].value_counts()
                    ax.bar(list(range(df[a].nunique())), values)
            else:
                # rows where both variables are present
                common = (mask[a] & mask[b]).values
                # two numerical variables
                if a in num_cols and b in num_cols:
                    if i > j:
                        ax.scatter(df[b][common],
                                   df[a][common],
                                   marker=marker,
                                   alpha=alpha,
                                   **kwds)
                        # The following 2 lines add the lowess smoothing
                        ys = lowess(df[a][common], df[b][common])
                        ax.plot(ys[:, 0], ys[:, 1], 'red')
                    else:
                        pearR = df[[a, b]].corr()
                        # Purely positional lookup of the off-diagonal
                        # entry; chained ``iloc[0][1]`` would fall back to
                        # label lookup for integer column names.
                        ax.text(df[b].min(), df[a].min(),
                                'r = %.4f' % (pearR.iloc[0, 1]))
                    ax.set_xlim(boundaries_list[j])
                    ax.set_ylim(boundaries_list[i])
                # two categorical variables
                elif a not in num_cols and b not in num_cols:
                    if i > j:
                        from statsmodels.graphics import mosaicplot
                        mosaicplot.mosaic(df, [b, a],
                                          ax,
                                          labelizer=lambda k: '')
                # one numerical variable and one categorical variable
                else:
                    if i > j:
                        # ``df[[a, b]]`` is already a DataFrame, so no
                        # module-level ``pd`` name is required here.
                        pair = df[[a, b]]
                        if a in num_cols:
                            # group once and reuse for labels and values
                            grouped = list(pair.groupby(b))
                            label = [k for k, v in grouped]
                            values = [v[a].tolist() for k, v in grouped]
                            ax.boxplot(values, labels=label)
                        else:
                            grouped = list(pair.groupby(a))
                            label = [k for k, v in grouped]
                            values = [v[b].tolist() for k, v in grouped]
                            ax.boxplot(values, labels=label, vert=False)

            ax.set_xlabel('')
            ax.set_ylabel('')

            _label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
            _label_axis(ax, kind='y', label=a, position='left')

            # only the left column and the bottom row keep visible axes
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)
    return fig
Exemplo n.º 48
0
def scatter_matrix_lowess1(frame,
                           alpha=0.5,
                           figsize=None,
                           grid=False,
                           diagonal='hist',
                           marker='.',
                           density_kwds=None,
                           hist_kwds=None,
                           range_padding=0.05,
                           **kwds):
    """
    Scatter-plot matrix of the numeric columns with a lowess curve per cell.

    Adapted from the pandas ``scatter_matrix`` function: every off-diagonal
    cell gets a scatter plot overlaid with a red lowess smoother, and every
    diagonal cell a histogram or kernel density estimate.

    Parameters
    ----------
    frame : DataFrame
        only ``frame._get_numeric_data()`` is plotted
    alpha : float, optional
        amount of transparency applied to the scatter plots
    figsize : (float,float), optional
        a tuple (width, height) in inches
    grid : bool, optional
        accepted but not used by this function
    diagonal : {'hist', 'kde', 'density'}
        histogram or Kernel Density Estimation plot in the diagonal
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    fig : the matplotlib figure holding the grid of axes

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix_lowess1(df, alpha=0.2)
    """

    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.artist import setp
    import pandas.core.common as com
    from pandas.compat import range, lrange, lmap, map, zip
    from statsmodels.nonparametric.smoothers_lowess import lowess

    numeric = frame._get_numeric_data()
    columns = numeric.columns
    size = columns.size
    fig, axes = plt.subplots(nrows=size, ncols=size, figsize=figsize,
                             squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    present = com.notnull(numeric)

    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # padded (low, high) axis limits, one pair per column
    limits = []
    for name in columns:
        observed = numeric[name].values[present[name].values]
        low, high = np.min(observed), np.max(observed)
        pad = (high - low) * range_padding / 2.
        limits.append((low - pad, high + pad))

    for row, ycol in enumerate(columns):
        for col, xcol in enumerate(columns):
            cell = axes[row, col]

            if row != col:
                # rows where both variables are present
                both = (present[ycol] & present[xcol]).values

                cell.scatter(numeric[xcol][both],
                             numeric[ycol][both],
                             marker=marker,
                             alpha=alpha,
                             **kwds)
                # lowess smoothing of y against x, drawn on top
                smooth = lowess(numeric[ycol][both], numeric[xcol][both])
                cell.plot(smooth[:, 0], smooth[:, 1], 'red', linewidth=1)

                cell.set_xlim(limits[col])
                cell.set_ylim(limits[row])
            else:
                observed = numeric[ycol].values[present[ycol].values]

                # the diagonal shows the distribution of a single variable
                if diagonal == 'hist':
                    cell.hist(observed, **hist_kwds)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    estimator = gaussian_kde(observed)
                    support = np.linspace(observed.min(), observed.max(),
                                          1000)
                    cell.plot(support, estimator.evaluate(support),
                              **density_kwds)

                cell.set_xlim(limits[row])

            cell.set_xlabel('')
            cell.set_ylabel('')

            _label_axis(cell, kind='x', label=xcol, position='bottom',
                        rotate=True)
            _label_axis(cell, kind='y', label=ycol, position='left')

            # only the left column and the bottom row keep visible axes
            if col != 0:
                cell.yaxis.set_visible(False)
            if row != size - 1:
                cell.xaxis.set_visible(False)

    for cell in axes.flat:
        setp(cell.get_xticklabels(), fontsize=8)
        setp(cell.get_yticklabels(), fontsize=8)
    return fig
Exemplo n.º 49
0
 def get_median(x):
     # Median of the non-null entries of x; NaN when nulls are present and
     # the enclosing scope's ``skipna`` is false.
     valid = notnull(x)
     if skipna or valid.all():
         return algos.median(x[valid])
     return np.nan
Exemplo n.º 50
0
    def na_op(x, y):
        """
        Compare ``x`` with ``y`` element-wise and return the result.

        ``op`` (the comparison callable) and ``name`` (the matching method
        name, e.g. '__ne__') are not defined here; presumably they are
        supplied by the enclosing scope.

        Raises TypeError("invalid type comparison") for a datetime-like
        versus numeric comparison, or when the comparison method returns
        NotImplemented.
        """

        # dispatch to the categorical if we have a categorical
        # in either operand
        if is_categorical_dtype(x):
            return op(x,y)
        elif is_categorical_dtype(y) and not isscalar(y):
            return op(y,x)

        if is_object_dtype(x.dtype):
            # object arrays go through the lib comparison helpers
            if isinstance(y, list):
                y = lib.list_to_object_array(y)

            if isinstance(y, (np.ndarray, pd.Series)):
                # array vs array: make sure both sides are object dtype
                if not is_object_dtype(y.dtype):
                    result = lib.vec_compare(x, y.astype(np.object_), op)
                else:
                    result = lib.vec_compare(x, y, op)
            else:
                # array vs anything else
                result = lib.scalar_compare(x, y, op)
        else:

            # we want to compare like types
            # we only want to convert to integer like if
            # we are not NotImplemented, otherwise
            # we would allow datetime64 (but viewed as i8) against
            # integer comparisons
            if is_datetimelike_v_numeric(x, y):
                raise TypeError("invalid type comparison")

            # numpy does not like comparisons vs None
            if isscalar(y) and isnull(y):
                y = np.nan

            # we have a datetime/timedelta and may need to convert
            mask = None
            if needs_i8_conversion(x) or (not isscalar(y) and needs_i8_conversion(y)):

                # bring y to the same integer representation as x
                if isscalar(y):
                    y = _index.convert_scalar(x,_values_from_object(y))
                else:
                    y = y.view('i8')

                # The mask must be computed before x is viewed as i8 below.
                # NOTE(review): for '__ne__' the mask selects the NON-null
                # positions of x, and masked positions are set to False
                # further down -- confirm this is the intended result.
                if name == '__ne__':
                    mask = notnull(x)
                else:
                    mask = isnull(x)

                x = x.view('i8')

            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                # x has no method called `name`; fall back to the callable
                result = op(x, y)

            # masked positions always compare False
            if mask is not None and mask.any():
                result[mask] = False

        return result
Exemplo n.º 51
0
def test_notnull():
    """
    Check ``notnull`` on scalars, arrays and Series, with the
    ``mode.use_inf_as_null`` option both disabled and enabled.
    """
    assert notnull(1.)
    for missing in (None, np.NaN):
        assert not notnull(missing)

    with_infs = np.array([1.5, np.inf, 3.5, -np.inf])

    # infinities count as valid values while the option is off
    with cf.option_context("mode.use_inf_as_null", False):
        for infinity in (np.inf, -np.inf):
            assert notnull(infinity)
        assert notnull(with_infs).all()

    # ... and as nulls once it is switched on
    with cf.option_context("mode.use_inf_as_null", True):
        for infinity in (np.inf, -np.inf):
            assert not notnull(infinity)
        assert notnull(with_infs).sum() == 2

    # a Series goes in, a Series comes out, whatever its dtype
    with cf.option_context("mode.use_inf_as_null", False):
        candidates = (Series(np.random.randn(5)),
                      Series(np.random.randn(5), dtype=object))
        for series in candidates:
            assert isinstance(notnull(series), Series)
Exemplo n.º 52
0
 def _valid_sp_values(self):
     # the entries of self.sp_values that com.notnull reports as present
     values = self.sp_values
     return values[com.notnull(values)]
Exemplo n.º 53
0
 def get_median(x):
     # Median of the non-null entries of x (unwrapped through
     # _values_from_object); NaN when nulls are present and the enclosing
     # scope's ``skipna`` is false.
     valid = notnull(x)
     if skipna or valid.all():
         return algos.median(_values_from_object(x[valid]))
     return np.nan
Exemplo n.º 54
0
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  retbins=False,
                  precision=3,
                  name=None):
    """
    Assign every value of ``x`` to one of the intervals delimited by ``bins``.

    Parameters
    ----------
    x : array-like
        Values to bin; converted with ``np.asarray``.  If it is a Series and
        ``name`` is None, the Series name is used instead.
    bins : array with a ``searchsorted`` method
        Bin edges (assumed to be sorted ascending -- not checked here).
    right : bool, default True
        True gives intervals closed on the right, '(a, b]'; False gives
        intervals closed on the left, '[a, b)'.
    labels : sequence, False or None
        None generates interval labels with ``_format_label``; a sequence
        supplies one label per bin; False returns the bare bin codes
        instead of a Factor.
    retbins : bool, default False
        Also return ``bins``.
    precision : int, default 3
        Forwarded to ``_format_label`` when labels are generated.
    name : object, optional
        Name given to the resulting Factor.

    Returns
    -------
    fac : Factor or ndarray
        With ``labels=False`` an array of zero-based bin codes; it is a
        float array with NaN at the null positions when ``x`` has nulls.
    bins : only when ``retbins`` is True

    Raises
    ------
    ValueError
        If a non-null value falls outside the bins, or if the number of
        labels is not ``len(bins) - 1``.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    # Only non-null values may trigger the out-of-range errors.
    valid = com.notnull(x)
    above = valid & (ids == len(bins))
    below = valid & (ids == 0)

    if above.any():
        raise ValueError('Values fall past last bin: %s' % str(x[above]))

    if below.any():
        raise ValueError('Values fall before first bin: %s' % str(x[below]))

    mask = com.isnull(x)
    has_nas = mask.any()

    if labels is not False:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            template = '(%s, %s]' if right else '[%s, %s)'
            levels = [template % (fmt(a), fmt(b))
                      for a, b in zip(bins, bins[1:])]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)

        if has_nas:
            # id 0 becomes code -1 after the shift below
            np.putmask(ids, mask, 0)

        fac = Factor(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            # Convert the already shifted codes; converting ``ids`` here
            # would leave every non-null code one too high.
            fac = fac.astype(np.float64)
            np.putmask(fac, mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
Exemplo n.º 55
0
    def plots_2D_vs_response(self, range_padding=0.05):
        """
        Make 2D correlation plots (hexbin) of every numerical column against
        the output variable 'Response' (from the Prudential kaggle
        competition).

        Strongly inspired by / code copied from scatter_matrix() in
        pandas/tools/plotting.py.  Each figure holds two panels side by
        side: plain counts on the left and log-scaled counts on the right.

        For every plotted column a ``fig_summary`` is appended to
        ``self.list_fig_summary`` and the figure is saved as
        ``self._output_dir + self._rel_dir + "<column>_Response.png"``.
        The columns 'Response' and 'Unnamed: 0' are skipped.

        Parameters
        ----------
        range_padding : float, optional
            relative extension of the axis range with respect to
            (max - min), default 0.05
        """

        df = self._df._get_numeric_data()

        mask = com.notnull(df)

        j = -1

        boundaries_list = []
        nbins_list = []
        for a in df.columns:
            values = df[a].values[mask[a].values]
            rmin_, rmax_ = np.min(values), np.max(values)
            rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
            boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
            # one bin per distinct value, capped at 10
            nbins_list.append(min(len(df[a].unique()), 10))
            if a == 'Response':
                # j is used below to access the boundaries_list variable
                j = len(boundaries_list) - 1

        if j < 0: print("Error: Response-variable not found")

        kwds = {'bins': 'log'}

        for i, a in enumerate(df.columns):
            if a in ('Response', 'Unnamed: 0'):
                continue
            fs = fig_summary()
            # http://matplotlib.org/examples/pylab_examples/subplots_demo.html
            fig, axes = plt.subplots(1, 2)
            plt.subplots_adjust(wspace=0.4)
            # rows where both this column and the response are present
            common = (mask[a] & mask['Response']).values
            for k, ax in enumerate(axes):
                # left panel: linear counts; right panel: log counts
                extra = {} if k == 0 else kwds
                img = ax.hexbin(df['Response'][common],
                                df[a][common],
                                gridsize=(nbins_list[j], nbins_list[i]),
                                cmap=plt.cm.Blues_r,
                                **extra)
                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])
                ax.set_xlabel('Response')
                ax.set_ylabel(a)
                cb = plt.colorbar(img, ax=ax)
                if k == 0: cb.set_label('entries')
                else: cb.set_label('log(entries)')

            fs.xvar = 'Response'
            fs.yvar = a
            fs.label = "%s_%s" % (a, 'Response')
            fs.fig_path = self._output_dir
            fs.fig_rel_path = self._rel_dir + fs.label + ".png"

            self.list_fig_summary.append(fs)
            print("figure made: ", fs.fig_path + fs.fig_rel_path)
            fig.savefig(fs.fig_path + fs.fig_rel_path)
            # One figure is created per column; close it once written so
            # the open figures do not pile up in memory.
            plt.close(fig)
Exemplo n.º 56
0
def scatter_matrix(frame,
                   alpha=0.5,
                   figsize=None,
                   ax=None,
                   grid=False,
                   diagonal='hist',
                   marker='.',
                   **kwds):
    """
    Draw a matrix of scatter plots.

    Parameters
    ----------
    frame : DataFrame
        only ``frame._get_numeric_data()`` is plotted
    alpha : amount of transparency applied
    figsize : a tuple (width, height) in inches
    ax : Matplotlib axis object
    grid : setting this to True will show the grid
    diagonal : pick between 'kde' and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the diagonal
    marker : Matplotlib marker type for the scatter plots, default '.'
    kwds : other plotting keyword arguments
        To be passed to scatter function (they are also passed to the
        line plot of the density estimate)

    Returns
    -------
    axes : the grid of axes returned by ``_subplots``, indexed as
        ``axes[row, column]``

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """
    df = frame._get_numeric_data()
    n = df.columns.size
    fig, axes = _subplots(nrows=n,
                          ncols=n,
                          figsize=figsize,
                          ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = com.notnull(df)

    # The tick labels are taken from the frame's index and are the same for
    # every cell, so they are prepared once.
    ticks = df.index
    is_datetype = ticks.inferred_type in ('datetime', 'date',
                                          'datetime64')
    if ticks.is_numeric() or is_datetype:
        # Matplotlib supports numeric values or datetime objects as
        # xaxis values. Taking LBYL approach here, by the time
        # matplotlib raises exception when using non numeric/datetime
        # values for xaxis, several actions are already taken by plt.
        ticks = ticks._mpl_repr()

    for i, a in zip(range(n), df.columns):
        for j, b in zip(range(n), df.columns):
            cell = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    cell.hist(values)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    cell.plot(ind, gkde.evaluate(ind), **kwds)
            else:
                # rows where both variables are present
                common = (mask[a] & mask[b]).values

                cell.scatter(df[b][common],
                             df[a][common],
                             marker=marker,
                             alpha=alpha,
                             **kwds)

            cell.set_xlabel('')
            cell.set_ylabel('')
            cell.set_xticklabels([])
            cell.set_yticklabels([])

            # setup labels: they alternate between opposite edges of the
            # grid so that neighbouring labels do not collide
            if i == 0 and j % 2 == 1:
                cell.set_xlabel(b, visible=True)
                cell.set_xticklabels(ticks)
                cell.xaxis.set_ticks_position('top')
                cell.xaxis.set_label_position('top')
            if i == n - 1 and j % 2 == 0:
                cell.set_xlabel(b, visible=True)
                cell.set_xticklabels(ticks)
                cell.xaxis.set_ticks_position('bottom')
                cell.xaxis.set_label_position('bottom')
            if j == 0 and i % 2 == 0:
                cell.set_ylabel(a, visible=True)
                cell.set_yticklabels(ticks)
                cell.yaxis.set_ticks_position('left')
                cell.yaxis.set_label_position('left')
            if j == n - 1 and i % 2 == 1:
                cell.set_ylabel(a, visible=True)
                cell.set_yticklabels(ticks)
                cell.yaxis.set_ticks_position('right')
                cell.yaxis.set_label_position('right')

            # Positional on purpose: the keyword for this argument was
            # renamed from ``b`` to ``visible`` in newer Matplotlib.
            cell.grid(grid)

    return axes