Example #1
    def test_from_weekly_resampling(self):
        idxh = date_range('1/1/1999', periods=52, freq='W')
        idxl = date_range('1/1/1999', periods=12, freq='M')
        high = Series(np.random.randn(len(idxh)), idxh)
        low = Series(np.random.randn(len(idxl)), idxl)
        low.plot()
        ax = high.plot()

        expected_h = idxh.to_period().asi8.astype(np.float64)
        expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544,
                               1549, 1553, 1558, 1562], dtype=np.float64)
        for l in ax.get_lines():
            self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, idxh.freq)
            xdata = l.get_xdata(orig=False)
            if len(xdata) == 12:  # idxl lines
                self.assert_numpy_array_equal(xdata, expected_l)
            else:
                self.assert_numpy_array_equal(xdata, expected_h)
        tm.close()

        # tsplot
        from pandas.tseries.plotting import tsplot
        import matplotlib.pyplot as plt

        tsplot(low, plt.Axes.plot)
        lines = tsplot(high, plt.Axes.plot)
        for l in lines:
            self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, idxh.freq)
            xdata = l.get_xdata(orig=False)
            if len(xdata) == 12:  # idxl lines
                self.assert_numpy_array_equal(xdata, expected_l)
            else:
                self.assert_numpy_array_equal(xdata, expected_h)
Example #2
    def test_coercion_with_loc_and_series(self):
        for start_data, expected_result in self.EXPECTED_RESULTS:
            start_series = Series(start_data)
            start_series.loc[start_series == start_series[0]] = None

            expected_series = Series(expected_result)
            tm.assert_series_equal(start_series, expected_series)
Example #3
    def test_join_aware(self):
        rng = date_range('1/1/2011', periods=10, freq='H')
        ts = Series(np.random.randn(len(rng)), index=rng)

        ts_utc = ts.tz_localize('utc')

        self.assertRaises(Exception, ts.__add__, ts_utc)
        self.assertRaises(Exception, ts_utc.__add__, ts)

        test1 = DataFrame(np.zeros((6,3)),
                          index=date_range("2012-11-15 00:00:00", periods=6,
                                           freq="100L", tz="US/Central"))
        test2 = DataFrame(np.zeros((3,3)),
                          index=date_range("2012-11-15 00:00:00", periods=3,
                                           freq="250L", tz="US/Central"),
                          columns=range(3,6))

        result = test1.join(test2, how='outer')
        ex_index = test1.index.union(test2.index)

        self.assertTrue(result.index.equals(ex_index))
        self.assertTrue(result.index.tz.zone == 'US/Central')

        # non-overlapping
        rng = date_range("2012-11-15 00:00:00", periods=6,
                         freq="H", tz="US/Central")

        rng2 = date_range("2012-11-15 12:00:00", periods=6,
                         freq="H", tz="US/Eastern")

        result = rng.union(rng2)
        self.assertTrue(result.tz.zone == 'UTC')
Example #4
def _wrap_results(result, dtype):
    """ wrap our results if needed """

    if issubclass(dtype.type, np.datetime64):
        if not isinstance(result, np.ndarray):
            result = lib.Timestamp(result)
        else:
            result = result.view(dtype)
    elif issubclass(dtype.type, np.timedelta64):
        if not isinstance(result, np.ndarray):

            # this is a scalar timedelta result!
            # convert via Series, then take the element (scalar);
            # Series does the right thing in py3 (and works around a numpy
            # 1.6.2 bug where the result dtype comes out as timedelta64[us])
            from pandas import Series

            # coerce a float result to int before boxing
            if is_float(result):
                result = int(result)
            result = Series([result], dtype='timedelta64[ns]')
        else:
            result = result.view(dtype)

    return result
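For orientation, a minimal sketch of the behavior this internal helper supports, using only the public pandas API: reductions over timedelta64 data come back as boxed scalars rather than raw integers.

import pandas as pd

s = pd.Series(pd.to_timedelta([1, 2, 3], unit='D'))
print(s.min())  # Timedelta('1 days 00:00:00') -- a boxed scalar, not an int
print(s.sum())  # Timedelta('6 days 00:00:00')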
Example #5
    def _delegate_property_get(self, name):
        from pandas import Series

        result = getattr(self.values, name)

        # maybe need to upcast (ints)
        if isinstance(result, np.ndarray):
            if is_integer_dtype(result):
                result = result.astype('int64')
        elif not is_list_like(result):
            return result

        result = np.asarray(result)

        # blow up if we operate on categories
        if self.orig is not None:
            result = take_1d(result, self.orig.cat.codes)

        # return the result as a Series, which is by definition a copy
        result = Series(result, index=self.index, name=self.name)

        # setting this object will show a SettingWithCopyWarning/Error
        result.is_copy = ("modifications to a property of a datetimelike "
                          "object are not supported and are discarded. "
                          "Change values on the original.")

        return result
Example #6
def test_get():
    # GH 6383
    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45,
                         51, 39, 55, 43, 54, 52, 51, 54]))

    result = s.get(25, 0)
    expected = 0
    assert result == expected

    s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
                         45, 51, 39, 55, 43, 54, 52, 51, 54]),
               index=pd.Float64Index(
                   [25.0, 36.0, 49.0, 64.0, 81.0, 100.0,
                    121.0, 144.0, 169.0, 196.0, 1225.0,
                    1296.0, 1369.0, 1444.0, 1521.0, 1600.0,
                    1681.0, 1764.0, 1849.0, 1936.0],
                   dtype='object'))

    result = s.get(25, 0)
    expected = 43
    assert result == expected

    # GH 7407
    # with a boolean accessor
    df = pd.DataFrame({'i': [0] * 3, 'b': [False] * 3})
    vc = df.i.value_counts()
    result = vc.get(99, default='Missing')
    assert result == 'Missing'

    vc = df.b.value_counts()
    result = vc.get(False, default='Missing')
    assert result == 3

    result = vc.get(True, default='Missing')
    assert result == 'Missing'
Example #7
    def test_tz_aware_asfreq(self):
        dr = date_range('2011-12-01', '2012-07-20', freq='D', tz='US/Eastern')

        s = Series(np.random.randn(len(dr)), index=dr)

        # it works!
        s.asfreq('T')
Example #8
    def test_mixed_freq_irreg_period(self):
        ts = tm.makeTimeSeries()
        irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]]
        rng = period_range('1/3/2000', periods=30, freq='B')
        ps = Series(np.random.randn(len(rng)), rng)
        irreg.plot()
        ps.plot()
Example #9
    def test_tz_aware_asfreq(self):
        dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=self.tzstr("US/Eastern"))

        s = Series(np.random.randn(len(dr)), index=dr)

        # it works!
        s.asfreq("T")
Example #10
    def test_constructor(self):
        assert self.ts.index.is_all_dates

        # Pass in Series
        derived = Series(self.ts)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, self.ts.index)
        # Ensure new index is not created
        assert id(self.ts.index) == id(derived.index)

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not self.empty.index.is_all_dates
        assert not Series({}).index.is_all_dates
        pytest.raises(Exception, Series, np.random.randn(3, 3),
                      index=np.arange(3))

        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        pytest.raises(NotImplementedError, Series, m)
Example #11
def quarter_plot(x, dates=None, ylabel=None, ax=None):
    """
    Seasonal plot of quarterly data

    Parameters
    ----------
    x : array-like
        Seasonal data to plot. If dates is None, x must be a pandas object
        with a PeriodIndex or DatetimeIndex with a monthly frequency.
    dates : array-like, optional
        If `x` is not a pandas object, then dates must be supplied.
    ylabel : str, optional
        The label for the y-axis. Will attempt to use the `name` attribute
        of the Series.
    ax : matplotlib.axes, optional
        Existing axes instance.

    Returns
    -------
    matplotlib.Figure
    """
    from pandas import DataFrame

    if dates is None:
        from statsmodels.tools.data import _check_period_index
        _check_period_index(x, freq="Q")
    else:
        from pandas import Series, PeriodIndex
        x = Series(x, index=PeriodIndex(dates, freq="Q"))

    xticklabels = ['q1', 'q2', 'q3', 'q4']
    return seasonal_plot(x.groupby(lambda y : y.quarter), xticklabels,
                         ylabel=ylabel, ax=ax)
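A minimal usage sketch, assuming this is the quarter_plot shipped as statsmodels.graphics.tsaplots.quarter_plot; the Series name feeds the default y-label:

import numpy as np
import pandas as pd
from statsmodels.graphics.tsaplots import quarter_plot

idx = pd.period_range('2000Q1', periods=40, freq='Q')
x = pd.Series(np.random.randn(40), index=idx, name='demand')
fig = quarter_plot(x)  # seasonal plot grouped by quarter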
Example #12
    def test_repr_unicode(self):
        s = Series([u'\u03c3'] * 10)
        repr(s)

        a = Series([u"\u05d0"] * 1000)
        a.name = 'title1'
        repr(a)
Example #13
    def test_constructor_series(self):
        index1 = ['d', 'b', 'a', 'c']
        index2 = sorted(index1)
        s1 = Series([4, 7, -5, 3], index=index1)
        s2 = Series(s1, index=index2)

        assert_series_equal(s2, s1.sort_index())
Example #14
    def test_getitem_setitem_datetime_tz_pytz(self):
        tm._skip_if_no_pytz()
        from pytz import timezone as tz

        from pandas import date_range

        N = 50
        # testing with timezone, GH #2785
        rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
        ts = Series(np.random.randn(N), index=rng)

        # also test Timestamp tz handling, GH #2789
        result = ts.copy()
        result["1990-01-01 09:00:00+00:00"] = 0
        result["1990-01-01 09:00:00+00:00"] = ts[4]
        assert_series_equal(result, ts)

        result = ts.copy()
        result["1990-01-01 03:00:00-06:00"] = 0
        result["1990-01-01 03:00:00-06:00"] = ts[4]
        assert_series_equal(result, ts)

        # repeat with datetimes
        result = ts.copy()
        result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0
        result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4]
        assert_series_equal(result, ts)

        result = ts.copy()

        # comparison dates with datetime MUST be localized!
        date = tz('US/Central').localize(datetime(1990, 1, 1, 3))
        result[date] = 0
        result[date] = ts[4]
        assert_series_equal(result, ts)
Example #15
    def test_shift_dst(self):
        # GH 13926
        dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern')
        s = Series(dates)

        res = s.shift(0)
        tm.assert_series_equal(res, s)
        self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')

        res = s.shift(1)
        exp_vals = [NaT] + dates.asobject.values.tolist()[:9]
        exp = Series(exp_vals)
        tm.assert_series_equal(res, exp)
        self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')

        res = s.shift(-2)
        exp_vals = dates.asobject.values.tolist()[2:] + [NaT, NaT]
        exp = Series(exp_vals)
        tm.assert_series_equal(res, exp)
        self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')

        for ex in [10, -10, 20, -20]:
            res = s.shift(ex)
            exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]')
            tm.assert_series_equal(res, exp)
            self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]')
Example #16
    def test_all_values_single_bin(self):
        # 2070
        index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
        s = Series(np.random.randn(len(index)), index=index)

        result = s.resample("A", how='mean')
        tm.assert_almost_equal(result[0], s.mean())
Example #17
    def test_tshift(self):
        # PeriodIndex
        ps = tm.makePeriodSeries()
        shifted = ps.tshift(1)
        unshifted = shifted.tshift(-1)

        assert_series_equal(unshifted, ps)

        shifted2 = ps.tshift(freq='B')
        assert_series_equal(shifted, shifted2)

        shifted3 = ps.tshift(freq=datetools.bday)
        assert_series_equal(shifted, shifted3)

        self.assertRaises(ValueError, ps.tshift, freq='M')

        # DatetimeIndex
        shifted = self.ts.tshift(1)
        unshifted = shifted.tshift(-1)

        assert_series_equal(self.ts, unshifted)

        shifted2 = self.ts.tshift(freq=self.ts.index.freq)
        assert_series_equal(shifted, shifted2)

        inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index)),
                             name='ts')
        shifted = inferred_ts.tshift(1)
        unshifted = shifted.tshift(-1)
        assert_series_equal(shifted, self.ts.tshift(1))
        assert_series_equal(unshifted, inferred_ts)

        no_freq = self.ts[[0, 5, 7]]
        self.assertRaises(ValueError, no_freq.tshift)
Example #18
    def test_upsample_with_limit(self):
        rng = date_range('1/1/2000', periods=3, freq='5t')
        ts = Series(np.random.randn(len(rng)), rng)

        result = ts.resample('t', fill_method='ffill', limit=2)
        expected = ts.reindex(result.index, method='ffill', limit=2)
        assert_series_equal(result, expected)
Example #19
    def test_quarterly_resampling(self):
        rng = period_range('2000Q1', periods=10, freq='Q-DEC')
        ts = Series(np.arange(10), index=rng)

        result = ts.resample('A')
        exp = ts.to_timestamp().resample('A').to_period()
        assert_series_equal(result, exp)
Example #20
    def test_append_concat(self):
        rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
        ts = Series(np.random.randn(len(rng)), rng)
        df = DataFrame(np.random.randn(len(rng), 4), index=rng)

        result = ts.append(ts)
        result_df = df.append(df)
        ex_index = DatetimeIndex(np.tile(rng.values, 2))
        tm.assert_index_equal(result.index, ex_index)
        tm.assert_index_equal(result_df.index, ex_index)

        appended = rng.append(rng)
        tm.assert_index_equal(appended, ex_index)

        appended = rng.append([rng, rng])
        ex_index = DatetimeIndex(np.tile(rng.values, 3))
        tm.assert_index_equal(appended, ex_index)

        # different index names
        rng1 = rng.copy()
        rng2 = rng.copy()
        rng1.name = 'foo'
        rng2.name = 'bar'
        assert rng1.append(rng1).name == 'foo'
        assert rng1.append(rng2).name is None
Example #21
def test_first_last_nth_dtypes(df_mixed_floats):

    df = df_mixed_floats.copy()
    df['E'] = True
    df['F'] = 1

    # tests for first / last / nth
    grouped = df.groupby('A')
    first = grouped.first()
    expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = lrange(10)
    idx.append(9)
    s = Series(data=lrange(11), index=idx, name='IntCol')
    assert s.dtype == 'int64'
    f = s.groupby(level=0).first()
    assert f.dtype == 'int64'
Example #22
def test_nat_operations():
    # GH 8617
    s = Series([0, pd.NaT], dtype='m8[ns]')
    exp = s[0]
    assert s.median() == exp
    assert s.min() == exp
    assert s.max() == exp
Example #23
    def test_endswith(self):
        values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])

        result = values.str.endswith('foo')
        exp = Series([False, NA, False, False, True, NA, True])
        tm.assert_series_equal(result, exp)

        # mixed
        mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
        rs = strings.str_endswith(mixed, 'f')
        xp = [False, NA, False, NA, NA, False, NA, NA, NA]
        tm.assert_almost_equal(rs, xp)

        rs = Series(mixed).str.endswith('f')
        tm.assert_isinstance(rs, Series)
        tm.assert_almost_equal(rs, xp)

        # unicode
        values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
                         u('foo')])

        result = values.str.endswith('foo')
        exp = Series([False, NA, False, False, True, NA, True])
        tm.assert_series_equal(result, exp)

        result = values.str.endswith('foo', na=False)
        tm.assert_series_equal(result, exp.fillna(False).astype(bool))
Example #24
    def test_valid_dt_with_missing_values(self):

        from datetime import date, time

        # GH 8689
        s = Series(date_range('20130101', periods=5, freq='D'))
        s.iloc[2] = pd.NaT

        for attr in ['microsecond', 'nanosecond', 'second', 'minute', 'hour',
                     'day']:
            expected = getattr(s.dt, attr).copy()
            expected.iloc[2] = np.nan
            result = getattr(s.dt, attr)
            tm.assert_series_equal(result, expected)

        result = s.dt.date
        expected = Series(
            [date(2013, 1, 1), date(2013, 1, 2), np.nan, date(2013, 1, 4),
             date(2013, 1, 5)], dtype='object')
        tm.assert_series_equal(result, expected)

        result = s.dt.time
        expected = Series(
            [time(0), time(0), np.nan, time(0), time(0)], dtype='object')
        tm.assert_series_equal(result, expected)
Example #25
from datetime import datetime

import numpy as np
from pandas import Series

def shiftTs():
    dates = [datetime(2014,1,2), datetime(2014,1,3), datetime(2014,1,4), datetime(2014,1,5)]
    ts1 = Series(np.arange(4)+2, index=dates)
    #ts1 = ts1/ts1.shift(1) - 1
    print(ts1)
    ts1 = ts1.shift(1, freq='M')  # with freq, shift moves the index labels, not the values
    print(ts1)
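Calling the sketch shows the effect of a freq-based shift (behavior assumed from pandas' MonthEnd offset semantics): the labels move, the values stay.

shiftTs()
# before the shift: labels 2014-01-02..2014-01-05, values 2..5
# after the shift:  every label rolls forward to 2014-01-31, values unchanged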
Example #26
    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit='ms')

        s = Series([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(s.dtype, 'timedelta64[ns]')

        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        s = Series([timedelta(23), timedelta(seconds=5)],
                   index=pd.Index([0, 1]))
        self.assertEqual(s.dtype, 'timedelta64[ns]')
        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
        assert_frame_equal(frame, pd.read_json(frame.to_json())
                           .apply(converter))

        frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
                           'b': [1, 2],
                           'c': pd.date_range(start='20130101', periods=2)})

        result = pd.read_json(frame.to_json(date_unit='ns'))
        result['a'] = pd.to_timedelta(result.a, unit='ns')
        result['c'] = pd.to_datetime(result.c)
        assert_frame_equal(frame, result)
Example #27
    def test_mixed_index_assignment(self):
        # GH 19860
        s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2])
        s.at['a'] = 11
        assert s.iat[0] == 11
        s.at[1] = 22
        assert s.iat[3] == 22
Example #28
    def test_constructor_with_datetimelike(self):

        # 12077
        # constructor with a datetimelike and NaT

        for dtl in [date_range('1995-01-01 00:00:00', periods=5, freq='s'),
                    date_range('1995-01-01 00:00:00', periods=5,
                               freq='s', tz='US/Eastern'),
                    timedelta_range('1 day', periods=5, freq='s')]:

            s = Series(dtl)
            c = Categorical(s)
            expected = type(dtl)(s)
            expected.freq = None
            tm.assert_index_equal(c.categories, expected)
            tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype='int8'))

            # with NaT
            s2 = s.copy()
            s2.iloc[-1] = NaT
            c = Categorical(s2)
            expected = type(dtl)(s2.dropna())
            expected.freq = None
            tm.assert_index_equal(c.categories, expected)

            exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
            tm.assert_numpy_array_equal(c.codes, exp)

            result = repr(c)
            assert 'NaT' in result
Example #29
    def test_annual_upsample(self):
        targets = ['D', 'B', 'M']

        for month in MONTHS:
            ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-%s' % month)

            for targ, conv, meth in product(targets, ['start', 'end'],
                                            ['ffill', 'bfill']):
                result = ts.resample(targ, fill_method=meth,
                                     convention=conv)
                expected = result.to_timestamp(targ, how=conv)
                expected = expected.asfreq(targ, meth).to_period()
                assert_series_equal(result, expected)

        df = DataFrame({'a': ts})
        rdf = df.resample('D', fill_method='ffill')
        exp = df['a'].resample('D', fill_method='ffill')
        assert_series_equal(rdf['a'], exp)

        rng = period_range('2000', '2003', freq='A-DEC')
        ts = Series([1, 2, 3, 4], index=rng)

        result = ts.resample('M', fill_method='ffill')
        ex_index = period_range('2000-01', '2003-12', freq='M')

        expected = ts.asfreq('M', how='start').reindex(ex_index,
                                                       method='ffill')
        assert_series_equal(result, expected)
Example #30
    def test_astype(self):
        s = Series(np.random.randn(5), name='foo')

        for dtype in ['float32', 'float64', 'int64', 'int32']:
            astyped = s.astype(dtype)
            self.assertEqual(astyped.dtype, dtype)
            self.assertEqual(astyped.name, s.name)
Example #31
    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
        assert s.dropna().sum("rows") == 3
        assert s._get_axis_number("rows") == 0
        assert s._get_axis_name("rows") == "index"
Example #32
    def test_cat_accessor_updates_on_inplace(self):
        s = Series(list("abc")).astype("category")
        s.drop(0, inplace=True)
        s.cat.remove_unused_categories(inplace=True)
        assert len(s.cat.categories) == 2
Example #33
    def test_cat_accessor_no_new_attributes(self):
        # https://github.com/pandas-dev/pandas/issues/10673
        c = Series(list("aabbcde")).astype("category")
        with pytest.raises(AttributeError,
                           match="You cannot add any new attribute"):
            c.cat.xlabel = "a"
Example #34
    def test_getname_categorical_accessor(self, method):
        # GH 17509
        s = Series([1, 2, 3], name="A").astype("category")
        expected = "A"
        result = method(s).name
        assert result == expected
Example #35
    def test_get_values_deprecation(self):
        s = Series(range(9))
        with tm.assert_produces_warning(FutureWarning):
            res = s.get_values()
        tm.assert_numpy_array_equal(res, s.values)
Example #36
    def test_integer_series_size(self):
        # GH 25580
        s = Series(range(9))
        assert s.size == 9
        s = Series(range(9), dtype="Int64")
        assert s.size == 9
Example #37
    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        with tm.assert_produces_warning(FutureWarning):
            s = Series([1])
            result = s.item()
            assert result == 1
            assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

        # compress
        # GH 6658
        s = Series([0, 1.0, -1], index=list("abc"))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=["b"]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # the result is an empty Series with an object-dtype Index, same as the original
        exp = Series([], dtype="float64", index=Index([], dtype="object"))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=[0.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # the result is an empty Series with a Float64Index, same as the original
        exp = Series([], dtype="float64", index=Index([], dtype="float64"))
        tm.assert_series_equal(result, exp)
Example #38
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2.ix['three'])  # a single row
print(frame2)
print(frame2['state'])  # get one column; the result is a Series
print(frame2.ix['one'])
print(frame2.year)
print(frame2['pop'].ix['three'])
frame2['debt'] = 16.5  # overwrite an entire column
print(frame2)
frame2.debt = np.arange(5)  # assign elements with a numpy array
print(frame2)
print("-" * 40)

# Use a Series to specify the index labels to modify and their values;
# labels that are not specified default to NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print("-" * 40)
#
# # Assign to a new column
# frame2['eastern'] = (frame2.state == 'beijing')  # True where state equals 'beijing'
# print(frame2)
# print(frame2.columns)
# print("-"*40)
#
# # DataFrame transpose
pop = {
    'shangsha': {
        2001: 2.4,
        2002: 2.9
Example #39
    def test_comparisons(self, data, reverse, base):
        cat_rev = Series(
            Categorical(data, categories=reverse, ordered=True))
        cat_rev_base = Series(
            Categorical(base, categories=reverse, ordered=True))
        cat = Series(Categorical(data, ordered=True))
        cat_base = Series(
            Categorical(base, categories=cat.cat.categories, ordered=True))
        s = Series(base)
        a = np.array(base)

        # comparisons need to take categories ordering into account
        res_rev = cat_rev > cat_rev_base
        exp_rev = Series([True, False, False])
        tm.assert_series_equal(res_rev, exp_rev)

        res_rev = cat_rev < cat_rev_base
        exp_rev = Series([False, False, True])
        tm.assert_series_equal(res_rev, exp_rev)

        res = cat > cat_base
        exp = Series([False, False, True])
        tm.assert_series_equal(res, exp)

        scalar = base[1]
        res = cat > scalar
        exp = Series([False, False, True])
        exp2 = cat.values > scalar
        tm.assert_series_equal(res, exp)
        tm.assert_numpy_array_equal(res.values, exp2)
        res_rev = cat_rev > scalar
        exp_rev = Series([True, False, False])
        exp_rev2 = cat_rev.values > scalar
        tm.assert_series_equal(res_rev, exp_rev)
        tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

        # Only categories with same categories can be compared
        with pytest.raises(TypeError):
            cat > cat_rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        msg = ("Cannot compare a Categorical for op __gt__ with type"
               r" <class 'numpy\.ndarray'>")
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        with pytest.raises(TypeError, match=msg):
            a < cat
        with pytest.raises(TypeError, match=msg):
            a < cat_rev
Example #40
    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        msg = "'Series' object has no attribute 'info'"
        with pytest.raises(AttributeError, match=msg):
            s.info()
Example #41
class TestCategoricalOps:

    def test_compare_frame(self):
        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
        data = ["a", "b", 2, "a"]
        cat = Categorical(data)

        df = DataFrame(cat)

        for op in [operator.eq, operator.ne, operator.ge,
                   operator.gt, operator.le, operator.lt]:
            with pytest.raises(ValueError):
                # alignment raises unless we transpose
                op(cat, df)

        result = cat == df.T
        expected = DataFrame([[True, True, True, True]])
        tm.assert_frame_equal(result, expected)

        result = cat[::-1] != df.T
        expected = DataFrame([[False, True, True, False]])
        tm.assert_frame_equal(result, expected)

    def test_datetime_categorical_comparison(self):
        dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True)
        tm.assert_numpy_array_equal(dt_cat > dt_cat[0],
                                    np.array([False, True, True]))
        tm.assert_numpy_array_equal(dt_cat[0] < dt_cat,
                                    np.array([False, True, True]))

    def test_reflected_comparison_with_scalars(self):
        # GH8658
        cat = Categorical([1, 2, 3], ordered=True)
        tm.assert_numpy_array_equal(cat > cat[0],
                                    np.array([False, True, True]))
        tm.assert_numpy_array_equal(cat[0] < cat,
                                    np.array([False, True, True]))

    def test_comparison_with_unknown_scalars(self):
        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
        # and following comparisons with scalars not in categories should raise
        # for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)

        msg = ("Cannot compare a Categorical for op __{}__ with a scalar,"
               " which is not a category")
        with pytest.raises(TypeError, match=msg.format('lt')):
            cat < 4
        with pytest.raises(TypeError, match=msg.format('gt')):
            cat > 4
        with pytest.raises(TypeError, match=msg.format('gt')):
            4 < cat
        with pytest.raises(TypeError, match=msg.format('lt')):
            4 > cat

        tm.assert_numpy_array_equal(cat == 4,
                                    np.array([False, False, False]))
        tm.assert_numpy_array_equal(cat != 4,
                                    np.array([True, True, True]))

    def test_comparison_of_ordered_categorical_with_nan_to_scalar(
            self, compare_operators_no_eq_ne):
        # https://github.com/pandas-dev/pandas/issues/26504
        # BUG: fix ordered categorical comparison with missing values (#26504)
        # and following comparisons with scalars in categories with missing
        # values should be evaluated as False

        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        scalar = 2
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat),
                               compare_operators_no_eq_ne)(scalar)
        actual = getattr(cat, compare_operators_no_eq_ne)(scalar)
        tm.assert_numpy_array_equal(actual, expected)

    def test_comparison_of_ordered_categorical_with_nan_to_listlike(
            self, compare_operators_no_eq_ne):
        # https://github.com/pandas-dev/pandas/issues/26504
        # and following comparisons of missing values in ordered Categorical
        # with listlike should be evaluated as False

        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
        other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2)
        actual = getattr(cat, compare_operators_no_eq_ne)(other)
        tm.assert_numpy_array_equal(actual, expected)

    @pytest.mark.parametrize('data,reverse,base', [
        (list("abc"), list("cba"), list("bbb")),
        ([1, 2, 3], [3, 2, 1], [2, 2, 2])]
    )
    def test_comparisons(self, data, reverse, base):
        cat_rev = Series(
            Categorical(data, categories=reverse, ordered=True))
        cat_rev_base = Series(
            Categorical(base, categories=reverse, ordered=True))
        cat = Series(Categorical(data, ordered=True))
        cat_base = Series(
            Categorical(base, categories=cat.cat.categories, ordered=True))
        s = Series(base)
        a = np.array(base)

        # comparisons need to take categories ordering into account
        res_rev = cat_rev > cat_rev_base
        exp_rev = Series([True, False, False])
        tm.assert_series_equal(res_rev, exp_rev)

        res_rev = cat_rev < cat_rev_base
        exp_rev = Series([False, False, True])
        tm.assert_series_equal(res_rev, exp_rev)

        res = cat > cat_base
        exp = Series([False, False, True])
        tm.assert_series_equal(res, exp)

        scalar = base[1]
        res = cat > scalar
        exp = Series([False, False, True])
        exp2 = cat.values > scalar
        tm.assert_series_equal(res, exp)
        tm.assert_numpy_array_equal(res.values, exp2)
        res_rev = cat_rev > scalar
        exp_rev = Series([True, False, False])
        exp_rev2 = cat_rev.values > scalar
        tm.assert_series_equal(res_rev, exp_rev)
        tm.assert_numpy_array_equal(res_rev.values, exp_rev2)

        # Only categories with same categories can be compared
        with pytest.raises(TypeError):
            cat > cat_rev

        # categorical cannot be compared to Series or numpy array, and also
        # not the other way around
        msg = ("Cannot compare a Categorical for op __gt__ with type"
               r" <class 'numpy\.ndarray'>")
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        with pytest.raises(TypeError, match=msg):
            a < cat
        with pytest.raises(TypeError, match=msg):
            a < cat_rev

    @pytest.mark.parametrize('ctor', [
        lambda *args, **kwargs: Categorical(*args, **kwargs),
        lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
    ])
    def test_unordered_different_order_equal(self, ctor):
        # https://github.com/pandas-dev/pandas/issues/16014
        c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
        assert (c1 == c2).all()

        c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
        assert (c1 != c2).all()

        c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
        assert (c1 != c2).all()

        c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
        c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
        result = c1 == c2
        tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))

    def test_unordered_different_categories_raises(self):
        c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
        c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)

        with pytest.raises(TypeError, match=("Categoricals can "
                                             "only be compared")):
            c1 == c2

    def test_compare_different_lengths(self):
        c1 = Categorical([], categories=['a', 'b'])
        c2 = Categorical([], categories=['a'])

        msg = "Categories are different lengths"
        with pytest.raises(TypeError, match=msg):
            c1 == c2

    def test_compare_unordered_different_order(self):
        # https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
        # 349290078
        a = pd.Categorical(['a'], categories=['a', 'b'])
        b = pd.Categorical(['b'], categories=['b', 'a'])
        assert not a.equals(b)

    def test_numeric_like_ops(self):

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        # numeric ops should not succeed
        for op, str_rep in [('__add__', r'\+'),
                            ('__sub__', '-'),
                            ('__mul__', r'\*'),
                            ('__truediv__', '/')]:
            msg = r"Series cannot perform the operation {}".format(str_rep)
            with pytest.raises(TypeError, match=msg):
                getattr(df, op)(df)

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        s = df['value_group']
        for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
            msg = "Categorical cannot perform the operation {}".format(op)
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(numeric_only=False)

        # mad technically works because it takes always the numeric data

        # numpy ops
        s = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(TypeError):
            np.sum(s)

        # numeric ops on a Series
        for op, str_rep in [('__add__', r'\+'),
                            ('__sub__', '-'),
                            ('__mul__', r'\*'),
                            ('__truediv__', '/')]:
            msg = r"Series cannot perform the operation {}".format(str_rep)
            with pytest.raises(TypeError, match=msg):
                getattr(s, op)(2)

        # invalid ufunc
        with pytest.raises(TypeError):
            np.log(s)

    def test_contains(self):
        # GH21508
        c = pd.Categorical(list('aabbca'), categories=list('cab'))

        assert 'b' in c
        assert 'z' not in c
        assert np.nan not in c
        with pytest.raises(TypeError):
            assert [1] in c

        # assert codes NOT in index
        assert 0 not in c
        assert 1 not in c

        c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab'))
        assert np.nan in c
Example #42
from pandas import Series

def difference(dataset, interval=1):
	diff = list()
	for i in range(interval, len(dataset)):
		value = dataset[i] - dataset[i - interval]
		diff.append(value)
	return Series(diff)
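A quick usage sketch of the helper above; `interval` controls the lag of the differencing:

data = [10, 12, 15, 14, 18]
print(difference(data).tolist())              # [2, 3, -1, 4]
print(difference(data, interval=2).tolist())  # [5, 2, 3]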
Example #43
    def test_clip_with_na_args(self):
        """Should process np.nan argument as None"""
        # GH#17276
        s = Series([1, 2, 3])

        tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3]))
        tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3]))

        # GH#19992
        tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3]))
        tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1]))

        # GH#40420
        s = Series([1, 2, 3])
        result = s.clip(0, [np.nan, np.nan, np.nan])
        tm.assert_series_equal(s, result)
Example #44
    def test_comparisons(self):
        result = self.factor[self.factor == 'a']
        expected = self.factor[np.asarray(self.factor) == 'a']
        tm.assert_categorical_equal(result, expected)

        result = self.factor[self.factor != 'a']
        expected = self.factor[np.asarray(self.factor) != 'a']
        tm.assert_categorical_equal(result, expected)

        result = self.factor[self.factor < 'c']
        expected = self.factor[np.asarray(self.factor) < 'c']
        tm.assert_categorical_equal(result, expected)

        result = self.factor[self.factor > 'a']
        expected = self.factor[np.asarray(self.factor) > 'a']
        tm.assert_categorical_equal(result, expected)

        result = self.factor[self.factor >= 'b']
        expected = self.factor[np.asarray(self.factor) >= 'b']
        tm.assert_categorical_equal(result, expected)

        result = self.factor[self.factor <= 'b']
        expected = self.factor[np.asarray(self.factor) <= 'b']
        tm.assert_categorical_equal(result, expected)

        n = len(self.factor)

        other = self.factor[np.random.permutation(n)]
        result = self.factor == other
        expected = np.asarray(self.factor) == np.asarray(other)
        tm.assert_numpy_array_equal(result, expected)

        result = self.factor == 'd'
        expected = np.repeat(False, len(self.factor))
        tm.assert_numpy_array_equal(result, expected)

        # comparisons with categoricals
        cat_rev = Categorical(
            ["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
        cat_rev_base = Categorical(
            ["b", "b", "b"], categories=["c", "b", "a"], ordered=True)
        cat = Categorical(["a", "b", "c"], ordered=True)
        cat_base = Categorical(
            ["b", "b", "b"], categories=cat.categories, ordered=True)

        # comparisons need to take categories ordering into account
        res_rev = cat_rev > cat_rev_base
        exp_rev = np.array([True, False, False])
        tm.assert_numpy_array_equal(res_rev, exp_rev)

        res_rev = cat_rev < cat_rev_base
        exp_rev = np.array([False, False, True])
        tm.assert_numpy_array_equal(res_rev, exp_rev)

        res = cat > cat_base
        exp = np.array([False, False, True])
        tm.assert_numpy_array_equal(res, exp)

        # Only categories with same categories can be compared
        with pytest.raises(TypeError):
            cat > cat_rev

        cat_rev_base2 = Categorical(
            ["b", "b", "b"], categories=["c", "b", "a", "d"])

        with pytest.raises(TypeError):
            cat_rev > cat_rev_base2

        # Only categories with same ordering information can be compared
        cat_unordered = cat.set_ordered(False)
        assert not (cat > cat).any()

        with pytest.raises(TypeError):
            cat > cat_unordered

        # comparison (in both directions) with Series will raise
        s = Series(["b", "b", "b"])
        msg = ("Cannot compare a Categorical for op __gt__ with type"
               r" <class 'numpy\.ndarray'>")
        with pytest.raises(TypeError, match=msg):
            cat > s
        with pytest.raises(TypeError, match=msg):
            cat_rev > s
        with pytest.raises(TypeError, match=msg):
            s < cat
        with pytest.raises(TypeError, match=msg):
            s < cat_rev

        # comparison with numpy.array will raise in both direction, but only on
        # newer numpy versions
        a = np.array(["b", "b", "b"])
        with pytest.raises(TypeError, match=msg):
            cat > a
        with pytest.raises(TypeError, match=msg):
            cat_rev > a

        # Make sure that unequal comparison take the categories order in
        # account
        cat_rev = Categorical(
            list("abc"), categories=list("cba"), ordered=True)
        exp = np.array([True, False, False])
        res = cat_rev > "b"
        tm.assert_numpy_array_equal(res, exp)

        # check that zero-dim array gets unboxed
        res = cat_rev > np.array("b")
        tm.assert_numpy_array_equal(res, exp)
Example #45
import numpy as np
from pandas import Series

def _unary_op(op, this, null_value=False):
    # type: (str, GeoSeries, Any) -> Series
    """Unary operation that returns a Series"""
    return Series([getattr(geom, op, null_value) for geom in this.geometry],
                  index=this.index, dtype=np.dtype(type(null_value)))
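A minimal sketch of how such a helper could be exercised, assuming shapely is installed; FakeGeoSeries below is a hypothetical stand-in for the GeoSeries the real caller would pass:

from shapely.geometry import Point

class FakeGeoSeries:
    # hypothetical stand-in exposing just what _unary_op needs
    def __init__(self, geoms, index):
        self.geometry = geoms
        self.index = index

gs = FakeGeoSeries([Point(0, 0), Point(1, 1)], index=['a', 'b'])
print(_unary_op('is_empty', gs, null_value=False))  # a/b -> False, dtype: bool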
Example #46
    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('A')
        expected = g.sum()

        g = df.groupby(pd.Grouper(key='A'))
        result = g.sum()
        assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key='A', axis=0))
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({
            'A': [0, 0, 0, 1, 1, 1],
            'B': [1, 1, 2, 2, 3, 3],
            'C': [1, 2, 3, 4, 5, 6]
        })
        # Group by single column
        expected = df.groupby('A').sum()
        g = df.groupby([pd.Grouper(key='A')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(['A', 'B']).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(['A', pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key='A'), 'B'])
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype='int64'),
            index=pd.MultiIndex.from_product(
                [list('ab'),
                 range(2),
                 date_range('20130101', periods=2)],
                names=['one', 'two', 'three']))
        result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
        expected = Series([28],
                          index=Index([Timestamp('2013-01-31')],
                                      freq='M',
                                      name='three'))
        assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level='one')).sum()
        expected = s.groupby(level='one').sum()
        assert_series_equal(result, expected)
Example #47
def detect(data, args):
    in_file = data['r2_path']
    out_prefix = data['sample_id']
    out_file = out_prefix + "_polyA.dat.gz"
    out_name_false = out_prefix + "_none.dat.gz"
    counts = Counter()
    num_line = 0
    logger.my_logger.info("reading file %s" % in_file)
    logger.my_logger.info("creating files %s %s" % (out_file, out_name_false))
    data['detect'] = out_file
    if os.path.exists(out_file):
        return data
    with file_transaction(out_file) as tx_out_file:
        with open_fastq(in_file) as handle, gzip.open(tx_out_file,
                                                      'w') as out, gzip.open(
                                                          out_name_false,
                                                          'w') as out_false:
            for line in handle:
                #print line
                num_line += 1
                if num_line % 1000000 == 0:
                    logger.my_logger.info("read %s lines:" % num_line)
                if line.startswith("@HISEQ"):
                    #print line
                    name = line.strip()
                    seq = handle.next().strip()
                    handle.next().strip()
                    qual = handle.next().strip()
                    find = _adapter(seq, qual)
                    #print "%s %s" % (seq, find)
                    if find:
                        seq, qual = find
                        ns = poly_A_percentage(seq)
                        #ns = polyA(seq)
                        if ns:
                            if ns[1] - ns[0] >= 6:
                                #print "positions are" + str(ns[0]) + ".." + str(ns[1])
                                mod = seq[:ns[0]]
                                seq_polyA = seq[ns[0]:ns[1]]
                                seq_gene = seq[ns[1]:]
                                qual_polyA = qual[ns[0]:ns[1]]
                                qual_gene = qual[ns[1]:]
                                #print "%s\t%s\t%s\t%s\t%s\t%s\n" % (name,mod,sf,qf)
                                out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                                          (name, ns[0], ns[1], mod, seq_polyA,
                                           qual_polyA, seq_gene, qual_gene))
                                counts['polyA'] += 1
                                if len(mod) > 0:
                                    counts['mod'] += 1
                            else:
                                counts['shortA'] += 1
                                out_false.write("%s\t%s\t%s\t%s\n" %
                                                ("shortA", name, seq, qual))
                        else:
                            counts['noA'] += 1
                            out_false.write("%s\t%s\t%s\t%s\n" %
                                            ("None", name, seq, qual))
                    else:
                        out_false.write("%s\t%s\t%s\t%s\n" %
                                        ("No_tag", name, seq, qual))
                        counts['notag'] += 1
        with file_transaction(out_prefix + ".stat") as tx_stat_file:
            df = Series(counts)
            df.to_csv(tx_stat_file, sep="\t")
        logger.my_logger.info("%s" % counts)
    return data
Example #48
    def is_ring(self):
        """Returns a ``Series`` of ``dtype('bool')`` with value ``True`` for
        features that are closed."""
        # operates on the exterior, so can't use _unary_op()
        return Series([geom.exterior.is_ring for geom in self.geometry],
                      index=self.index)
#def clus_name(clus_num):
#    clus_map = {0: "a", 1: "b", 2: "c",3: "d"}
#    return clus_map[clus_num]
#
#df_clus['clus_profile']=df_clus['cluster'].apply(lambda row: clus_name(row))



with open('us_counties.topo.json') as json_data:
    d = json.load(json_data)

county_list=[]
for i in range(0,len(d['objects']['us_counties.geo']['geometries'])):
    county_list.append(d['objects']['us_counties.geo']['geometries'][i]['properties']['FIPS'])

county_list=Series(county_list)


county_list=DataFrame(county_list,columns=["FIPS"])
df_counties=pd.merge(county_list, df_clus, left_on="FIPS", right_on="FIPS",how='left')
df_counties['cluster']=df_counties['cluster'].fillna(-1)
df_counties['cluster']=df_counties['cluster']+1

os.chdir(r'C:\Users\Vaishnavi\Documents\Notebooks')

df_output= df_counties[['FIPS','cluster']]
df_output=df_output.sort_values(by='FIPS')
df_output.to_csv('Cluster_counties.csv', sep=',')


Example #50
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values).copy()
        comp.tz = None
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011', periods=5,
                                freq='D', tz=tz, name='idx')
            df = pd.DataFrame(
                {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

            expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                             datetime(2011, 1, 2),
                                             datetime(2011, 1, 3),
                                             datetime(2011, 1, 4),
                                             datetime(2011, 1, 5)],
                                     'a': range(5),
                                     'b': ['A', 'B', 'C', 'D', 'E']},
                                    columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)
Example #51
    ns = len(y)
    weights = np.ones(ns) * mu
    for k in range(ns):
        weights[k] = weights[k]**k
    weights = np.flip(weights, 0)

    # Fit an autoregressive (AR) model to the series
    lr = AR(y)
    #lr.fit(x, y, sample_weight=weights)
    model_fit = lr.fit()
    # `pred` is assumed to be the forecast horizon (the function header is truncated)
    y_pred = model_fit.predict(start=len(y), end=len(y) + pred - 1)
    return y_pred


series = Series.from_csv('daily-minimum-temperatures.csv', header=0)
# split dataset
X = series.values
train, test = X[1:len(X) - 7], X[len(X) - 7:]
# train autoregression
model = AR(train)
model_fit = model.fit()
print('Lag: %s' % model_fit.k_ar)
print('Coefficients: %s' % model_fit.params)
# make predictions
predictions = model_fit.predict(start=len(train),
                                end=len(train) + len(test) - 1,
                                dynamic=False)
for i in range(len(predictions)):
    print('predicted=%f, expected=%f' % (predictions[i], test[i]))
error = mean_squared_error(test, predictions)
Example #52
                WHERE date = %(date)s
                '''
        # ----- Run query
        games = read_sql(query, db, params={'date': today})
        # ----- Filter to games that haven't started
        games = games[[
            item.hour > dt.datetime.now().hour for item in games['game_time']
        ]]
        # ----- Only continue if games exist
        if games.shape[0] != 0:
            lines = df_ml.merge(games, on='id')
            # ----- Find the best book to place a home and an away bet at:
            away_books = [item for item in lines.columns if 'odds_a' in item]
            away_best, away_sports_book, away_odds = list(), list(), list()
            for game in lines[away_books].iterrows():
                values = add_lines(Series(game[1]))
                away_best.append(max(values))
                away_sports_book.append(away_books[values.index(
                    max(values))].replace('odds_a', ''))
                away_odds.append(game[1].values[values.index(max(values))])
            home_books = [item for item in lines.columns if 'odds_h' in item]
            home_best, home_sports_book, home_odds = list(), list(), list()
            for game in lines[home_books].iterrows():
                values = add_lines(Series(game[1]))
                home_best.append(max(values))
                home_sports_book.append(home_books[values.index(
                    max(values))].replace('odds_h', ''))
                home_odds.append(game[1].values[values.index(max(values))])

            final_lines = lines[[
                'id', 'away_chance_winning', 'home_chance_winning'
Example #53
0
for i in range(3, 10):
    print(i)

# iterate over a string
for letter in 'python':
    print('now is:', letter)

# iterate over a list
fruits = ['banana', 'apple', 'mango']

for fruit in fruits:
    print('now is:', fruit)

# iterate over a Series
from pandas import Series

x = Series(['a', True, 1], index=['first', 'second', 'third'])
x[0]
x['second']
x[2]

for i in x.values:
    print(i)

for v in x:
    print("x中的值:", v)

for index in x.index:
    print("x中的索引:", index)
    print("x中的值:", x[index])
    print("_____________")
Example #54
0
def lagou_spider_keyword(keyword):
    #encode the search string as utf-8, then build the lagou.com search url
    keywordbyte=keyword.encode('utf-8')
    keywordindex=str(keywordbyte).replace(r'\x','%').replace(r"'","")
    keywordindex=re.sub('^b','',keywordindex)

    #work out how many result pages the search returns
    i =0
    type='true'
    url='http://www.lagou.com/jobs/positionAjax.json?px=default&first='+type+'&kd='+keywordindex+'&pn='+str(i+1)
    with request.urlopen(url) as f:
        data=f.read()
        urlcount=int(json.loads(str(data,encoding='utf-8',errors='ignore'))["content"]["totalPageCount"])
        print('total result pages: %d' % urlcount)

    #start the actual crawl
    for i in list(range(0,urlcount)):

        #build the page url
        if i ==0 :
            type='true'
        else:
            type='false'
        url='http://www.lagou.com/jobs/positionAjax.json?px=default&first='+type+'&kd='+keywordindex+'&pn='+str(i+1)
        with request.urlopen(url) as f:
            data=f.read()

        #load the json data and start parsing
        try:
            jsondata=json.loads(str(data,encoding='utf-8',errors='ignore'))["content"]['result']

            for t in list(range(len(jsondata))):
                #join the company label list into a single string
                jsondata[t]['companyLabelList2']='-'.join(jsondata[t]['companyLabelList'])
                jsondata[t].pop('companyLabelList')

                #turn each row into a Series, then concatenate
                if t == 0:
                    rdata=DataFrame(Series(data=jsondata[t])).T
                else:
                    rdata=pd.concat([rdata,DataFrame(Series(data=jsondata[t])).T])
            #re-number rdata's index
            rdata.index=range(1,len(rdata)+1)
            rdata['keyword']=keyword
            rdata['salarymin']=0
            rdata['salarymax']=0
            rdata['url']=''
            rdata['jd']=''#job description
            rdata['handle_perc']=''#resume handling rate: share of resumes processed within 7 days
            rdata['handle_day']=''#average days to finish processing a resume
            for klen in list(range(len(rdata['salary']))):
                rdata.ix[klen+1,'salarymin'] = re.search('^(\d*?)k',rdata['salary'].iloc[klen]).group(1)
                #if no upper salary bound is given (e.g. "8k and above"), leave it empty
                if re.search('-(\d*?)k$',rdata['salary'].iloc[klen]) != None:
                    rdata.ix[klen+1,'salarymax'] = re.search('-(\d*?)k$',rdata['salary'].iloc[klen]).group(1)
                else:
                    rdata.ix[klen+1,'salarymax'] = ''
                #add a url column so the jd content can be crawled later
                rdata.ix[klen+1,'url'] =  'http://www.lagou.com/jobs/%s.html'% rdata.ix[klen+1,'positionId']

                #crawl each url a second time to pull in the jd
                with request.urlopen(rdata.ix[klen+1,'url']) as f:
                    data_url=f.read()
                    soup_url=BeautifulSoup(data_url,'html5lib')
                    strings_url=soup_url.find('dd',class_='job_bt').strings
                    rdata.ix[klen+1,'jd']=''.join(strings_url).encode('gbk','ignore').decode('gbk','ignore').replace(' ','')
                    temp=soup_url.find_all('span',class_='data')
                    if re.search('>(\w*%)<',str(temp[0])) == None:
Example #55
0
def hilo(high,
         low,
         close,
         high_length=None,
         low_length=None,
         mamode=None,
         offset=None,
         **kwargs):
    """Indicator: Gann HiLo (HiLo)"""
    # Validate Arguments
    high_length = int(high_length) if high_length and high_length > 0 else 13
    low_length = int(low_length) if low_length and low_length > 0 else 21
    mamode = mamode.lower() if isinstance(mamode, str) else "sma"
    _length = max(high_length, low_length)
    high = verify_series(high, _length)
    low = verify_series(low, _length)
    close = verify_series(close, _length)
    offset = get_offset(offset)

    if high is None or low is None or close is None: return

    # Calculate Result
    m = close.size
    hilo = Series(npNaN, index=close.index)
    long = Series(npNaN, index=close.index)
    short = Series(npNaN, index=close.index)

    high_ma = ma(mamode, high, length=high_length)
    low_ma = ma(mamode, low, length=low_length)

    for i in range(1, m):
        if close.iloc[i] > high_ma.iloc[i - 1]:
            hilo.iloc[i] = long.iloc[i] = low_ma.iloc[i]
        elif close.iloc[i] < low_ma.iloc[i - 1]:
            hilo.iloc[i] = short.iloc[i] = high_ma.iloc[i]
        else:
            hilo.iloc[i] = hilo.iloc[i - 1]
            long.iloc[i] = short.iloc[i] = hilo.iloc[i - 1]

    # Offset
    if offset != 0:
        hilo = hilo.shift(offset)
        long = long.shift(offset)
        short = short.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        hilo.fillna(kwargs["fillna"], inplace=True)
        long.fillna(kwargs["fillna"], inplace=True)
        short.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        hilo.fillna(method=kwargs["fill_method"], inplace=True)
        long.fillna(method=kwargs["fill_method"], inplace=True)
        short.fillna(method=kwargs["fill_method"], inplace=True)

    # Name & Category
    _props = f"_{high_length}_{low_length}"
    data = {
        f"HILO{_props}": hilo,
        f"HILOl{_props}": long,
        f"HILOs{_props}": short
    }
    df = DataFrame(data, index=close.index)

    df.name = f"HILO{_props}"
    df.category = "overlap"

    return df
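
A minimal usage sketch for the indicator above, assuming it sits in a pandas_ta-style module where verify_series, get_offset, ma and npNaN are available; the random OHLC data here is made up:

import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=120, freq="D")
close = pd.Series(np.random.randn(120).cumsum() + 100, index=idx)
high, low = close + 1.0, close - 1.0

# defaults: high_length=13, low_length=21, mamode="sma"
gann = hilo(high, low, close)
print(gann.columns.tolist())  # ['HILO_13_21', 'HILOl_13_21', 'HILOs_13_21']
print(gann.tail())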
Example #56
0
 def pipeline(self, ds: pd.Series) -> pd.Series:
     return ds.pipe(self.pipe_location).pipe(self.pipe_vaccine).pipe(self.pipe_source)
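
Series.pipe(f) simply calls f(ds), so the chain above is just nested function application; a self-contained sketch with hypothetical stand-ins for pipe_location, pipe_vaccine and pipe_source:

import pandas as pd

def pipe_location(ds: pd.Series) -> pd.Series:  # hypothetical step
    return ds.rename("located")

def pipe_vaccine(ds: pd.Series) -> pd.Series:   # hypothetical step
    return ds.str.upper()

def pipe_source(ds: pd.Series) -> pd.Series:    # hypothetical step
    return ds.str.strip()

ds = pd.Series([" pfizer ", " moderna "])
out = ds.pipe(pipe_location).pipe(pipe_vaccine).pipe(pipe_source)
# equivalent to: pipe_source(pipe_vaccine(pipe_location(ds)))
print(out.tolist())  # ['PFIZER', 'MODERNA']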
Example #57
0
#Filter and select data

import numpy as np
import pandas as pd

from pandas import Series, DataFrame

series_obj = Series(np.arange(8),
                    index=[
                        'row 1',
                        'row 2',
                        'row 3',
                        'row 4',
                        'row 5',
                        'row 6',
                        'row 7',
                        'row 8',
                    ])
print(series_obj)
print(series_obj['row 3'])
print(series_obj[[0, 7]])

np.random.seed(25)
DF_obj = DataFrame(
    np.random.rand(36).reshape(6, 6),
    index=['row 1', 'row 2', 'row 3', 'row 4', 'row 5', 'row 6'],
    columns=[
        'column 1', 'column 2', 'column 3', 'column 4', 'column 5', 'column 6'
    ])
print(DF_obj)
print(DF_obj.ix[['row 1', 'row 2'], ['column 2', 'column 4']])
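
Note that .ix was deprecated in pandas 0.20 and later removed; the label-based equivalent of the selection above uses .loc:

print(DF_obj.loc[['row 1', 'row 2'], ['column 2', 'column 4']])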
Example #58
0
class TestSeriesToCSV():
    def read_csv(self, path, **kwargs):
        params = dict(squeeze=True, index_col=0, header=None, parse_dates=True)
        params.update(**kwargs)

        header = params.get("header")
        out = pd.read_csv(path, **params)

        if header is None:
            out.name = out.index.name = None

        return out

    def test_from_csv_deprecation(self, datetime_series):
        # see gh-17812
        with ensure_clean() as path:
            datetime_series.to_csv(path, header=False)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                ts = self.read_csv(path)
                depr_ts = Series.from_csv(path)
                assert_series_equal(depr_ts, ts)

    @pytest.mark.parametrize("arg", ["path", "header", "both"])
    def test_to_csv_deprecation(self, arg, datetime_series):
        # see gh-19715
        with ensure_clean() as path:
            if arg == "path":
                kwargs = dict(path=path, header=False)
            elif arg == "header":
                kwargs = dict(path_or_buf=path)
            else:  # Both discrepancies match.
                kwargs = dict(path=path)

            with tm.assert_produces_warning(FutureWarning):
                datetime_series.to_csv(**kwargs)

                # Make sure roundtrip still works.
                ts = self.read_csv(path)
                assert_series_equal(datetime_series, ts, check_names=False)

    def test_from_csv(self, datetime_series, string_series):

        with ensure_clean() as path:
            datetime_series.to_csv(path, header=False)
            ts = self.read_csv(path)
            assert_series_equal(datetime_series, ts, check_names=False)

            assert ts.name is None
            assert ts.index.name is None

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                depr_ts = Series.from_csv(path)
                assert_series_equal(depr_ts, ts)

            # see gh-10483
            datetime_series.to_csv(path, header=True)
            ts_h = self.read_csv(path, header=0)
            assert ts_h.name == "ts"

            string_series.to_csv(path, header=False)
            series = self.read_csv(path)
            assert_series_equal(string_series, series, check_names=False)

            assert series.name is None
            assert series.index.name is None

            string_series.to_csv(path, header=True)
            series_h = self.read_csv(path, header=0)
            assert series_h.name == "series"

            with open(path, "w") as outfile:
                outfile.write("1998-01-01|1.0\n1999-01-01|2.0")

            series = self.read_csv(path, sep="|")
            check_series = Series({
                datetime(1998, 1, 1): 1.0,
                datetime(1999, 1, 1): 2.0
            })
            assert_series_equal(check_series, series)

            series = self.read_csv(path, sep="|", parse_dates=False)
            check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0})
            assert_series_equal(check_series, series)

    def test_to_csv(self, datetime_series):
        import io

        with ensure_clean() as path:
            datetime_series.to_csv(path, header=False)

            with io.open(path, newline=None) as f:
                lines = f.readlines()
            assert (lines[1] != '\n')

            datetime_series.to_csv(path, index=False, header=False)
            arr = np.loadtxt(path)
            assert_almost_equal(arr, datetime_series.values)

    def test_to_csv_unicode_index(self):
        buf = StringIO()
        s = Series(["\u05d0", "d2"], index=["\u05d0", "\u05d1"])

        s.to_csv(buf, encoding="UTF-8", header=False)
        buf.seek(0)

        s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
        assert_series_equal(s, s2)

    def test_to_csv_float_format(self):

        with ensure_clean() as filename:
            ser = Series([0.123456, 0.234567, 0.567567])
            ser.to_csv(filename, float_format="%.2f", header=False)

            rs = self.read_csv(filename)
            xp = Series([0.12, 0.23, 0.57])
            assert_series_equal(rs, xp)

    def test_to_csv_list_entries(self):
        s = Series(['jack and jill', 'jesse and frank'])

        split = s.str.split(r'\s+and\s+')

        buf = StringIO()
        split.to_csv(buf, header=False)

    def test_to_csv_path_is_none(self):
        # GH 8215
        # Series.to_csv() was returning None, inconsistent with
        # DataFrame.to_csv() which returned string
        s = Series([1, 2, 3])
        csv_str = s.to_csv(path_or_buf=None, header=False)
        assert isinstance(csv_str, str)

    @pytest.mark.parametrize(
        's,encoding',
        [
            (Series([0.123456, 0.234567, 0.567567],
                    index=['A', 'B', 'C'],
                    name='X'), None),
            # GH 21241, 21118
            (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
            (Series(["123", "你好", "世界"], name="中文"), 'gb2312'),
            (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), 'cp737')
        ])
    def test_to_csv_compression(self, s, encoding, compression):

        with ensure_clean() as filename:

            s.to_csv(filename,
                     compression=compression,
                     encoding=encoding,
                     header=True)
            # test the round trip - to_csv -> read_csv
            result = pd.read_csv(filename,
                                 compression=compression,
                                 encoding=encoding,
                                 index_col=0,
                                 squeeze=True)
            assert_series_equal(s, result)

            # test the round trip using file handle - to_csv -> read_csv
            f, _handles = _get_handle(filename,
                                      'w',
                                      compression=compression,
                                      encoding=encoding)
            with f:
                s.to_csv(f, encoding=encoding, header=True)
            result = pd.read_csv(filename,
                                 compression=compression,
                                 encoding=encoding,
                                 index_col=0,
                                 squeeze=True)
            assert_series_equal(s, result)

            # explicitly ensure file was compressed
            with tm.decompress_file(filename, compression) as fh:
                text = fh.read().decode(encoding or 'utf8')
                assert s.name in text

            with tm.decompress_file(filename, compression) as fh:
                assert_series_equal(
                    s,
                    pd.read_csv(fh,
                                index_col=0,
                                squeeze=True,
                                encoding=encoding))
Example #59
0
 def test_pindex_qaccess(self):
     pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q')
     s = Series(np.random.rand(len(pi)), index=pi).cumsum()
     # Todo: fix these accessors!
     self.assertEqual(s['05Q4'], s[2])
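
The access works because '05Q4' parses to the same quarter as the index label '4Q05', which sits at position 2; a quick standalone check mirroring the test:

from pandas import PeriodIndex, Series

pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q')
s = Series(range(5), index=pi)
print(s['05Q4'])  # 2 -- '05Q4' and '4Q05' name the same quarter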
Example #60
0
def video_avf_rec(user_id,num_of_rec):
    rated_by_user = trailer_seen.TrailerSeen.query.filter_by(seen_by=user_id, is_skipped=0)
    movies_seen=trailer_seen.TrailerSeen.query.filter_by(seen_by=user_id)
    rated_imdb = {}
    movies_to_exclude = []
    for r in rated_by_user:
        rated_imdb.update({r.imdb_id: r.rate})
    for r in movies_seen:
        movies_to_exclude.append(r.imdb_id)

    final_array = []
    i = 0
    for row in video_avf_sim.itertuples():

        num = float(0)
        den = float(0)
        i += 1
        neigh_splitted = row[3].split(",")
        for j in neigh_splitted:
            imdb_sim = j.split(":")
            imdb = imdb_sim[0]
            if rated_imdb.get(imdb, None):
                num += float(rated_imdb.get(imdb)) * float(imdb_sim[1])
                den += float(imdb_sim[1])
        if den == 0:
            final_v = float(0)
        else:
            final_v = float(num / den)

        if not final_v:
            final_v = float(0)

        final_array.append((row[1], final_v, row[2]))

    dtype = [('IMDB_ID', 'S10'), ('PREDICTED_VOTE', float), ('IMDB_VOTES', int)]

    numpy_final = numpy.array(final_array, dtype=dtype)
    numpy_final = numpy.sort(numpy_final, order=['PREDICTED_VOTE'])
    numpy_final = numpy_final[::-1]

    all_table = get_table("all_table")()
    all_table = all_table[~all_table["IMDB_ID"].isin(Series(movies_to_exclude))]
    all_table.reset_index(drop=True, inplace=True)

    final = {}

    safe_iter = 0

    while (len(final) < num_of_rec) and (safe_iter < 20):
        rec = numpy_final[safe_iter]

        movie = all_table[all_table["IMDB_ID"] == rec[0]].copy()
        if len(movie.index):
            movie.reset_index(drop=True, inplace=True)
            movie = movie.iloc[0]
            movie["REC_TYPE"] = "AUDIO"
            movie["PREDICTED_VOTE"]=rec[1]

            z = movie.to_json()
            safe_iter += 1
            final.update({len(final): z})
        else:
            safe_iter += 1

    return final
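
The score computed inside the loop above is a plain similarity-weighted average of the user's ratings over each item's neighbours; isolated as a sketch (the function name and argument shapes are stand-ins, not part of the original module):

def predict_vote(rated_imdb, neighbours):
    """rated_imdb: {imdb_id: rating}; neighbours: [(imdb_id, similarity), ...]"""
    num = sum(float(rated_imdb[i]) * sim for i, sim in neighbours if i in rated_imdb)
    den = sum(sim for i, sim in neighbours if i in rated_imdb)
    return num / den if den else 0.0

print(predict_vote({'tt01': 4.0, 'tt02': 2.0},
                   [('tt01', 0.9), ('tt02', 0.3), ('tt03', 0.5)]))  # 3.5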