Example #1
def proximity(features, pos_columns=['x', 'y']):
    """Find the distance to each feature's nearest neighbor.

    Parameters
    ----------
    features : DataFrame
    pos_columns : list of column names
        ['x', 'y'] by default

    Returns
    -------
    proximity : DataFrame
        distance to each particle's nearest neighbor,
        indexed by particle if 'particle' column is present in input

    Example
    -------
    Find the proximity of each particle to its nearest neighbor in every frame.

    >>> prox = t.groupby('frame').apply(proximity).reset_index()
    >>> avg_prox = prox.groupby('particle')['proximity'].mean()

    And filter the trajectories...

    >>> particle_nos = avg_prox[avg_prox > 20].index
    >>> t_filtered = t[t['particle'].isin(particle_nos)]
    """
    leaf_size = max(1, int(np.round(np.log10(len(features)))))
    tree = cKDTree(features[pos_columns].copy(), leaf_size)
    proximity = tree.query(tree.data, 2)[0][:, 1]
    result = DataFrame({'proximity': proximity})
    if 'particle' in features:
        result.set_index(features['particle'], inplace=True)
    return result
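The snippet leaves its imports to the enclosing module (this looks like trackpy-style code). A minimal sketch of what it assumes, with a synthetic two-particle call whose values are made up:

import numpy as np
from pandas import DataFrame
from scipy.spatial import cKDTree

# Two particles 5 units apart: each one's nearest neighbor is the other.
feats = DataFrame({'x': [0.0, 3.0], 'y': [0.0, 4.0], 'particle': [1, 2]})
print(proximity(feats))  # proximity == 5.0 for both, indexed by particle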
Example #2
    def test_reset_index_multiindex_nan(self):
        # GH6322, testing reset_index on MultiIndexes
        # when we have a nan or all nan
        df = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [0, 1, np.nan],
                        'C': np.random.rand(3)})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)

        df = DataFrame({'A': [np.nan, 'b', 'c'],
                        'B': [0, 1, 2],
                        'C': np.random.rand(3)})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)

        df = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [0, 1, 2],
                        'C': [np.nan, 1.1, 2.2]})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)

        df = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [np.nan, np.nan, np.nan],
                        'C': np.random.rand(3)})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)
Example #3
def _create_df(sheet, start_row, start_col, end_row, end_col, reindex=False):
    df = DataFrame(sheet[start_row+1:end_row, start_col:end_col].value,
                   columns=sheet[start_row, start_col:end_col].value)

    if reindex:
        df.set_index(keys=df.iloc[:, 0], inplace=True)
    return df
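The two-dimensional sheet slicing above suggests an xlwings-style sheet object. Without a live workbook, the same construction can be sketched with a plain header-plus-rows block (hypothetical values):

from pandas import DataFrame

header = ['id', 'x', 'y']           # stand-in for sheet[start_row, start_col:end_col].value
rows = [[1, 10.0, 20.0],            # stand-in for sheet[start_row+1:end_row, ...].value
        [2, 11.0, 21.0]]

df = DataFrame(rows, columns=header)
df.set_index(keys=df.iloc[:, 0], inplace=True)  # the reindex=True branch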
Example #4
    def get_data(stock, start=None, end=None, interval='d'):
        params = dict(s=stock)
        date_format = "%Y-%m-%d"  # renamed so it no longer shadows the built-in format()
        if start is not None:
            date = datetime.datetime.strptime(start, date_format)
            params['a'] = date.month - 1
            params['b'] = date.day
            params['c'] = date.year

        if end is not None:
            date = datetime.datetime.strptime(end, date_format)
            params['d'] = date.month - 1
            params['e'] = date.day
            params['f'] = date.year

        params['g'] = interval

        response = requests.get(YahooAPI.base_url, params=params)
        content = response.text.split('\n')  # .text gives str; .content would give bytes
        headers = content[0].split(',')
        lines = [line.split(',') for line in content[1:-1]]  # last line empty
        df = DataFrame(lines, columns=headers)
        df['Date'] = pd.to_datetime(df['Date'], format=date_format)
        df.set_index('Date', inplace=True)
        return df
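A hedged usage sketch; the historical Yahoo Finance CSV endpoint behind YahooAPI.base_url has long been retired, so this is illustrative only:

# Daily AAPL quotes for the first half of 2014 (hypothetical call).
df = YahooAPI.get_data('AAPL', start='2014-01-01', end='2014-06-30', interval='d')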
Example #5
    def test_to_csv_decimal(self):
        # GH 781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
        assert df.to_csv() == expected_default

        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # GH 11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
Example #6
def test_resample_timegrouper():
    # GH 7227
    dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
              datetime(2014, 11, 5), datetime(2014, 9, 5),
              datetime(2014, 10, 8), datetime(2014, 7, 15)]

    dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
    dates3 = [pd.NaT] + dates1 + [pd.NaT]

    for dates in [dates1, dates2, dates3]:
        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31',
                                    '2014-09-30',
                                    '2014-10-31', '2014-11-30'],
                                   freq='M', name='A')
        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)

        df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(
            len(dates))))
        result = df.set_index('A').resample('M').count()
        expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
                             index=exp_idx, columns=['B', 'C'])
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)
Example #7
    def test_index_with_nan(self):
        #  GH 2850
        df = DataFrame(
            {
                "id1": {0: "1a3", 1: "9h4"},
                "id2": {0: np.nan, 1: "d67"},
                "id3": {0: "78d", 1: "79d"},
                "value": {0: 123, 1: 64},
            }
        )

        # multi-index
        y = df.set_index(["id1", "id2", "id3"])
        result = y.to_string()
        expected = u"             value\nid1 id2 id3       \n1a3 NaN 78d    123\n9h4 d67 79d     64"
        self.assert_(result == expected)

        # index
        y = df.set_index("id2")
        result = y.to_string()
        expected = u"     id1  id3  value\nid2                 \nNaN  1a3  78d    123\nd67  9h4  79d     64"
        self.assert_(result == expected)

        # all-nan in mi
        df2 = df.copy()
        df2.loc[:, "id2"] = np.nan
        y = df2.set_index("id2")
        result = y.to_string()
        expected = u"     id1  id3  value\nid2                 \nNaN  1a3  78d    123\nNaN  9h4  79d     64"
        self.assert_(result == expected)
Example #8
    def load_frame(cls, session):
        """
        Load part of the table into a well-formatted pandas.DataFrame.

        session can be any object with the execute method.
        """
        sample = cls.__table__
        job = Job.__table__
        result = Result.__table__
        analysis = AnalysisConfiguration.__table__
        control = ControlConfiguration.__table__
        experiment = Experiment.__table__
        stmt = select([sample.c.id, sample.c.control,
                result.c.point, control.c.type, control.c.direction,
                experiment.c.strain, job.c.preparation, job.c.sampling,
                job.c.projection, job.c.measure, job.c.delay,
                analysis.c.version]).where(and_(
                sample.c.result_id == result.c.id,
                result.c.job_id == job.c.id,
                job.c.analysis_id == analysis.c.id,
                job.c.control_id == control.c.id,
                job.c.experiment_id == experiment.c.id))
        query = session.execute(stmt)
        df = DataFrame(iter(query), columns=query.keys())
        df.set_index("id", inplace=True)
        return df
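A hedged usage sketch; Sample stands in for whichever mapped class carries this classmethod, and the engine URL is hypothetical:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///results.db')  # hypothetical database
Session = sessionmaker(bind=engine)
df = Sample.load_frame(Session())  # any object with an execute method works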
Example #9
    def test_construction_with_categorical_index(self):

        ci = tm.makeCategoricalIndex(10)

        # with Categorical
        df = DataFrame({'A': np.random.randn(10),
                        'B': ci.values})
        idf = df.set_index('B')
        str(idf)
        tm.assert_index_equal(idf.index, ci, check_names=False)
        assert idf.index.name == 'B'

        # from a CategoricalIndex
        df = DataFrame({'A': np.random.randn(10),
                        'B': ci})
        idf = df.set_index('B')
        str(idf)
        tm.assert_index_equal(idf.index, ci, check_names=False)
        assert idf.index.name == 'B'

        idf = df.set_index('B').reset_index().set_index('B')
        str(idf)
        tm.assert_index_equal(idf.index, ci, check_names=False)
        assert idf.index.name == 'B'

        new_df = idf.reset_index()
        new_df.index = df.B
        tm.assert_index_equal(new_df.index, ci, check_names=False)
        assert idf.index.name == 'B'
Example #10
    def test_dti_set_index_reindex(self):
        # GH 6631
        df = DataFrame(np.random.random(6))
        idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern')
        idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo')

        df = df.set_index(idx1)
        tm.assert_index_equal(df.index, idx1)
        df = df.reindex(idx2)
        tm.assert_index_equal(df.index, idx2)

        # 11314
        # with tz
        index = date_range(datetime(2015, 10, 1),
                           datetime(2015, 10, 1, 23),
                           freq='H', tz='US/Eastern')
        df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
        new_index = date_range(datetime(2015, 10, 2),
                               datetime(2015, 10, 2, 23),
                               freq='H', tz='US/Eastern')

        # TODO: unused?
        result = df.set_index(new_index)  # noqa

        assert new_index.freq == index.freq
Example #11
def stats(request):
    stats_by = request.GET.get('by', 'category')

    trx = Transaction.objects.filter(amount__lt=0).exclude(category__name='Credit Card Payments')
    original_df = DataFrame(data=[{k: getattr(t, k) for k in ('date', 'category', 'amount')} for t in trx])

    df = original_df.set_index('date').groupby('category').resample('M').sum()

    chart_df = df.reset_index()\
                 .pivot_table(values='amount', index=['date'], columns=['category'], aggfunc=numpy.sum)\
                 .replace(numpy.NaN, 0)

    months = [x.strftime('%Y-%m-%d') for x in chart_df.index]
    chart_series = [
        {'name': category, 'type': 'column', 'data': [abs(float(a)) for a in amounts]}
        for category, amounts in chart_df.items()]

    table_df = df.reset_index()\
                 .pivot_table(values='amount', index=['category'], columns=['date'], aggfunc=numpy.sum)\
                 .replace(numpy.NaN, 0)
    table_data = [(category, list(amounts)) for category, amounts in chart_df.items()]
    total_df = original_df.set_index('date').resample('M').sum().transpose()
    table_data.append(('Total', total_df.values[0]))

    return render_to_response('transactions/stats.html', {
        'months_json': json.dumps(months),
        'chart_series_json': json.dumps(chart_series),
        'chart_df': chart_df,
        'months': months,
        'table_data': table_data,
    })
Example #12
    def calculate_top_10_solutions(self):
        '''Calculate all schemes and select the top 10 solutions'''
        
        columns = ['name','rate','money']

        if isfile( learning_progres_csv ):
            scheme_profit = read_csv(learning_progres_csv)
        else:
            scheme_profit = DataFrame(columns = columns)            
        scheme_profit.set_index('name',inplace = True)

        with open(learning_progres_csv, 'w+') as csvfile:
            writer = csv.DictWriter(csvfile,delimiter=',',fieldnames = columns)
            writer.writeheader()
            csvfile.flush()
            for sc in self.generate_all_schemes():
                if sc.name not in scheme_profit.index:
                    e = evaluator(sc)
                    rate, money = e.calculate()
                    scheme_profit.loc[sc.name] = [rate, money]
                    writer.writerow({'name':sc.name,'rate':rate,'money':money})
                    csvfile.flush()
                    if self.log:
                        print(sc.name + ' - ' + str(money) + ' \t rate = ' + str(rate))
                else:
                    writer.writerow({'name':sc.name,'rate':scheme_profit.rate[sc.name],'money':scheme_profit.money[sc.name]})
                    if self.log:
                        print(sc.name + ' - ' + str(scheme_profit.money[sc.name]) + ' \t rate = ' + str(scheme_profit.rate[sc.name]))
                    csvfile.flush()

        #TODO:write into scheme
        scheme_profit = scheme_profit.sort_values('money', ascending=False)
        return scheme_profit[:10].to_dict()
Example #13
    def _parse(cls, body):
        matched = re.search(r'<div class="col_r" style="">(.*?)</div>', body, re.MULTILINE | re.DOTALL | re.UNICODE)
        if matched is None or len(matched.groups()) == 0:
            raise ValueError("no matched data found.")

        lines = matched.group(1).strip().split("\n")

        value_pattern = re.compile(r">(.*?)<", re.UNICODE)
        data_array = []
        stock_name = cls._get_stock_name(body)
        for line in lines:
            if r"<tr" not in line:
                continue

            data = []
            line = line.strip()
            for value in re.findall(value_pattern, line):
                value = cls._normalize(value)
                if isinstance(value, string_types) and len(value) == 0:
                    continue
                data.append(value)
            if len(data) > 0:
                data_array.append(data)

        if data_array:
            data_array.insert(0, [stock_name] * len(data_array[0]))
            data_array = np.array(data_array).T
        df = DataFrame(data_array, columns=NETEASE_STOCK_INFO_COLUMNS)
        df.set_index("date", inplace=True)
        return df
Example #14
def build_dataframe(days=10, fill_value=1., values=None, end_date=None, date_index=True):
    ''' Constructs and returns a DataFrame in the form of those that
    are returned by Pandas DataReader. It doesn't take weekends or
    holidays into account, so weekend dates will generate values
    as well.
    
    Options are as follows:

    days: the number of rows to return. Defaults to 10
    fill_value: the value to fill each cell with (excluding date),
        defaults to 1
    values: A dictionary containing values with which to populate
        columns of the new dataframe.
        For example: values={'Adj Close': [5,6,7,8,9,10]}
        When one or more columns are specified, the number of rows in
        the new dataframe will be the length of the shortest column.
    end_date: The end of the range of dates comprising the
        dataframe. Takes a datetime.date. The start date is derived
        from a combination of this and the days parameter. Defaults to
        today's date.
    date_index: A boolean flag of whether the returned dataframe should
        set the date as the index (instead of the default numerical 
        index). If True, the dataframe will perfectly mimic that which
        is returned by Pandas DataReader. Default is True.

    In addition, you may specify a non OHLC column, such as RSI, and
    it will be added to the typical OHLC dataframe that gets created.
    '''
    columns = ['Open','High','Low','Close','Adj Close','Volume']


    # Default arguments are evaluated once, at definition time, so a
    # mutable default dict persists across calls (the behaviour the old
    # comment here puzzled over) and a default end_date would be frozen
    # at import time. Use None sentinels and build fresh objects instead.
    if end_date is None:
        end_date = dt.date.today()
    values = dict(values) if values else {}

    # determine the minimum number of rows in values
    if values:
        # create a helper list of key/len(value) tuples
        helper = [(key, len(value)) for key, value in values.items()]
        helper.sort(key=lambda x: x[1])
        days = helper[0][1]
    for i in columns:
        if i in values:
            values[i] = values[i][:days] 
        else:
            values[i] = [fill_value] * days

    dateList = [end_date - dt.timedelta(days=i) for i in range(days)]
    # necessary so the dataframe flows from oldest to most recent when
    # read from top to bottom, like DataReader
    dateList.reverse()  
    values['Date'] = DatetimeIndex(dateList)
    df = DataFrame(values, index=range(days))
    if date_index:
        df.set_index(keys='Date', drop=True, inplace=True)
    return df
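A quick sketch of the documented behavior, with made-up values: the shortest supplied column caps the row count, and the extra RSI column rides along beside the OHLC defaults (keep any extra column the same length as the shortest one, since only the OHLC columns are truncated):

df = build_dataframe(days=10, fill_value=2.0,
                     values={'Adj Close': [5, 6, 7, 8, 9],
                             'RSI': [30, 40, 50, 60, 70]})
# 5 rows, date-indexed, OHLC/Volume filled with 2.0, plus an RSI column.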
Example #15
    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines
        blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are None, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default None excludes nothing
        :param include_globs: (optional, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.loc[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs
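A hedged usage sketch, assuming git-pandas' Repository as the enclosing class (the path is hypothetical):

from gitpandas import Repository

repo = Repository(working_dir='/path/to/some/repo')
blame = repo.parallel_cumulative_blame(branch='master',
                                       num_datapoints=50, workers=4)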
Example #16
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)
Example #17
 def test_set_index_timezone(self):
     # GH 12358
     # tz-aware Series should retain the tz
     i = pd.to_datetime(["2014-01-01 10:10:10"],
                        utc=True).tz_convert('Europe/Rome')
     df = DataFrame({'i': i})
     assert df.set_index(i).index[0].hour == 11
     assert pd.DatetimeIndex(pd.Series(df.i))[0].hour == 11
     assert df.set_index(df.i).index[0].hour == 11
Example #18
 def ledger(self, from_date = None, to_date = None, freq = None):
     """
     Show the cash ledger
     """
     df = DataFrame(self._cash)[self._columns]
     df.set_index("TS", inplace = True)
     df.sort_index(inplace = True)
     df['balance'] = df['A'].cumsum()
     return df.reset_index() # Hack to make decorator work
Example #19
 def test_join_segfault(self):
     # 1532
     df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
     df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
     df1 = df1.set_index(['a', 'b'])
     df2 = df2.set_index(['a', 'b'])
     # it works!
     for how in ['left', 'right', 'outer']:
         df1.join(df2, how=how)
Example #20
 def test_set_index_nonuniq(self):
     df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                     'B': ['one', 'two', 'three', 'one', 'two'],
                     'C': ['a', 'b', 'c', 'd', 'e'],
                     'D': np.random.randn(5),
                     'E': np.random.randn(5)})
     with assertRaisesRegexp(ValueError, 'Index has duplicate keys'):
         df.set_index('A', verify_integrity=True, inplace=True)
     self.assertIn('A', df)
Example #21
def setIndexDataFrame():
    df = DataFrame({'a': range(7), 'b':range(7,0,-1),
                    'c':['one','one','one','two','two','two','two'],
                    'd':[0,1,2,0,1,2,3]})
    print(df)
    df2 = df.set_index(['c','d'])
    print(df2)
    df3 = df.set_index(['c','d'], drop=False)
    print(df3)
Example #22
 def test_date_index_query_with_NaT_duplicates(self):
     engine, parser = self.engine, self.parser
     n = 10
     df = DataFrame(np.random.randn(n, 3))
     df['dates1'] = date_range('1/1/2012', periods=n)
     df['dates3'] = date_range('1/1/2014', periods=n)
     df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
     df.set_index('dates1', inplace=True, drop=True)
     with pytest.raises(NotImplementedError):
         df.query('index < 20130101 < dates3', engine=engine, parser=parser)
Example #23
    def test_append_preserve_index_name(self):
        # #980
        df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
        df1 = df1.set_index(['A'])
        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
                        columns=['A', 'B', 'C'])
        df2 = df2.set_index(['A'])

        result = df1.append(df2)
        self.assertEqual(result.index.name, 'A')
Example #24
    def test_pandas_extend_index(self):
        d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
        d1.index.name = "first"

        d1["second"] = "default"
        d1.set_index(["second"], append=True, inplace=True)
        self.assertEqual(d1.index.names, ["first", "second"])

        d1 = d1.reorder_levels(["second", "first"])
        self.assertEqual(d1.index.names, ["second", "first"])
Example #25
    def test_period_set_index_reindex(self):
        # GH 6631
        df = DataFrame(np.random.random(6))
        idx1 = period_range('2011/01/01', periods=6, freq='M')
        idx2 = period_range('2013', periods=6, freq='A')

        df = df.set_index(idx1)
        tm.assert_index_equal(df.index, idx1)
        df = df.set_index(idx2)
        tm.assert_index_equal(df.index, idx2)
Example #26
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)] * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1), index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
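A toy example of the expected input shape, with made-up numbers: rows carry a ('field', 'feature') MultiIndex, columns are chunks, and the ('info', 'count') row holds the per-chunk weights:

import pandas as pd
from pandas import DataFrame

idx = pd.MultiIndex.from_tuples(
    [('info', 'count'), ('stat', 'mean'), ('stat', 'std')],
    names=['field', 'feature'])
mod_features_df = DataFrame([[3, 5], [1.0, 2.0], [0.5, 0.25]], index=idx)

# Count-weighted average of each feature across the two chunks.
agg = aggregate_chunks(mod_features_df, 'audio')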
Example #27
def build_state_data(where_inner="", where_outer=""):
    """
    Generates a bar graph of complaint counts by state
    """
    query = COMPLAINTS_BY_STATE.format(where_inner, where_outer)
    cur.execute(query)
    cc_by_state = DataFrame(cur.fetchall(),
                            columns=['state', 'complaint_count'])
    # set_index returns a new frame (it is not in-place by default),
    # so the result must be assigned back for the index to stick.
    cc_by_state = cc_by_state.set_index('state', drop=False)

    return cc_by_state
Example #28
    def test_sort_multi_index(self):
        # GH 25775, testing that sorting by index works with a multi-index.
        df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0],
                        'c': [0, 1, 2], 'd': list('abc')})
        result = df.set_index(list('abc')).sort_index(level=list('ba'))

        expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0],
                              'c': [1, 2, 0], 'd': list('bca')})
        expected = expected.set_index(list('abc'))

        tm.assert_frame_equal(result, expected)
Example #29
 def test_date_index_query(self):
     engine, parser = self.engine, self.parser
     n = 10
     df = DataFrame(np.random.randn(n, 3))
     df['dates1'] = date_range('1/1/2012', periods=n)
     df['dates3'] = date_range('1/1/2014', periods=n)
     df.set_index('dates1', inplace=True, drop=True)
     res = df.query('(index < 20130101) & (20130101 < dates3)',
                    engine=engine, parser=parser)
     expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
     assert_frame_equal(res, expec)
Example #30
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            try:
                fname = [fname for fname in files
                         if '%s.csv' % symbol in fname][0]
            except IndexError:
                raise ValueError("%s.csv file is not in %s" % (symbol, csvdir))

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                index = Index(range(splits.shape[0],
                                    splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = splits.append(split)

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = divs.append(div)

            yield sid, dfr
Example #31
    def transform_dataframe(self, dataframe):
        """
        Use matplotlib to compute boxplot statistics on e.g. timeseries data.
        """
        grouping = self.get_grouping(dataframe)
        group_field = self.get_group_field()
        header_fields = self.get_header_fields()

        if "series" in grouping:
            # Unstack so each series is a column
            for i in range(len(header_fields) + 1):
                dataframe = dataframe.unstack()

        groups = {col: dataframe[col] for col in dataframe.columns}

        if "year" in grouping:
            interval = "year"
        elif "month" in grouping:
            interval = "month"
        else:
            interval = None

        # Compute stats for each column, potentially grouped by year
        all_stats = []
        for header, series in groups.items():
            if interval:
                series_stats = self.boxplots_for_interval(series, interval)
            else:
                interval = None
                series_stats = [self.compute_boxplot(series)]

            series_infos = []
            for series_stat in series_stats:
                series_info = {}
                if isinstance(header, tuple):
                    value_name = header[0]
                    col_values = header[1:]
                else:
                    value_name = header
                    col_values = []
                col_names = zip(dataframe.columns.names[1:], col_values)
                for col_name, value in col_names:
                    series_info[col_name] = value
                for stat_name, val in series_stat.items():
                    if stat_name == interval:
                        series_info[stat_name] = val
                    else:
                        series_info[value_name + '-' + stat_name] = val
                series_infos.append(series_info)
            all_stats += series_infos

        dataframe = DataFrame(all_stats)
        if 'series' in grouping:
            index = header_fields + [group_field]
            unstack = len(header_fields)
            if interval:
                index = [interval] + index
                unstack += 1
        else:
            index = [interval]
            unstack = 0

        dataframe.set_index(index, inplace=True)
        dataframe.columns.name = ''
        for i in range(unstack):
            dataframe = dataframe.unstack()

        # Remove blank columns
        dataframe = dataframe.dropna(axis=1, how='all')
        return dataframe
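The docstring's "use matplotlib" note presumably points at matplotlib.cbook.boxplot_stats; a plausible shape for the compute_boxplot helper this method calls (an assumption, not the project's actual code):

from matplotlib import cbook

def compute_boxplot(self, series):
    # boxplot_stats returns one dict per dataset; we pass a single one.
    stats = cbook.boxplot_stats(series.dropna().values)[0]
    return {key: stats[key]
            for key in ('mean', 'med', 'q1', 'q3', 'whislo', 'whishi')}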
Example #32
def parse_ht_xml(xml,
                 ht_fun,
                 select=None,
                 corr_csv=r'C:\ecan\hilltop\ht_corrections.csv'):
    """
    Read a Hilltop xml file and apply a function to each individual site
    time series. The input to the function is a single pandas time series;
    the output should be a Series or DataFrame. Specific sites with
    specific mtypes can be passed as a two-column DataFrame with headers
    'site' and 'mtype'.
    """

    ### Base parameters
    # Hilltop timestamps count seconds from 1940-01-01; 10958 days of
    # seconds shifts them onto the Unix epoch.
    rem_s = 10958 * 24 * 60 * 60
    corr = read_csv(corr_csv)
    xml_name = basename(xml)

    ### Select corrections
    corr1 = corr[corr.file_name == xml_name]

    ### Parse xml
    root = etree.iterparse(xml, tag='Measurement')

    ### Iterate
    results1 = []
    for event, elem in root:
        ## Get data
        site = elem.values()[0]
        mtype = elem.find('DataSource').values()[0]

        if (select is not None):
            if (not isinstance(select, DataFrame)):
                raise ValueError(
                    'Make sure the input is a DataFrame with two columns!')
            elif all(select.columns == ['site', 'mtype']):
                site_check = any([
                    set([site, mtype
                         ]) == set([select.loc[i].site, select.loc[i].mtype])
                    for i in select.index
                ])
                if not site_check:
                    continue


#        units = elem.find('DataSource').find('ItemInfo').find('Units').text
        site_data = [j.text.split() for j in elem.find('Data').findall('V')]

        ## Convert to dataframe
        o2 = DataFrame(site_data, columns=['date', 'val'])
        o2.loc[:, ['date', 'val']] = o2.loc[:, ['date', 'val']].astype(float)
        o2.loc[:, 'date'] = to_datetime(o2.loc[:, 'date'] - rem_s, unit='s')
        o2.set_index('date', inplace=True)

        ## Make corrections
        corr_index = (corr1.orig_site == site) & (corr1.orig_mtype == mtype)
        if any(corr_index):
            site, mtype = corr1.loc[
                corr_index, ['new_site', 'new_mtype']].values.tolist()[0]

        ## Clear element from memory
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        ## Do stats
        stats1 = ht_fun(o2, mtype, site)

        ## Add additional site specific columns/data
        #        stats1.loc[:, 'site'] = site
        #        stats1.loc[:, 'mtype'] = mtype
        #        stats1.loc[:, 'units'] = units

        ## Append
        results1.append(stats1)

    ### Combine data
    df_out = concat(results1)

    ### Return
    return df_out
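A hedged sketch of an ht_fun matching the documented contract (the file name is hypothetical); each per-site frame it returns is concatenated into the final result:

def site_mean(ts, mtype, site):
    # ts is the date-indexed frame built above, with a 'val' column.
    return DataFrame({'site': [site], 'mtype': [mtype],
                      'mean_val': [ts['val'].mean()]})

stats = parse_ht_xml('hilltop_export.xml', site_mean)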
Example #33
    def test_pivot_timegrouper(self):
        df = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime.datetime(2013, 1, 1),
                datetime.datetime(2013, 1, 1),
                datetime.datetime(2013, 10, 1),
                datetime.datetime(2013, 10, 2),
                datetime.datetime(2013, 10, 1),
                datetime.datetime(2013, 10, 2),
                datetime.datetime(2013, 12, 2),
                datetime.datetime(2013, 12, 2),
            ]
        }).set_index('Date')

        expected = DataFrame(np.array([10, 18, 3],
                                      dtype='int64').reshape(1, 3),
                             index=[datetime.datetime(2013, 12, 31)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df,
                             index=Grouper(freq='A'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='A'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        expected = DataFrame(np.array([1, np.nan, 3, 9, 18,
                                       np.nan]).reshape(2, 3),
                             index=[
                                 datetime.datetime(2013, 1, 1),
                                 datetime.datetime(2013, 7, 1)
                             ],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df,
                             index=Grouper(freq='6MS'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(df,
                             index=Grouper(freq='6MS', key='Date'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS', key='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            KeyError, lambda: pivot_table(df,
                                          index=Grouper(freq='6MS', key='foo'),
                                          columns='Buyer',
                                          values='Quantity',
                                          aggfunc=np.sum))
        self.assertRaises(
            KeyError,
            lambda: pivot_table(df,
                                index='Buyer',
                                columns=Grouper(freq='6MS', key='foo'),
                                values='Quantity',
                                aggfunc=np.sum))

        # passing the level
        df = df.set_index('Date')
        result = pivot_table(df,
                             index=Grouper(freq='6MS', level='Date'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS', level='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            ValueError,
            lambda: pivot_table(df,
                                index=Grouper(freq='6MS', level='foo'),
                                columns='Buyer',
                                values='Quantity',
                                aggfunc=np.sum))
        self.assertRaises(
            ValueError,
            lambda: pivot_table(df,
                                index='Buyer',
                                columns=Grouper(freq='6MS', level='foo'),
                                values='Quantity',
                                aggfunc=np.sum))

        # double grouper
        df = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime.datetime(2013, 11, 1, 13, 0),
                datetime.datetime(2013, 9, 1, 13, 5),
                datetime.datetime(2013, 10, 1, 20, 0),
                datetime.datetime(2013, 10, 2, 10, 0),
                datetime.datetime(2013, 11, 1, 20, 0),
                datetime.datetime(2013, 10, 2, 10, 0),
                datetime.datetime(2013, 10, 2, 12, 0),
                datetime.datetime(2013, 12, 5, 14, 0)
            ],
            'PayDay': [
                datetime.datetime(2013, 10, 4, 0, 0),
                datetime.datetime(2013, 10, 15, 13, 5),
                datetime.datetime(2013, 9, 5, 20, 0),
                datetime.datetime(2013, 11, 2, 10, 0),
                datetime.datetime(2013, 10, 7, 20, 0),
                datetime.datetime(2013, 9, 5, 10, 0),
                datetime.datetime(2013, 12, 30, 12, 0),
                datetime.datetime(2013, 11, 20, 14, 0),
            ]
        })

        result = pivot_table(df,
                             index=Grouper(freq='M', key='Date'),
                             columns=Grouper(freq='M', key='PayDay'),
                             values='Quantity',
                             aggfunc=np.sum)
        expected = DataFrame(np.array([
            np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan,
            np.nan, np.nan, np.nan, 3, np.nan
        ]).reshape(4, 4),
                             index=[
                                 datetime.datetime(2013, 9, 30),
                                 datetime.datetime(2013, 10, 31),
                                 datetime.datetime(2013, 11, 30),
                                 datetime.datetime(2013, 12, 31)
                             ],
                             columns=[
                                 datetime.datetime(2013, 9, 30),
                                 datetime.datetime(2013, 10, 31),
                                 datetime.datetime(2013, 11, 30),
                                 datetime.datetime(2013, 12, 31)
                             ])
        expected.index.name = 'Date'
        expected.columns.name = 'PayDay'

        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index=Grouper(freq='M', key='PayDay'),
                             columns=Grouper(freq='M', key='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        tuples = [
            (datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)),
            (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)),
            (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)),
            (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)),
            (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)),
            (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),
        ]
        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
        expected = DataFrame(np.array(
            [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan,
             3]).reshape(6, 2),
                             index=idx,
                             columns=['A', 'B'])
        expected.columns.name = 'Branch'

        result = pivot_table(df,
                             index=[
                                 Grouper(freq='M', key='Date'),
                                 Grouper(freq='M', key='PayDay')
                             ],
                             columns=['Branch'],
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index=['Branch'],
                             columns=[
                                 Grouper(freq='M', key='Date'),
                                 Grouper(freq='M', key='PayDay')
                             ],
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)
Example #34
    def test_set_index_datetime(self):
        # GH#3950
        df = DataFrame({
            "label": ["a", "a", "a", "b", "b", "b"],
            "datetime": [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            "value":
            range(6),
        })
        df.index = to_datetime(df.pop("datetime"), utc=True)
        df.index = df.index.tz_convert("US/Pacific")

        expected = DatetimeIndex(
            [
                "2011-07-19 07:00:00", "2011-07-19 08:00:00",
                "2011-07-19 09:00:00"
            ],
            name="datetime",
        )
        expected = expected.tz_localize("UTC").tz_convert("US/Pacific")

        df = df.set_index("label", append=True)
        tm.assert_index_equal(df.index.levels[0], expected)
        tm.assert_index_equal(df.index.levels[1],
                              Index(["a", "b"], name="label"))
        assert df.index.names == ["datetime", "label"]

        df = df.swaplevel(0, 1)
        tm.assert_index_equal(df.index.levels[0],
                              Index(["a", "b"], name="label"))
        tm.assert_index_equal(df.index.levels[1], expected)
        assert df.index.names == ["label", "datetime"]

        df = DataFrame(np.random.random(6))
        idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Eastern",
        )
        idx2 = DatetimeIndex(
            [
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
            ],
            tz="US/Eastern",
        )
        idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
        idx3 = idx3._with_freq(None)

        df = df.set_index(idx1)
        df = df.set_index(idx2, append=True)
        df = df.set_index(idx3, append=True)

        expected1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00", "2011-07-19 08:00:00",
                "2011-07-19 09:00:00"
            ],
            tz="US/Eastern",
        )
        expected2 = DatetimeIndex(["2012-04-01 09:00", "2012-04-02 09:00"],
                                  tz="US/Eastern")

        tm.assert_index_equal(df.index.levels[0], expected1)
        tm.assert_index_equal(df.index.levels[1], expected2)
        tm.assert_index_equal(df.index.levels[2], idx3)

        # GH#7092
        tm.assert_index_equal(df.index.get_level_values(0), idx1)
        tm.assert_index_equal(df.index.get_level_values(1), idx2)
        tm.assert_index_equal(df.index.get_level_values(2), idx3)
Example #35
def apply2features(df: pd.DataFrame, features: List,
                   processor: Callable) -> pd.DataFrame:
    not_features = [col for col in df.columns if col not in features]
    return processor(df.set_index(not_features)).reset_index()
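A small usage sketch with made-up data: the non-feature columns are parked in the index so the processor only ever sees the feature columns, then restored afterwards:

import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'group': ['a', 'b'],
                   'x': [1.0, 3.0], 'y': [2.0, 6.0]})

# Standardize x and y while id/group pass through untouched.
out = apply2features(df, ['x', 'y'],
                     lambda d: (d - d.mean()) / d.std())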
Example #36
    def test_to_latex_multiindex(self):
        df = DataFrame({("x", "y"): ["a"]})
        result = df.to_latex()
        expected = r"""\begin{tabular}{ll}
\toprule
{} &  x \\
{} &  y \\
\midrule
0 &  a \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        result = df.T.to_latex()
        expected = r"""\begin{tabular}{lll}
\toprule
  &   &  0 \\
\midrule
x & y &  a \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        df = DataFrame.from_dict({
            ("c1", 0):
            pd.Series({x: x
                       for x in range(4)}),
            ("c1", 1):
            pd.Series({x: x + 4
                       for x in range(4)}),
            ("c2", 0):
            pd.Series({x: x
                       for x in range(4)}),
            ("c2", 1):
            pd.Series({x: x + 4
                       for x in range(4)}),
            ("c3", 0):
            pd.Series({x: x
                       for x in range(4)}),
        }).T
        result = df.to_latex()
        expected = r"""\begin{tabular}{llrrrr}
\toprule
   &   &  0 &  1 &  2 &  3 \\
\midrule
c1 & 0 &  0 &  1 &  2 &  3 \\
   & 1 &  4 &  5 &  6 &  7 \\
c2 & 0 &  0 &  1 &  2 &  3 \\
   & 1 &  4 &  5 &  6 &  7 \\
c3 & 0 &  0 &  1 &  2 &  3 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        # GH 14184
        df = df.T
        df.columns.names = ["a", "b"]
        result = df.to_latex()
        expected = r"""\begin{tabular}{lrrrrr}
\toprule
a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
b &  0 &  1 &  0 &  1 &  0 \\
\midrule
0 &  0 &  4 &  0 &  4 &  0 \\
1 &  1 &  5 &  1 &  5 &  1 \\
2 &  2 &  6 &  2 &  6 &  2 \\
3 &  3 &  7 &  3 &  7 &  3 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

        # GH 10660
        df = pd.DataFrame({
            "a": [0, 0, 1, 1],
            "b": list("abab"),
            "c": [1, 2, 3, 4]
        })
        result = df.set_index(["a", "b"]).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
  &   &  c \\
a & b &    \\
\midrule
0 & a &  1 \\
  & b &  2 \\
1 & a &  3 \\
  & b &  4 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        result = df.groupby("a").describe().to_latex()
        expected = r"""\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{8}{l}{c} \\
{} & count & mean &       std &  min &   25\% &  50\% &   75\% &  max \\
a &       &      &           &      &       &      &       &      \\
\midrule
0 &   2.0 &  1.5 &  0.707107 &  1.0 &  1.25 &  1.5 &  1.75 &  2.0 \\
1 &   2.0 &  3.5 &  0.707107 &  3.0 &  3.25 &  3.5 &  3.75 &  4.0 \\
\bottomrule
\end{tabular}
"""

        assert result == expected
Example #37
    def test_set_index2(self):
        df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                        'B': ['one', 'two', 'three', 'one', 'two'],
                        'C': ['a', 'b', 'c', 'd', 'e'],
                        'D': np.random.randn(5),
                        'E': np.random.randn(5)})

        # new object, single-column
        result = df.set_index('C')
        result_nodrop = df.set_index('C', drop=False)

        index = Index(df['C'], name='C')

        expected = df.loc[:, ['A', 'B', 'D', 'E']]
        expected.index = index

        expected_nodrop = df.copy()
        expected_nodrop.index = index

        assert_frame_equal(result, expected)
        assert_frame_equal(result_nodrop, expected_nodrop)
        self.assertEqual(result.index.name, index.name)

        # inplace, single
        df2 = df.copy()

        df2.set_index('C', inplace=True)

        assert_frame_equal(df2, expected)

        df3 = df.copy()
        df3.set_index('C', drop=False, inplace=True)

        assert_frame_equal(df3, expected_nodrop)

        # create new object, multi-column
        result = df.set_index(['A', 'B'])
        result_nodrop = df.set_index(['A', 'B'], drop=False)

        index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B'])

        expected = df.loc[:, ['C', 'D', 'E']]
        expected.index = index

        expected_nodrop = df.copy()
        expected_nodrop.index = index

        assert_frame_equal(result, expected)
        assert_frame_equal(result_nodrop, expected_nodrop)
        self.assertEqual(result.index.names, index.names)

        # inplace
        df2 = df.copy()
        df2.set_index(['A', 'B'], inplace=True)
        assert_frame_equal(df2, expected)

        df3 = df.copy()
        df3.set_index(['A', 'B'], drop=False, inplace=True)
        assert_frame_equal(df3, expected_nodrop)

        # corner case
        with assertRaisesRegexp(ValueError, 'Index has duplicate keys'):
            df.set_index('A', verify_integrity=True)

        # append
        result = df.set_index(['A', 'B'], append=True)
        xp = df.reset_index().set_index(['index', 'A', 'B'])
        xp.index.names = [None, 'A', 'B']
        assert_frame_equal(result, xp)

        # append to existing multiindex
        rdf = df.set_index(['A'], append=True)
        rdf = rdf.set_index(['B', 'C'], append=True)
        expected = df.set_index(['A', 'B', 'C'], append=True)
        assert_frame_equal(rdf, expected)

        # Series
        result = df.set_index(df.C)
        self.assertEqual(result.index.name, 'C')
Example #38
def _data_to_frame(data, header, index_col, infer_types, skiprows):
    """Parse a BeautifulSoup table into a DataFrame.

    Parameters
    ----------
    data : tuple of lists
        The raw data to be placed into a DataFrame. This is a list of lists of
        strings or unicode. If it helps, it can be thought of as a matrix of
        strings instead.

    header : int or None
        An integer indicating the row to use for the column header or None
        indicating no header will be used.

    index_col : int or None
        An integer indicating the column to use for the index or None
        indicating no column will be used.

    infer_types : bool
        Whether to convert numbers and dates.

    skiprows : collections.Container or int or slice
        Iterable used to skip rows.

    Returns
    -------
    df : DataFrame
        A DataFrame containing the data from `data`

    Raises
    ------
    ValueError
        * If `skiprows` is not found in the rows of the parsed DataFrame.

    See Also
    --------
    read_html

    Notes
    -----
    The `data` parameter is guaranteed not to be a list of empty lists.
    """
    thead, tbody, tfoot = data
    columns = thead or None
    df = DataFrame(tbody, columns=columns)

    if skiprows is not None:
        it = _get_skiprows_iter(skiprows)

        try:
            df = df.drop(it)
        except ValueError:
            raise ValueError('Labels {0} not found when trying to skip'
                             ' rows'.format(it))

    # convert to numbers/dates where possible
    # must be sequential since dates trump numbers if both args are given
    if infer_types:
        df = df.convert_objects(convert_numeric=True)
        df = df.convert_objects(convert_dates='coerce')

    if header is not None:
        header_rows = df.iloc[header]

        if header_rows.ndim == 2:
            names = header_rows.index
            df.columns = MultiIndex.from_arrays(header_rows.values,
                                                names=names)
        else:
            df.columns = header_rows

        df = df.drop(df.index[header])

    if index_col is not None:
        cols = df.columns[index_col]

        try:
            cols = cols.tolist()
        except AttributeError:
            pass

        # drop by default
        df.set_index(cols, inplace=True)
        if df.index.nlevels == 1:
            if isnull(df.index.name) or not df.index.name:
                df.index.name = None
        else:
            names = [name or None for name in df.index.names]
            df.index = MultiIndex.from_tuples(df.index.values, names=names)

    return df
Example #39
def compute_fractal(begin_date, end_date):
    codes = get_all_codes()
    # codes = ['000151']

    # Compute the signal for each stock
    for index, code in enumerate(codes):
        try:
            # Fetch the back-adjusted (hfq) prices and use them to compute the fractal signal
            daily_cursor = DB_CONN['daily_hfq'].find(
                {
                    'code': code,
                    'date': {
                        '$gte': begin_date,
                        '$lte': end_date
                    }
                },
                sort=[('date', ASCENDING)],
                projection={
                    'date': True,
                    'high': True,
                    'low': True,
                    '_id': False
                })

            df_daily = DataFrame([daily for daily in daily_cursor])

            # use the date as the index
            df_daily.set_index(['date'], inplace=True)

            # use shift to align the two days before and the two days after with the middle day
            df_daily_shift_1 = df_daily.shift(1)
            df_daily_shift_2 = df_daily.shift(2)
            df_daily_shift_3 = df_daily.shift(3)
            df_daily_shift_4 = df_daily.shift(4)

            # top fractal: the middle day's high is greater than the highs of the two days before and the two days after
            df_daily['up'] = (df_daily_shift_3['high'] > df_daily_shift_1['high']) & \
                             (df_daily_shift_3['high'] > df_daily_shift_2['high']) & \
                             (df_daily_shift_3['high'] > df_daily_shift_4['high']) & \
                             (df_daily_shift_3['high'] > df_daily['high'])

            # bottom fractal: the middle day's low is lower than the lows of the two days before and the two days after
            df_daily['down'] = (df_daily_shift_3['low'] < df_daily_shift_1['low']) & \
                               (df_daily_shift_3['low'] < df_daily_shift_2['low']) & \
                               (df_daily_shift_3['low'] < df_daily_shift_4['low']) & \
                               (df_daily_shift_3['low'] < df_daily['low'])

            # keep only the dates on which a top or bottom fractal appears; discard all other rows
            df_daily = df_daily[(df_daily['up'] | df_daily['down'])]

            # drop the columns that are no longer needed
            df_daily.drop(['high', 'low'], axis=1, inplace=True)
            # print(df_daily)
            '''
                           up   down
            date
            2019-05-15  False   True
            2019-05-16   True  False
            2019-05-20   True  False
            2019-05-23  False   True
            '''

            # save the signals to the database
            update_requests = []
            # each saved document holds the code, the date, and the signal direction
            for date in df_daily.index:
                doc = {
                    'code': code,
                    'date': date,
                    # up: top fractal, down: bottom fractal
                    'direction': 'up' if df_daily.loc[date]['up'] else 'down'
                }

                # the upsert filter uses code, date and direction, so an index on these three fields is needed:
                # db.fractal_signal.createIndex({'code': 1, 'date': 1, 'direction': 1})
                update_requests.append(
                    UpdateOne(doc, {'$set': doc}, upsert=True))

            if len(update_requests) > 0:
                update_result = DB_CONN['fractal'].bulk_write(update_requests,
                                                              ordered=False)
                print('Save Fractal, #%d, code: %s, inserted: %4d, updated: %4d' %
                      (index + 1, code, update_result.upserted_count,
                       update_result.modified_count),
                      flush=True)
        except Exception:
            print('Error occurred: %s' % code, flush=True)
            traceback.print_exc()
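
# --- Illustrative sketch (toy data, not from the original source) ---
# The same fractal test written with symmetric shifts: a middle bar is a top
# (bottom) fractal when its high (low) beats the two bars on each side.
import pandas as pd

toy = pd.DataFrame({'high': [10, 12, 15, 11, 9, 13, 16, 14, 12, 11],
                    'low': [8, 10, 13, 9, 7, 11, 14, 12, 10, 9]})
h, lo = toy['high'], toy['low']
toy['up'] = (h > h.shift(1)) & (h > h.shift(2)) & \
            (h > h.shift(-1)) & (h > h.shift(-2))
toy['down'] = (lo < lo.shift(1)) & (lo < lo.shift(2)) & \
              (lo < lo.shift(-1)) & (lo < lo.shift(-2))
print(toy[toy['up'] | toy['down']])   # only the fractal bars remain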
Пример #40
0
    def test_equals(self):
        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
        s2 = s1.copy()
        assert s1.equals(s2)

        s1[1] = 99
        assert not s1.equals(s2)

        # NaNs compare as equal
        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
        s2 = s1.copy()
        assert s1.equals(s2)

        s2[0] = 9.9
        assert not s1.equals(s2)

        idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
        s1 = Series([1, 2, np.nan], index=idx)
        s2 = s1.copy()
        assert s1.equals(s2)

        # Add object dtype column with nans
        index = np.random.random(10)
        df1 = DataFrame(np.random.random(10, ),
                        index=index,
                        columns=['floats'])
        df1['text'] = 'the sky is so blue. we could use more chocolate.'.split(
        )
        df1['start'] = date_range('2000-1-1', periods=10, freq='T')
        df1['end'] = date_range('2000-1-1', periods=10, freq='D')
        df1['diff'] = df1['end'] - df1['start']
        df1['bool'] = (np.arange(10) % 3 == 0)
        df1.loc[::2] = np.nan
        df2 = df1.copy()
        assert df1['text'].equals(df2['text'])
        assert df1['start'].equals(df2['start'])
        assert df1['end'].equals(df2['end'])
        assert df1['diff'].equals(df2['diff'])
        assert df1['bool'].equals(df2['bool'])
        assert df1.equals(df2)
        assert not df1.equals(object)

        # different dtype
        different = df1.copy()
        different['floats'] = different['floats'].astype('float32')
        assert not df1.equals(different)

        # different index
        different_index = -index
        different = df2.set_index(different_index)
        assert not df1.equals(different)

        # different columns
        different = df2.copy()
        different.columns = df2.columns[::-1]
        assert not df1.equals(different)

        # DatetimeIndex
        index = pd.date_range('2000-1-1', periods=10, freq='T')
        df1 = df1.set_index(index)
        df2 = df1.copy()
        assert df1.equals(df2)

        # MultiIndex
        df3 = df1.set_index(['text'], append=True)
        df2 = df1.set_index(['text'], append=True)
        assert df3.equals(df2)

        df2 = df1.set_index(['floats'], append=True)
        assert not df3.equals(df2)

        # NaN in index
        df3 = df1.set_index(['floats'], append=True)
        df2 = df1.set_index(['floats'], append=True)
        assert df3.equals(df2)

        # GH 8437
        a = pd.Series([False, np.nan])
        b = pd.Series([False, np.nan])
        c = pd.Series(index=range(2))
        d = pd.Series(index=range(2))
        e = pd.Series(index=range(2))
        f = pd.Series(index=range(2))
        c[:-1] = d[:-1] = e[0] = f[0] = False
        assert a.equals(a)
        assert a.equals(b)
        assert a.equals(c)
        assert a.equals(d)
        assert a.equals(e)
        assert e.equals(f)
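
# --- Illustrative note (not from the original source) ---
# The NaN cases above are the key difference between equals() and ==:
import numpy as np
import pandas as pd

s1 = pd.Series([1.0, np.nan])
s2 = pd.Series([1.0, np.nan])
print((s1 == s2).all())   # False: NaN != NaN under elementwise comparison
print(s1.equals(s2))      # True: equals() treats NaNs in matching spots as equal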
Пример #41
0
    def __init__(self,
                 ps: ParamStore,
                 camp: str,
                 profile: pd.DataFrame,
                 profile_override_dict={}):
        self.ps = ps
        self.camp = camp
        disease_params = ps.get_disease_params()
        camp_params = ps.get_camp_params(camp)
        # ------------------------------------------------------------
        # disease params
        parameter_csv = disease_params
        model_params = parameter_csv[parameter_csv['Type'] ==
                                     'Model Parameter']
        model_params = model_params.loc[:, ['Name', 'Value']]
        control_data = parameter_csv[parameter_csv['Type'] == 'Control']
        self.model_params = model_params

        profile.set_index('Parameter', inplace=True)

        self.number_of_people_in_isoboxes = int(
            profile.loc['number_of_people_in_isoboxes', 'Value'])
        self.number_of_people_in_one_isobox = int(
            profile.loc['number_of_people_in_one_isobox', 'Value'])
        self.number_of_isoboxes = self.number_of_people_in_isoboxes / \
            self.number_of_people_in_one_isobox

        self.number_of_people_in_tents = int(
            profile.loc['number_of_people_in_tents', 'Value'])
        self.number_of_people_in_one_tent = int(
            profile.loc['number_of_people_in_one_tent', 'Value'])
        self.number_of_tents = self.number_of_people_in_tents / \
            self.number_of_people_in_one_tent

        self.total_population = self.number_of_people_in_isoboxes + \
            self.number_of_people_in_tents
        # float(profile.loc['permanently_asymptomatic_cases','Value'])
        self.permanently_asymptomatic_cases = 0.179
        self.age_and_gender = abm.read_age_gender(self.total_population)

        # float(profile.loc['area_covered_by_isoboxes','Value'])
        self.area_covered_by_isoboxes = 0.5
        # float(profile.loc['relative_strength_of_interaction','Value'])
        self.relative_strength_of_interaction = 0.2

        # float(profile.loc['smaller_movement_radius','Value'])
        self.smaller_movement_radius = 0.02
        # float(profile.loc['larger_movement_radius','Value'])
        self.larger_movement_radius = 0.1
        # float(profile.loc['overlapping_rages_radius','Value'])
        self.overlapping_rages_radius = 0.02

        self.number_of_steps = int(profile.loc['number_of_steps', 'Value'])
        self.number_of_states = 14
        self.track_states = np.zeros(
            (self.number_of_steps, self.number_of_states))
        self.ACTIVATE_INTERVENTION = profile.loc['ACTIVATE_INTERVENTION',
                                                 'Value']
        # int(profile.loc['total_number_of_hospitalized','Value'])
        self.total_number_of_hospitalized = 0

        self.num_toilet_visit = int(profile.loc['num_toilet_visit', 'Value'])
        self.num_toilet_contact = int(profile.loc['num_toilet_contact',
                                                  'Value'])
        self.num_food_visit = int(profile.loc['num_food_visit', 'Value'])
        self.num_food_contact = int(profile.loc['num_food_contact', 'Value'])
        self.pct_food_visit = float(profile.loc['pct_food_visit', 'Value'])

        # float(profile.loc['transmission_reduction','Value'])
        self.transmission_reduction = 1

        # float(profile.loc['probability_infecting_person_in_household_per_day','Value'])
        self.probability_infecting_person_in_household_per_day = 0.33
        # float(profile.loc['probability_infecting_person_in_foodline_per_day','Value'])
        self.probability_infecting_person_in_foodline_per_day = 0.407
        # float(profile.loc['probability_infecting_person_in_toilet_per_day','Value'])
        self.probability_infecting_person_in_toilet_per_day = 0.099
        # float(profile.loc['probability_infecting_person_in_moving_per_day','Value'])
        self.probability_infecting_person_in_moving_per_day = 0.017

        # float(profile.loc['probability_spotting_symptoms_per_day','Value'])
        self.probability_spotting_symptoms_per_day = 0.05
        self.clearday = int(profile.loc['clearday', 'Value'])
        tb = profile.loc['toilets_blocks', 'Value'].split(',')
        self.toilets_blocks = (int(tb[0]), int(tb[1]))
        fb = profile.loc['foodline_blocks', 'Value'].split(',')
        self.foodline_blocks = (int(fb[0]), int(fb[1]))

        self.population = abm.form_population_matrix(
            self.total_population, self.number_of_isoboxes,
            self.number_of_people_in_isoboxes, self.number_of_tents,
            self.number_of_people_in_tents,
            self.permanently_asymptomatic_cases, self.age_and_gender)

        self.households_location = abm.place_households(
            self.population[:, 0].astype(int), self.area_covered_by_isoboxes,
            self.number_of_isoboxes)

        self.toilets_location, self.toilets_numbers, self.toilets_sharing = \
            abm.position_toilet(self.households_location,
                                self.toilets_blocks[0], self.toilets_blocks[1])
        self.foodpoints_location, self.foodpoints_numbers, self.foodpoints_sharing = \
            abm.position_foodline(
                self.households_location, self.foodline_blocks[0], self.foodline_blocks[1])
        self.ethnical_corellations = abm.create_ethnic_groups(
            self.households_location, self.relative_strength_of_interaction)
        self.local_interaction_space = abm.interaction_neighbours(
            self.households_location, self.smaller_movement_radius,
            self.larger_movement_radius, self.overlapping_rages_radius,
            self.ethnical_corellations)

        self.control_dict = {}
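
# --- Illustrative sketch (hypothetical values, not from the original source) ---
# The lookup pattern used throughout the constructor above: index the profile
# by 'Parameter', then read scalars with .loc[name, 'Value'].
import pandas as pd

profile = pd.DataFrame({'Parameter': ['number_of_steps', 'clearday'],
                        'Value': ['200', '7']})
profile.set_index('Parameter', inplace=True)
number_of_steps = int(profile.loc['number_of_steps', 'Value'])   # -> 200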
Пример #42
0
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
        Because :func:`DataFrame.to_json` uses the string 'index' to denote a
        name-less :class:`Index`, this function sets the name of the returned
        :class:`DataFrame` to ``None`` when said string is encountered with a
        normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
        applies to any strings beginning with 'level_'. Therefore, an
        :class:`Index` name of 'index' and :class:`MultiIndex` names starting
        with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    col_order = [field["name"] for field in table["schema"]["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in table["schema"]["fields"]
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data')

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                None if x.startswith("level_") else x for x in df.index.names
            ]

    return df
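
# --- Illustrative round trip (not from the original source) ---
# to_json(orient='table') emits the schema this function consumes; the index
# travels as the primaryKey and survives the round trip.
from io import StringIO

import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, index=pd.Index(['x', 'y'], name='key'))
s = df.to_json(orient='table')
roundtrip = pd.read_json(StringIO(s), orient='table')
assert roundtrip.equals(df) and roundtrip.index.name == 'key'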
Пример #43
0
    def test_set_index_nan(self):

        # GH 3586
        df = DataFrame({
            'PRuid': {
                17: 'nonQC',
                18: 'nonQC',
                19: 'nonQC',
                20: '10',
                21: '11',
                22: '12',
                23: '13',
                24: '24',
                25: '35',
                26: '46',
                27: '47',
                28: '48',
                29: '59',
                30: '10'
            },
            'QC': {
                17: 0.0,
                18: 0.0,
                19: 0.0,
                20: np.nan,
                21: np.nan,
                22: np.nan,
                23: np.nan,
                24: 1.0,
                25: np.nan,
                26: np.nan,
                27: np.nan,
                28: np.nan,
                29: np.nan,
                30: np.nan
            },
            'data': {
                17: 7.9544899999999998,
                18: 8.0142609999999994,
                19: 7.8591520000000008,
                20: 0.86140349999999999,
                21: 0.87853110000000001,
                22: 0.8427041999999999,
                23: 0.78587700000000005,
                24: 0.73062459999999996,
                25: 0.81668560000000001,
                26: 0.81927080000000008,
                27: 0.80705009999999999,
                28: 0.81440240000000008,
                29: 0.80140849999999997,
                30: 0.81307740000000006
            },
            'year': {
                17: 2006,
                18: 2007,
                19: 2008,
                20: 1985,
                21: 1985,
                22: 1985,
                23: 1985,
                24: 1985,
                25: 1985,
                26: 1985,
                27: 1985,
                28: 1985,
                29: 1985,
                30: 1986
            }
        }).reset_index()

        result = df.set_index(['year', 'PRuid',
                               'QC']).reset_index().reindex(columns=df.columns)
        tm.assert_frame_equal(result, df)
Пример #44
0
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        tm.assertIsInstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            pd.tseries.tools.to_datetime(['2013-1-1 13:00',
                                          '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        self.assertEqual(result.name, 'B')

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values).copy()
        comp.tz = None
        self.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        self.assertEqual(result.name, 'D')

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011', periods=5,
                                freq='D', tz=tz, name='idx')
            df = pd.DataFrame(
                {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

            expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                             datetime(2011, 1, 2),
                                             datetime(2011, 1, 3),
                                             datetime(2011, 1, 4),
                                             datetime(2011, 1, 5)],
                                     'a': range(5),
                                     'b': ['A', 'B', 'C', 'D', 'E']},
                                    columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)
Пример #45
0
    def test_to_csv_quoting(self):
        df = DataFrame({
            'c_bool': [True, False],
            'c_float': [1.0, 3.2],
            'c_int': [42, np.nan],
            'c_string': ['a', 'b,c'],
        })

        expected_rows = [
            ',c_bool,c_float,c_int,c_string', '0,True,1.0,42.0,a',
            '1,False,3.2,,"b,c"'
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        result = df.to_csv()
        assert result == expected

        result = df.to_csv(quoting=None)
        assert result == expected

        expected_rows = [
            ',c_bool,c_float,c_int,c_string', '0,True,1.0,42.0,a',
            '1,False,3.2,,"b,c"'
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
        assert result == expected

        expected_rows = [
            '"","c_bool","c_float","c_int","c_string"',
            '"0","True","1.0","42.0","a"', '"1","False","3.2","","b,c"'
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        result = df.to_csv(quoting=csv.QUOTE_ALL)
        assert result == expected

        # see gh-12922, gh-13259: make sure changes to
        # the formatters do not break this behaviour
        expected_rows = [
            '"","c_bool","c_float","c_int","c_string"', '0,True,1.0,42.0,"a"',
            '1,False,3.2,"","b,c"'
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
        assert result == expected

        msg = "need to escape, but no escapechar set"
        with pytest.raises(csv.Error, match=msg):
            df.to_csv(quoting=csv.QUOTE_NONE)

        with pytest.raises(csv.Error, match=msg):
            df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None)

        expected_rows = [
            ',c_bool,c_float,c_int,c_string', '0,True,1.0,42.0,a',
            '1,False,3.2,,b!,c'
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar='!')
        assert result == expected

        expected_rows = [
            ',c_bool,c_ffloat,c_int,c_string', '0,True,1.0,42.0,a',
            '1,False,3.2,,bf,c'
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar='f')
        assert result == expected
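        # Note on the 'ff' in 'c_ffloat' above: with QUOTE_NONE the csv module
        # must escape the escapechar itself, so the lone 'f' in 'c_float' is
        # doubled on output.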

        # see gh-3503: quoting Windows line terminators
        # presents with encoding?
        text_rows = ['a,b,c', '1,"test \r\n",3']
        text = tm.convert_rows_list_to_csv_str(text_rows)
        df = pd.read_csv(StringIO(text))

        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        assert buf.getvalue() == text

        # xref gh-7791: make sure the quoting parameter is passed through
        # with multi-indexes
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])

        expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
Пример #46
0
import numpy as np
from pandas import Series, DataFrame

data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data.index
data['b']
data['b':'c']
data.loc[['b', 'd']]
data.unstack()
data.unstack().stack()

frame = DataFrame(np.arange(12).reshape((4,3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])

frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame['Ohio']

#Reordering and sorting levels
frame.swaplevel('key1', 'key2')
#summary by level
frame.sum(level = 'key2')
frame.sum(level = 'color', axis=1)

#Using a dataframe's columns
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})

frame2 = frame.set_index(['c','d'])
frame.set_index(['c','d'], drop=False)
frame2.reset_index()
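# Note (not in the original): with drop=False the index columns are kept in
# the frame as well, so frame.set_index(['c', 'd'], drop=False) still exposes
# 'c' and 'd' as regular columns alongside the new MultiIndex levels.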
Пример #47
0
def test_groupby_resample_interpolate():
    # GH 35325
    d = {"price": [10, 11, 9], "volume": [50, 60, 50]}

    df = DataFrame(d)

    df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")

    result = (
        df.set_index("week_starting")
        .groupby("volume")
        .resample("1D")
        .interpolate(method="linear")
    )

    msg = "containing strings is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        expected_ind = pd.MultiIndex.from_tuples(
            [
                (50, "2018-01-07"),
                (50, Timestamp("2018-01-08")),
                (50, Timestamp("2018-01-09")),
                (50, Timestamp("2018-01-10")),
                (50, Timestamp("2018-01-11")),
                (50, Timestamp("2018-01-12")),
                (50, Timestamp("2018-01-13")),
                (50, Timestamp("2018-01-14")),
                (50, Timestamp("2018-01-15")),
                (50, Timestamp("2018-01-16")),
                (50, Timestamp("2018-01-17")),
                (50, Timestamp("2018-01-18")),
                (50, Timestamp("2018-01-19")),
                (50, Timestamp("2018-01-20")),
                (50, Timestamp("2018-01-21")),
                (60, Timestamp("2018-01-14")),
            ],
            names=["volume", "week_starting"],
        )

    expected = DataFrame(
        data={
            "price": [
                10.0,
                9.928571428571429,
                9.857142857142858,
                9.785714285714286,
                9.714285714285714,
                9.642857142857142,
                9.571428571428571,
                9.5,
                9.428571428571429,
                9.357142857142858,
                9.285714285714286,
                9.214285714285714,
                9.142857142857142,
                9.071428571428571,
                9.0,
                11.0,
            ],
            "volume": [50.0] * 15 + [60],
        },
        index=expected_ind,
    )
    tm.assert_frame_equal(result, expected)
Пример #48
0
    def test_reset_index_level(self):
        df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"])

        for levels in ["A", "B"], [0, 1]:
            # With MultiIndex
            result = df.set_index(["A", "B"]).reset_index(level=levels[0])
            tm.assert_frame_equal(result, df.set_index("B"))

            result = df.set_index(["A", "B"]).reset_index(level=levels[:1])
            tm.assert_frame_equal(result, df.set_index("B"))

            result = df.set_index(["A", "B"]).reset_index(level=levels)
            tm.assert_frame_equal(result, df)

            result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True)
            tm.assert_frame_equal(result, df[["C", "D"]])

            # With single-level Index (GH 16263)
            result = df.set_index("A").reset_index(level=levels[0])
            tm.assert_frame_equal(result, df)

            result = df.set_index("A").reset_index(level=levels[:1])
            tm.assert_frame_equal(result, df)

            result = df.set_index(["A"]).reset_index(level=levels[0], drop=True)
            tm.assert_frame_equal(result, df[["B", "C", "D"]])

        # Missing levels - for both MultiIndex and single-level Index:
        for idx_lev in ["A", "B"], ["A"]:
            with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"):
                df.set_index(idx_lev).reset_index(level=["A", "E"])
            with pytest.raises(IndexError, match="Too many levels"):
                df.set_index(idx_lev).reset_index(level=[0, 1, 2])
Пример #49
0
#print(rows)

conn.close()


# In[46]:

import pandas as pd
from pandas import Series, DataFrame
db1=DataFrame(db)

db1['datetime'] = db1['date_index'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d%H%M%S'))  # build a 'datetime' column that will become the index of db1
# .apply(lambda x: ...) means: take each value x and transform it as specified
# convert x from the %Y%m%d%H%M%S string format to a datetime object with pandas' to_datetime
db1['message_num'] = 1   # helper column used to sum up the number of messages
db1.set_index(db1['datetime'], inplace=True)  # use the datetime column as the index

db1 = db1.drop('datetime', axis=1)  # drop the original datetime column, now redundant
db1 = db1.drop('date_index', axis=1)
db1 = db1.drop('date', axis=1)
db1
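
# Illustrative follow-up (not in the original): with the DatetimeIndex in
# place, per-hour message counts become a one-liner, e.g.
# db1['message_num'].resample('H').sum()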


# <a id='the_destination2'></a>
# ## 1.2 Editing the DB index

# In[47]:

import pymysql.cursors
import numpy as np
conn = pymysql.connect(host='169.56.124.93', user='******' , password='******', charset='utf8')
Пример #50
0
    def test_multiindex_assignment(self):

        # GH3777 part 2

        # mixed dtype
        df = DataFrame(
            np.random.randint(5, 10, size=9).reshape(3, 3),
            columns=list("abc"),
            index=[[4, 4, 8], [8, 10, 12]],
        )
        df["d"] = np.nan
        arr = np.array([0.0, 1.0])

        df.loc[4, "d"] = arr
        tm.assert_series_equal(df.loc[4, "d"],
                               Series(arr, index=[8, 10], name="d"))

        # single dtype
        df = DataFrame(
            np.random.randint(5, 10, size=9).reshape(3, 3),
            columns=list("abc"),
            index=[[4, 4, 8], [8, 10, 12]],
        )

        df.loc[4, "c"] = arr
        exp = Series(arr, index=[8, 10], name="c", dtype="float64")
        tm.assert_series_equal(df.loc[4, "c"], exp)

        # scalar ok
        df.loc[4, "c"] = 10
        exp = Series(10, index=[8, 10], name="c", dtype="float64")
        tm.assert_series_equal(df.loc[4, "c"], exp)

        # invalid assignments
        with pytest.raises(ValueError):
            df.loc[4, "c"] = [0, 1, 2, 3]

        with pytest.raises(ValueError):
            df.loc[4, "c"] = [0]

        # groupby example
        NUM_ROWS = 100
        NUM_COLS = 10
        col_names = [
            "A" + num for num in map(str,
                                     np.arange(NUM_COLS).tolist())
        ]
        index_cols = col_names[:5]

        df = DataFrame(
            np.random.randint(5, size=(NUM_ROWS, NUM_COLS)),
            dtype=np.int64,
            columns=col_names,
        )
        df = df.set_index(index_cols).sort_index()
        grp = df.groupby(level=index_cols[:4])
        df["new_col"] = np.nan

        f_index = np.arange(5)

        def f(name, df2):
            return Series(np.arange(df2.shape[0]),
                          name=df2.index.values[0]).reindex(f_index)

        # TODO(wesm): unused?
        # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T

        # we are actually operating on a copy here
        # but in this case, that's ok
        for name, df2 in grp:
            new_vals = np.arange(df2.shape[0])
            df.loc[name, "new_col"] = new_vals
Пример #51
0
            corr_prob_ret_sim = np.append(corr_prob_ret_sim,
                                          ret_versus_prob,
                                          axis=0)
        sim += 1
        elapsed_time = time.time() - start_time
        print('Simulation time:', elapsed_time)

    return performance_modelo, ret_medio_ibov_sim, ret_medio_port_sim, ret_medio_port_long_sim, ret_medio_port_short_sim, corr_prob_ret_sim, datas_teste_sim


# load the IBOVESPA components and historical data
compomentes = ler_base_componetes()
base_total = carrega_dados()
datas = DataFrame(base_total['data'].drop_duplicates().values,
                  columns=['data'])
datas = datas.set_index(['data'])
datas = datas.sort_index(axis=0)
limit_inf = '19990202 18:00:000'
limit_sup = '20171230 18:00:000'
datas = datas.loc[limit_inf:limit_sup]
datas = datas.sort_index(axis=0)
datas = datas.reset_index(['data'])

# list of model variables (features) and of columns to log-transform (cols)
#features = ['data', 'codigo', 'retorno', 'acao_close', 'roe', 'pl', 'irf', 'sharpe', 'petroleo_close', 'dolar_close','dji_close', 'sp500_close', 'risco_brasil', 'ibov_fut_close']
#cols = ['acao_close', 'petroleo_close', 'dolar_close', 'dji_close', 'sp500_close', 'risco_brasil', 'ibov_fut_close']
features = [
    'data', 'codigo', 'retorno', 'acao_close', 'roe', 'sharpe', 'dolar_close',
    'sp500_close', 'ibov_fut_close'
]
cols = ['acao_close', 'dolar_close', 'sp500_close', 'ibov_fut_close']
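# Illustrative (not in the original): 'cols' is the list slated for the log
# transform, e.g. base_total[cols] = np.log(base_total[cols])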
Пример #52
0
    def test_set_index_nan(self):

        # GH 3586
        df = DataFrame(
            {
                "PRuid": {
                    17: "nonQC",
                    18: "nonQC",
                    19: "nonQC",
                    20: "10",
                    21: "11",
                    22: "12",
                    23: "13",
                    24: "24",
                    25: "35",
                    26: "46",
                    27: "47",
                    28: "48",
                    29: "59",
                    30: "10",
                },
                "QC": {
                    17: 0.0,
                    18: 0.0,
                    19: 0.0,
                    20: np.nan,
                    21: np.nan,
                    22: np.nan,
                    23: np.nan,
                    24: 1.0,
                    25: np.nan,
                    26: np.nan,
                    27: np.nan,
                    28: np.nan,
                    29: np.nan,
                    30: np.nan,
                },
                "data": {
                    17: 7.9544899999999998,
                    18: 8.0142609999999994,
                    19: 7.8591520000000008,
                    20: 0.86140349999999999,
                    21: 0.87853110000000001,
                    22: 0.8427041999999999,
                    23: 0.78587700000000005,
                    24: 0.73062459999999996,
                    25: 0.81668560000000001,
                    26: 0.81927080000000008,
                    27: 0.80705009999999999,
                    28: 0.81440240000000008,
                    29: 0.80140849999999997,
                    30: 0.81307740000000006,
                },
                "year": {
                    17: 2006,
                    18: 2007,
                    19: 2008,
                    20: 1985,
                    21: 1985,
                    22: 1985,
                    23: 1985,
                    24: 1985,
                    25: 1985,
                    26: 1985,
                    27: 1985,
                    28: 1985,
                    29: 1985,
                    30: 1986,
                },
            }
        ).reset_index()

        result = (
            df.set_index(["year", "PRuid", "QC"])
            .reset_index()
            .reindex(columns=df.columns)
        )
        tm.assert_frame_equal(result, df)
Пример #53
0
def QA_fetch_index_min(code,
                       start,
                       end,
                       format='numpy',
                       frequence='1min',
                       collections=DATABASE.index_min):
    'Fetch stock minute-bar data'
    if frequence in ['1min', '1m']:
        frequence = '1min'
    elif frequence in ['5min', '5m']:
        frequence = '5min'
    elif frequence in ['15min', '15m']:
        frequence = '15min'
    elif frequence in ['30min', '30m']:
        frequence = '30min'
    elif frequence in ['60min', '60m']:
        frequence = '60min'
    __data = []
    code = QA_util_code_tolist(code)
    cursor = collections.find(
        {
            'code': {
                '$in': code
            },
            "time_stamp": {
                "$gte": QA_util_time_stamp(start),
                "$lte": QA_util_time_stamp(end)
            },
            'type': frequence
        },
        batch_size=10000)
    if format in ['dict', 'json']:
        return [data for data in cursor]
    for item in cursor:

        __data.append([
            str(item['code']),
            float(item['open']),
            float(item['high']),
            float(item['low']),
            float(item['close']),
            int(item['up_count']),
            int(item['down_count']),
            float(item['vol']),
            float(item['amount']), item['datetime'], item['time_stamp'],
            item['date']
        ])

    __data = DataFrame(__data,
                       columns=[
                           'code', 'open', 'high', 'low', 'close', 'up_count',
                           'down_count', 'volume', 'amount', 'datetime',
                           'time_stamp', 'date'
                       ])

    __data['datetime'] = pd.to_datetime(__data['datetime'])
    __data = __data.set_index('datetime', drop=False)
    if format in ['numpy', 'np', 'n']:
        return numpy.asarray(__data)
    elif format in ['list', 'l', 'L']:
        return numpy.asarray(__data).tolist()
    elif format in ['P', 'p', 'pandas', 'pd']:
        return __data
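
# Hypothetical usage (requires a populated QUANTAXIS MongoDB; the code and
# dates below are illustrative):
# df = QA_fetch_index_min('000300', '2019-01-02 09:30:00',
#                         '2019-01-02 15:00:00', format='pd', frequence='5min')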
Пример #54
0
    def test_inplace_return_self(self):
        # GH 1893

        data = DataFrame(
            {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]}
        )

        def _check_f(base, f):
            result = f(base)
            assert result is None

        # -----DataFrame-----

        # set_index
        f = lambda x: x.set_index("a", inplace=True)
        _check_f(data.copy(), f)

        # reset_index
        f = lambda x: x.reset_index(inplace=True)
        _check_f(data.set_index("a"), f)

        # drop_duplicates
        f = lambda x: x.drop_duplicates(inplace=True)
        _check_f(data.copy(), f)

        # sort
        f = lambda x: x.sort_values("b", inplace=True)
        _check_f(data.copy(), f)

        # sort_index
        f = lambda x: x.sort_index(inplace=True)
        _check_f(data.copy(), f)

        # fillna
        f = lambda x: x.fillna(0, inplace=True)
        _check_f(data.copy(), f)

        # replace
        f = lambda x: x.replace(1, 0, inplace=True)
        _check_f(data.copy(), f)

        # rename
        f = lambda x: x.rename({1: "foo"}, inplace=True)
        _check_f(data.copy(), f)

        # -----Series-----
        d = data.copy()["c"]

        # reset_index
        f = lambda x: x.reset_index(inplace=True, drop=True)
        _check_f(data.set_index("a")["c"], f)

        # fillna
        f = lambda x: x.fillna(0, inplace=True)
        _check_f(d.copy(), f)

        # replace
        f = lambda x: x.replace(1, 0, inplace=True)
        _check_f(d.copy(), f)

        # rename
        f = lambda x: x.rename({1: "foo"}, inplace=True)
        _check_f(d.copy(), f)
Пример #55
0
def get_a_weights_prop(dim_names, df_total: pd.DataFrame):
    return df_total.set_index(dim_names)["total"]
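
# Illustrative call (hypothetical frame): collapse a totals table onto its
# dimension columns, yielding a Series indexed by (age, sex):
# df_total = pd.DataFrame({'age': [0, 1], 'sex': ['m', 'f'], 'total': [3, 4]})
# get_a_weights_prop(['age', 'sex'], df_total)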
Пример #56
0
    def test_int64_overflow_issues(self):

        # #2690, combinatorial explosion
        df1 = DataFrame(np.random.randn(1000, 7),
                        columns=list('ABCDEF') + ['G1'])
        df2 = DataFrame(np.random.randn(1000, 7),
                        columns=list('ABCDEF') + ['G2'])

        # it works!
        result = merge(df1, df2, how='outer')
        assert len(result) == 2000

        low, high, n = -1 << 10, 1 << 10, 1 << 20
        left = DataFrame(np.random.randint(low, high, (n, 7)),
                         columns=list('ABCDEFG'))
        left['left'] = left.sum(axis=1)

        # one-2-one match
        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()
        right.columns = right.columns[:-1].tolist() + ['right']
        right.index = np.arange(len(right))
        right['right'] *= -1

        out = merge(left, right, how='outer')
        assert len(out) == len(left)
        assert_series_equal(out['left'], -out['right'], check_names=False)
        result = out.iloc[:, :-2].sum(axis=1)
        assert_series_equal(out['left'], result, check_names=False)
        assert result.name is None

        out.sort_values(out.columns.tolist(), inplace=True)
        out.index = np.arange(len(out))
        for how in ['left', 'right', 'outer', 'inner']:
            assert_frame_equal(out, merge(left, right, how=how, sort=True))

        # check that left merge w/ sort=False maintains left frame order
        out = merge(left, right, how='left', sort=False)
        assert_frame_equal(left, out[left.columns.tolist()])

        out = merge(right, left, how='left', sort=False)
        assert_frame_equal(right, out[right.columns.tolist()])

        # one-2-many/none match
        n = 1 << 11
        left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'),
                         columns=list('ABCDEFG'))

        # confirm that this is checking what it is supposed to check
        shape = left.apply(Series.nunique).values
        assert is_int64_overflow_possible(shape)

        # add duplicates to left frame
        left = concat([left, left], ignore_index=True)

        right = DataFrame(np.random.randint(low, high,
                                            (n // 2, 7)).astype('int64'),
                          columns=list('ABCDEFG'))

        # add duplicates & overlap with left to the right frame
        i = np.random.choice(len(left), n)
        right = concat([right, right, left.iloc[i]], ignore_index=True)

        left['left'] = np.random.randn(len(left))
        right['right'] = np.random.randn(len(right))

        # shuffle left & right frames
        i = np.random.permutation(len(left))
        left = left.iloc[i].copy()
        left.index = np.arange(len(left))

        i = np.random.permutation(len(right))
        right = right.iloc[i].copy()
        right.index = np.arange(len(right))

        # manually compute outer merge
        ldict, rdict = defaultdict(list), defaultdict(list)

        for idx, row in left.set_index(list('ABCDEFG')).iterrows():
            ldict[idx].append(row['left'])

        for idx, row in right.set_index(list('ABCDEFG')).iterrows():
            rdict[idx].append(row['right'])

        vals = []
        for k, lval in ldict.items():
            rval = rdict.get(k, [np.nan])
            for lv, rv in product(lval, rval):
                vals.append(k + tuple([lv, rv]))

        for k, rval in rdict.items():
            if k not in ldict:
                for rv in rval:
                    vals.append(k + tuple([np.nan, rv]))

        def align(df):
            df = df.sort_values(df.columns.tolist())
            df.index = np.arange(len(df))
            return df

        def verify_order(df):
            kcols = list('ABCDEFG')
            assert_frame_equal(df[kcols].copy(),
                               df[kcols].sort_values(kcols, kind='mergesort'))

        out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
        out = align(out)

        jmask = {
            'left': out['left'].notna(),
            'right': out['right'].notna(),
            'inner': out['left'].notna() & out['right'].notna(),
            'outer': np.ones(len(out), dtype='bool')
        }

        for how in 'left', 'right', 'outer', 'inner':
            mask = jmask[how]
            frame = align(out[mask].copy())
            assert mask.all() ^ mask.any() or how == 'outer'

            for sort in [False, True]:
                res = merge(left, right, how=how, sort=sort)
                if sort:
                    verify_order(res)

                # as in GH9092 dtypes break with outer/right join
                assert_frame_equal(frame,
                                   align(res),
                                   check_dtype=how not in ('right', 'outer'))
Пример #57
0
    def test_interp_nan_idx(self):
        df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
        df = df.set_index('A')
        # method='values' interpolates against the index values, which is not
        # implemented when the index itself contains NaN
        with pytest.raises(NotImplementedError):
            df.interpolate(method='values')
Пример #58
0
    def test_multiindex_slicers_edges(self):
        # GH 8132
        # various edge cases
        df = DataFrame({
            "A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5,
            "B": ["B0", "B0", "B1", "B1", "B2"] * 3,
            "DATE": [
                "2013-06-11",
                "2013-07-02",
                "2013-07-09",
                "2013-07-30",
                "2013-08-06",
                "2013-06-11",
                "2013-07-02",
                "2013-07-09",
                "2013-07-30",
                "2013-08-06",
                "2013-09-03",
                "2013-10-01",
                "2013-07-09",
                "2013-08-06",
                "2013-09-03",
            ],
            "VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2],
        })

        df["DATE"] = pd.to_datetime(df["DATE"])
        df1 = df.set_index(["A", "B", "DATE"])
        df1 = df1.sort_index()

        # A1 - Get all values under "A0" and "A1"
        result = df1.loc[(slice("A1")), :]
        expected = df1.iloc[0:10]
        tm.assert_frame_equal(result, expected)

        # A2 - Get all values from the start to "A2"
        result = df1.loc[(slice("A2")), :]
        expected = df1
        tm.assert_frame_equal(result, expected)

        # A3 - Get all values under "B1" or "B2"
        result = df1.loc[(slice(None), slice("B1", "B2")), :]
        expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]]
        tm.assert_frame_equal(result, expected)

        # A4 - Get all values between 2013-07-02 and 2013-07-09
        result = df1.loc[(slice(None), slice(None),
                          slice("20130702", "20130709")), :]
        expected = df1.iloc[[1, 2, 6, 7, 12]]
        tm.assert_frame_equal(result, expected)

        # B1 - Get all values in B0 that are also under A0, A1 and A2
        result = df1.loc[(slice("A2"), slice("B0")), :]
        expected = df1.iloc[[0, 1, 5, 6, 10, 11]]
        tm.assert_frame_equal(result, expected)

        # B2 - Get all values in B0, B1 and B2 (similar to what A2 does for
        # the As)
        result = df1.loc[(slice(None), slice("B2")), :]
        expected = df1
        tm.assert_frame_equal(result, expected)

        # B3 - Get all values from B1 to B2 and up to 2013-08-06
        result = df1.loc[(slice(None), slice("B1", "B2"),
                          slice("2013-08-06")), :]
        expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]]
        tm.assert_frame_equal(result, expected)

        # B4 - Same as A4 but the start of the date slice is not a key.
        #      shows indexing on a partial selection slice
        result = df1.loc[(slice(None), slice(None),
                          slice("20130701", "20130709")), :]
        expected = df1.iloc[[1, 2, 6, 7, 12]]
        tm.assert_frame_equal(result, expected)
Пример #59
0
from public.models import Metabolite, Reaction
from public.models import MetaboliteMapCoordinate, ReactionMapCoordinate

#------------------------
# Metabolite positions
#------------------------
m_df = DataFrame(columns=('id', 'model_type', 'wid', 'name', 'compartment',
                          'x', 'y'))
m_cor = MetaboliteMapCoordinate.objects.all()
for k, p in enumerate(m_cor):
    # get the associated metabolite (should be exactly 1)
    m = p.metabolites.all()[0]
    # add metabolite and position
    m_df.loc[k] = (m.id, m.model_type, m.wid, m.name, p.compartment, p.x, p.y)
m_df = m_df.set_index(m_df.id)
# set the data types
m_df[['id', 'x', 'y']] = m_df[['id', 'x', 'y']].astype(int)

m_df.head(10)

#------------------------
# Reaction positions
#------------------------
r_df = DataFrame(columns=('id', 'model_type', 'wid', 'name', 'path', 'value_x',
                          'value_y', 'label_x', 'label_y'))
r_cor = ReactionMapCoordinate.objects.all()
for k, p in enumerate(r_cor):
    # get reaction
    r = p.reactions.all()[0]
    r_df.loc[k] = (r.id, r.model_type, r.wid, r.name, p.path, p.value_x,
                   p.value_y, p.label_x, p.label_y)
Пример #60
0

print(g.transform(normalize))
print(g.apply(normalize))
print(g.transform('mean'))

normalized = (df['value'] - g.transform('mean')) / g.transform('std')
print(normalized)

# 12.2.2 Grouped time resampling
N = 15
times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)
df = DataFrame({'time': times, 'value': np.arange(N)})

print(df)
print(df.set_index('time').resample('5min').count())
df2 = DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(N * 3.)
})
print(df2)
time_key = pd.Grouper(freq='5min')  # pd.TimeGrouper is deprecated; Grouper is the modern spelling
resampled = (df2.set_index('time').groupby(['key', time_key]).sum())

print(resampled)
print(resampled.reset_index())

# 12.3 Method chaining techniques

# 12.3.1 The pipe method