Example #1
def all_Month_By_Cost(mydf):
    print(
        '\n--------------------------------------- Starting from 2017: compute all data '
        'by quarter on a Cost basis ---------------------------------------\n'
    )

    mydf = mydf.merge(df_Cost)
    # Add a Total Cost column: each row's material cost multiplied by its quantity
    mydf['Total Cost'] = mydf.iloc[:, 6] * mydf.iloc[:, 13]
    mydf.loc[:, 'Input Date'] = pd.to_datetime(mydf.loc[:, 'Input Date'])

    # Summary per quarter, broken down by SPL
    df_ByOrderTime_All_From2007_ByQ = \
        mydf.groupby([Grouper(key='Input Date', freq='BQ'), 'Project Name'])[
            'Total Cost'].sum().rename('合计Cost').reset_index()

    # Summary per SPL, broken down by quarter
    df_ByOrderTime_All_From2007_BySPL = \
        mydf.groupby(['Project Name', Grouper(key='Input Date', freq='BQ')])[
            'Total Cost'].sum().rename('合计Cost').reset_index()

    df_ByOrderTime_All_From2007_Total = mydf.groupby([
        Grouper(key='Input Date', freq='BQ')
    ])['Total Cost'].sum().rename('合计Cost').reset_index()

    return (
        df_ByOrderTime_All_From2007_ByQ,
        df_ByOrderTime_All_From2007_BySPL,
        df_ByOrderTime_All_From2007_Total,
    )
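
For reference, a minimal standalone sketch of the key-based business-quarter grouping used above (hypothetical data; only pandas is assumed):

import pandas as pd

orders = pd.DataFrame({
    "Input Date": pd.to_datetime(["2017-01-15", "2017-02-20", "2017-05-10"]),
    "Total Cost": [100.0, 50.0, 75.0],
})
# One output row per business-quarter end; the key column is consumed by the grouper
by_q = orders.groupby(pd.Grouper(key="Input Date", freq="BQ"))["Total Cost"].sum()
print(by_q)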
Example #2
    def split_by_date(self, mode='day'):
        """
        Split trajectory into subtrajectories using regular time intervals.

        Parameters
        ----------
        mode : str
            Split mode; one of 'day', 'month' or 'year'

        Returns
        -------
        list
            List of trajectories
        """
        result = []
        if mode == 'day':
            grouped = self.df.groupby(Grouper(freq="D"))
        elif mode == 'month':
            grouped = self.df.groupby(Grouper(freq="M"))
        elif mode == 'year':
            grouped = self.df.groupby(Grouper(freq="Y"))
        else:
            raise ValueError(
                'Invalid split mode {}. Must be one of [day, month, year]'.
                format(mode))
        for key, values in grouped:
            if len(values) > 1:
                result.append(Trajectory(values, '{}_{}'.format(self.id, key)))
        return result
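
For reference, a minimal standalone sketch of the daily grouping this method relies on (hypothetical data; only pandas is assumed):

import pandas as pd

# Hypothetical two-day position log indexed by timestamp
df = pd.DataFrame(
    {"x": [0.0, 1.0, 2.0, 3.0]},
    index=pd.to_datetime(["2021-01-01 08:00", "2021-01-01 09:00",
                          "2021-01-02 08:00", "2021-01-02 09:00"]))
for key, values in df.groupby(pd.Grouper(freq="D")):
    print(key, len(values))  # one group per calendar day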
Example #3
def create_boxplots_plots(csv_file_name):
    base_name = csv_file_name.split('.csv')[0]
    # One yearly boxplot per train/test split of the dataset
    for split in ('70_30', '80_20', '90_10', '95_5'):
        # load data for this split
        series = read_csv('data/datasets/' + base_name + '_dataset_' + split + '.csv',
                          header=0, index_col=0, parse_dates=True, squeeze=True)
        groups = series.groupby(Grouper(freq='A'))
        appended_data = []
        for name, group in groups:
            years = DataFrame()
            years[name.year] = group.values
            appended_data.append(years)

        appended_data = concat(appended_data)
        appended_data.boxplot()
        pyplot.show()
Example #4
def all_Month_By_Cost_booking(mydf):
    print(
        '\n--------------------------------------- By booking time: starting from 2017, compute all data '
        'by quarter on a Cost basis ---------------------------------------\n'
    )
    mydf = mydf.dropna(subset=["财务入账时间"])
    mydf = mydf.merge(df_Cost)
    # Add a Total Cost column: each row's material cost multiplied by its quantity
    mydf['Total Cost'] = mydf.iloc[:, 6] * mydf.iloc[:, 13]
    mydf.loc[:, '财务入账时间'] = pd.to_datetime(mydf.loc[:, '财务入账时间'])
    # Summary per quarter, broken down by SPL
    df_ByBookingTime_All_From2007_ByQ = \
        mydf.groupby([Grouper(key='财务入账时间', freq='BQ'), 'Project Name'])[
            'Total Cost'].sum().rename('合计Cost').reset_index()

    # Summary per SPL, broken down by quarter
    df_ByBookingTime_All_From2007_BySPL = \
        mydf.groupby(['Project Name', Grouper(key='财务入账时间', freq='BQ')])[
            'Total Cost'].sum().rename('合计Cost').reset_index()
    # Total amount that can be allocated per quarter
    df_ByBookingTime_All_From2007_Total = mydf.groupby([
        Grouper(key='财务入账时间', freq='BQ')
    ])['Total Cost'].sum().rename('合计Cost').reset_index()

    return (df_ByBookingTime_All_From2007_ByQ,
            df_ByBookingTime_All_From2007_BySPL,
            df_ByBookingTime_All_From2007_Total)
Example #5
 def _split_traj(self, traj, mode='day', min_length=0):
     result = []
     if mode == 'day':
         grouped = traj.df.groupby(Grouper(freq="D"))
     elif mode == 'month':
         grouped = traj.df.groupby(Grouper(freq="M"))
     elif mode == 'year':
         grouped = traj.df.groupby(Grouper(freq="Y"))
     else:
         raise ValueError('Invalid split mode {}. Must be one of [day, month, year]'.format(mode))
     for key, values in grouped:
         if len(values) > 1:
             result.append(Trajectory(values, '{}_{}'.format(traj.id, key)))
     return TrajectoryCollection(result, min_length=min_length)
Example #6
def plot_stacked():
    cols = [
        'Coal_MW', 'Gas_MW', 'Hidroelectric_MW', 'Nuclear_MW', 'Wind_MW',
        'Solar_MW', 'Biomass_MW'
    ]
    leg = []
    last_means = np.zeros(97)
    x = []
    xyrs = []

    for c in cols:
        groups = traind[[c, "NewDate"]].groupby(
            Grouper(key='NewDate', freq="M"))

        m = []
        for i, (name, group) in enumerate(groups):
            m.append(np.mean(group[[c]].values.flatten()))
            if c == "Coal_MW" and name.month == 1:
                xyrs.append(name.year)
                x.append(i)

        plt.bar(range(97), m, 0.5, bottom=last_means)
        last_means += m

        leg.append(c)

    plt.ylabel('Production_MW')
    plt.title('Production')
    plt.gca().set_xticks(x)
    plt.gca().set_xticklabels(xyrs)
    plt.gcf().set_size_inches(25, 10)
    plt.legend(leg)
    plt.show()
Example #7
def init():
    output_name = "boxplot_speed_diff_2019_04_dataset"

    one_series = series["speed_diff"]

    # one_year = one_series["2019"]
    # groups = one_year.groupby(Grouper(freq="M"))
    # months = concat([DataFrame(x[1].values) for x in groups], axis=1)
    # months = DataFrame(months)
    # months.columns = range(1, 13)
    # groups.plot()

    one_month = one_series["2019-04"]
    groups = one_month.groupby(Grouper(freq="D"))
    days = concat([DataFrame(x[1].values) for x in groups], axis=1)
    days = DataFrame(days)
    days.columns = range(1, 31)
    pyplot.gcf().set_size_inches(12, 7)
    days.boxplot()

    try:
        pyplot.savefig(os.path.join(OUTPUT_FOLDER, f"{output_name}.png"), format="png", dpi=300)
    except FileNotFoundError:
        os.makedirs(OUTPUT_FOLDER)
        pyplot.savefig(os.path.join(OUTPUT_FOLDER, f"{output_name}.png"), format="png", dpi=300)
Example #8
def plot_grouped_by_year_data(df_weekly, title):
    """
    Plots the weekly combined series (price series and COT report) using
    Plotly, with trading weeks on the X axis.
    """
    x_series = np.arange(0, 54)
    groups = df_weekly['Settle'].groupby(Grouper(freq='A'))
    fig = tls.make_subplots(rows=sum(1 for (name, grp) in groups
                                     if len(grp.values) >= 52),
                            cols=1,
                            shared_xaxes=True,
                            print_grid=False)
    fig['layout'].update(height=600, width=899, title=title)
    # chart only years where we have a full 52 weeks of data
    for i, (name, group) in enumerate([(name, grp) for (name, grp) in groups
                                       if len(grp.values) >= 52]):
        fig.append_trace(
            {
                'x': x_series,
                'y': group.values,
                'type': 'scatter',
                'name': name.year
            }, i + 1, 1)
        fig['layout']['yaxis' + str(i + 1)].update(showticklabels=False)

    cf.iplot(fig)
Example #9
def test_series_groupby_value_counts_with_grouper():
    # GH28479
    df = DataFrame({
        "Timestamp": [
            1565083561,
            1565083561 + 86400,
            1565083561 + 86500,
            1565083561 + 86400 * 2,
            1565083561 + 86400 * 3,
            1565083561 + 86500 * 3,
            1565083561 + 86400 * 4,
        ],
        "Food":
        ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
    }).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)),
                                 unit="s")
    dfg = df.groupby(Grouper(freq="1D", key="Datetime"))

    # have to sort on index because of unstable sort on values xref GH9212
    result = dfg["Food"].value_counts().sort_index()
    expected = dfg["Food"].apply(Series.value_counts).sort_index()
    expected.index.names = result.index.names

    tm.assert_series_equal(result, expected)
Example #10
def groupFridgeData(fridgeData, freq):
    fridgeGrouped = fridgeData.groupby(Grouper(freq=freq, axis=0))
    intervalLength = float(freq.replace("s", ""))
    fridgeGrouped = fridgeGrouped.agg(
        lambda x: np.nan if (hasZerosOrNan(x)) else x.mean() * intervalLength)
    fridgeGrouped = fridgeGrouped.round(2)
    fridgeGrouped.rename({'power': 'energy'}, axis=1, inplace=True)
    return fridgeGrouped
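
A standalone sketch of the same aggregation pattern (hypothetical 1 Hz power readings; the zero/NaN guard and the 'power'-to-'energy' rename mirror the function above):

import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=12, freq="s")
power = pd.DataFrame({"power": np.arange(1.0, 13.0)}, index=idx)
# Mean power over each 6-second window, scaled by the window length to get energy
energy = power.groupby(pd.Grouper(freq="6s")).agg(
    lambda x: np.nan if (x == 0).any() or x.isna().any() else x.mean() * 6.0)
print(energy.rename({"power": "energy"}, axis=1).round(2))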
Example #11
File: stock.py Project: pjrenzov/quant
 def RV(self):
     try:
         rv = self.stock_data['log_returns']
     except KeyError:
         self.returns(method='LOG')
         rv = self.stock_data['log_returns']
     return self.stock_data.groupby(Grouper(freq='M')).apply(
         self.realized_volatility).log_returns * np.sqrt(12)
Example #12
    def add_data_dataframe(self, metadata, data, data_type=None, overwrite=False):
        """ Add data to this Datasource and segment the data by size.
            The data is stored as a tuple of the data and the daterange it covers.

            Args:
                metadata (dict): Metadata on the data for this Datasource
                data (Pandas.DataFrame): Data
                data_type (str, default=None): Placeholder for combining this function
                    with add_footprint_data in the future
                overwrite (bool, default=False): Overwrite existing data
            Returns:
                None
        """
        from pandas import Grouper
        from HUGS.Processing import get_split_frequency

        # Store the metadata as labels
        # for k, v in metadata.items():
        #     self.add_metadata(key=k, value=v)

        # Ensure metadata values are all lowercase
        metadata = {k: v.lower() for k, v in metadata.items()}
        self._metadata.update(metadata)

        # Add in a type record for timeseries data
        # Can possibly combine this function and the add_footprint (and other)
        # functions in the future
        # Store the hashes of data we've seen previously in a dict?
        # Then also check that the data we're trying to input doesn't overwrite the data we
        # currently have
        # It would be easiest to first check the dates covered by the data

        # Check the daterange covered by this data and whether we have an overlap
        if self._data:
            # Existing data in the Datasource
            start_data, end_data = self.daterange()
            # This is the data that we may want to add to the Datasource
            start_new, end_new = self.get_dataframe_daterange(data)

            # Check if there's overlap of data
            if start_new >= start_data and end_new <= end_data and overwrite is False:
                raise ValueError(
                    "The provided data overlaps dates covered by existing data"
                )

        # Need to check here if we've seen this data before
        freq = get_split_frequency(data)
        # Split into sections by splitting frequency
        group = data.groupby(Grouper(freq=freq))
        # Create a list tuples of the split dataframe and the daterange it covers
        # As some (years, months, weeks) may be empty we don't want those dataframes
        self._data = [
            (g, self.get_dataframe_daterange(g)) for _, g in group if len(g) > 0
        ]
        self.add_metadata(key="data_type", value="timeseries")
        self._data_type = "timeseries"
        # Use daterange() to update the recorded values
        self.update_daterange()
Example #13
def showBoxPlot(df):
    groups = df.groupby(Grouper(freq='A'))
    years = DataFrame()

    for name, group in groups:
        years[name.year] = group.values

    years.boxplot()
    pyplot.show()
Example #14
def main():
    # Update these as needed
    filter_pctl = 0.25
    workspace = '/home/arthur/Dropbox/projects/greenland/aoi_albedo_time_series/catchments/'
    if workspace[-1] != '/':
        workspace = workspace + '/'
    csv_name = 'actual_albedo_catchment_6.2_ekholm_stats.csv'
    aoi_name = 'Catchment 6.2 (ValiObs filter = {x} Pctl)'.format(
        x=filter_pctl)
    dt_indx = pd.date_range('2000-01-01', '2020-12-31')
    csv_path = workspace + csv_name

    # set this value to filter out observations with fewer than the given percentile of valid observations

    # Define the fields of interest so we can ignore the rest
    fields = ['date', 'mean', 'valid_pixels_count']

    # Import raw APPEARS output
    ts_df = pd.read_csv(csv_path, usecols=fields, parse_dates=[1])

    # Make the date index, then group by it to make monthly averages
    ts_df['date'] = pd.to_datetime(ts_df['date'])
    ts_df['mean'].replace({pd.NaT: np.nan}, inplace=True)
    ts_df = filter_data_numobs(ts_df, filter_pctl)
    del ts_df['valid_pixels_count']

    # Simple masking by month due to small available pixels, so noisy even when szn-masked
    begin_month = 3
    end_month = 9
    szn_mask = (ts_df['date'].dt.month >=
                begin_month) & (ts_df['date'].dt.month <= end_month)
    ts_df = ts_df.loc[szn_mask]

    ts_df.set_index('date', inplace=True)

    series = ts_df.squeeze()
    strt_year = dt_indx[0].to_pydatetime().year  # do I need this stuff
    end_year = dt_indx[-1].to_pydatetime().year  # and this?
    nyears = end_year - strt_year  # and this?
    series = series.reindex(dt_indx, fill_value=np.NaN)

    groups = series.groupby(Grouper(freq='A'))
    years = DataFrame()

    # This is how the dataframe is set up with each column being a year of data, each row a doy
    for name, group in groups:
        years[name.year] = group.values[:364]

    # make columns into strings for easier plot labeling
    years.columns = years.columns.astype(str)

    box_plot(years, aoi_name, csv_path)
    # box_plot_anom(years, aoi_name, csv_path)
    # vert_stack_plot(years, nyears, strt_year, end_year, aoi_name, csv_path)
    # vert_stack_plot_anom(years, nyears, strt_year, end_year, aoi_name, csv_path)
    year_vs_avg_plot(years, aoi_name, csv_path)
Example #15
 def _split_traj(self, traj, mode="day", min_length=0):
     result = []
     modes = {"hour": "H", "day": "D", "month": "M", "year": "Y"}
     if mode in modes.keys():
         mode = modes[mode]
     grouped = traj.df.groupby(Grouper(freq=mode))
     for key, values in grouped:
         if len(values) > 1:
             result.append(Trajectory(values, "{}_{}".format(traj.id, key)))
     return TrajectoryCollection(result, min_length=min_length)
Example #16
    def _transform(ts: Series, attr_period: str, fun_type: str) -> Series:
        """
        Groups a timeseries by a specified time period.
        :param ts: Series. Timeseries.
        :param attr_period: String. Attribute period, PER_DAY for "d", PER_WEEK for "w",
        PER_MONTH for "m" and PER_YEAR for "y".
        :param fun_type: String. Function, either "sum" or "count", to be applied on each of the
        grouped periods.
        :return: Series. Grouped timeseries.
        """
        if fun_type == "sum":
            ts = ts.groupby(Grouper(freq=attr_period.upper())).sum()
        elif fun_type == "count":
            ts = ts.groupby(Grouper(freq=attr_period.upper())).count()
        else:
            raise NoProperOptionInIf

        ts = ts[ts != 0]  # To drop VALUE = 0 rows

        return ts
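
A standalone sketch of the grouping behind _transform (hypothetical daily event counts; "w" upper-cases to the pandas weekly code "W"):

import pandas as pd

ts = pd.Series(1, index=pd.date_range("2021-01-01", periods=10, freq="D"))
weekly = ts.groupby(pd.Grouper(freq="W")).sum()  # what fun_type="sum" does
print(weekly[weekly != 0])                       # zero-valued periods are dropped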
Example #17
def resample(self, resample_code='A-JUN', fun='mean'):
    """
    Time series resampling function. Returns a Hydro class object with resampled data.

    Parameters
    ----------
    resample_code : str
        The pandas resampling code for the resampling process (e.g. 'A' for annual, 'A-JUN' for annual ending in June (water year), 'D' for day, 'W' for week, 'M' for month)
    fun : str
        The function that should be applied. Any function that Pandas can handle in a groupby object.
    """
    from pandas.core.groupby import SeriesGroupBy, GroupBy
    from pandas import Grouper

    ### Set up parameters
    if fun in GroupBy.__dict__.keys():
        fun1 = GroupBy.__dict__[fun]
    elif fun in SeriesGroupBy.__dict__.keys():
        fun1 = SeriesGroupBy.__dict__[fun]
    else:
        raise ValueError('Need to use the right function name.')

    ### Run resampling
    data = self.data

    df1 = data.groupby([
        Grouper(level='mtype'),
        Grouper(level='site'),
        Grouper(level='time', freq=resample_code)
    ])
    df2 = fun1(df1)

    ### Recreate hydro class object
    new1 = self.add_data(df2.reset_index(),
                         'time',
                         'site',
                         'mtype',
                         'data',
                         dformat='long',
                         add=False)
    return new1
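
For reference, a minimal sketch of the multi-level grouping this resampler builds on (hypothetical long-format data with 'mtype', 'site' and 'time' index levels):

import pandas as pd

idx = pd.MultiIndex.from_product(
    [["flow"], ["site1"], pd.date_range("2019-07-31", periods=24, freq="M")],
    names=["mtype", "site", "time"])
data = pd.Series(range(24), index=idx, dtype=float)
annual = data.groupby([
    pd.Grouper(level="mtype"),
    pd.Grouper(level="site"),
    pd.Grouper(level="time", freq="A-JUN"),  # water years ending in June
]).mean()
print(annual)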
Example #18
def group_by_time_frequency(data_x, data_y, columns, g_freq):

    data_in = pd.DataFrame(columns=columns, index=data_x)
    data_in['time'] = data_x
    data_in[columns[1:]] = data_y
    groups = data_in.groupby(Grouper(freq=g_freq))

    d_groups = concat([DataFrame(x[1].values) for x in groups], axis=1)
    d_groups = DataFrame(d_groups)
    d_groups.columns = range(1, len(d_groups.columns) + 1)

    return groups, d_groups
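
A hypothetical call, assuming the snippet's pandas imports (pd, DataFrame, concat, Grouper) are in scope; groups of unequal length are NaN-padded by the concat:

import numpy as np

times = pd.date_range("2021-06-01", periods=48, freq="H")
groups, d_groups = group_by_time_frequency(
    times, np.random.rand(48, 1), ["time", "DO"], "D")
print(d_groups.head())  # side-by-side columns, one block of columns per day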
Example #19
def group_by_time_range(data_x, data_y, g_freq):

    series = pd.DataFrame(columns=['Date', 'DO'], index=data_x)
    series['Date'] = data_x
    series['DO'] = data_y
    groups = series.groupby(Grouper(freq=g_freq))

    g_data = concat([DataFrame(x[1].values) for x in groups], axis=1)
    g_data = DataFrame(g_data)
    g_data.columns = range(1, len(g_data.columns) + 1)

    return groups, g_data
Example #20
    def __init__(self,
                 symbol,
                 timeframe,
                 agent,
                 fraction=(0, 0),
                 equity=100,
                 risk=0.1):
        self.symbol = symbol
        self.timeframe = timeframe
        new_file = 'D:/Data/' + symbol + '_' + timeframe + '.csv'

        self.data = read_csv(new_file,
                             header=0,
                             parse_dates=False,
                             skiprows=range(1, fraction[0] + 1),
                             nrows=fraction[1])

        # Generating Dataframes
        time = to_datetime(self.data['Date'],
                           format='%Y%m%d %H:%M:%S',
                           errors='coerce')
        self.data = self.data.iloc[:, 2:]
        self.data['Date'] = time
        self.data, n = get_technical_indicators(self.data)
        self.data['Index'] = self.data.index - n

        # Daily grouping
        self.days = [
            g for n, g in self.data.groupby(Grouper(key='Date', freq='D'))
        ]
        self.data = self.data.set_index(time.iloc[n:])
        self.l = len(self.data)
        self.trades = DataFrame(columns=[
            'Opentime', 'Closetime', 'Openindex', 'Closeindex', 'Open',
            'Close', 'Profit', 'Volume', 'Direction'
        ])

        # Properties
        self.market = {
            'Spread': 1.2,
            'MinLot': 0.1,
            'Leverage': 20,
            'MarginCall': 0.75
        }
        self.settings = {'Risk': risk, 'MaxDrawDown': 0.9, 'TradingPause': 0}

        self.equity_start = equity
        self.equity = equity
        self.agent = agent
        self.profits = []
        self.actions = []
        print('Initialized.')
Example #21
    def __aggregate(self, df, freq):
        freq_int = int(re.search(r'\d+', freq).group())
        freq_dim = freq.replace(str(freq_int), "")

        new_freq = str(int(freq_int / self.scale_factor_)) + freq_dim
        df = df.groupby(Grouper(key='date', freq=new_freq))['size'].sum()
        df = df.fillna(0)
        df = pd.DataFrame(df)

        df['diff'] = np.arange(0, len(df))
        df['diff'] *= freq_int

        return df
Example #22
def grp_ts_agg(df, grp_col, ts_col, freq_code):
    """
    Simple function to aggregate time series with dataframes with a single column of sites and a column of times.

    Arguments:\n
    df -- dataframe with a datetime column.\n
    grp_col -- Column name (or list of names) that contains the sites.\n
    ts_col -- The column name of the datetime column.\n
    freq_code -- The pandas frequency code for the aggregation (e.g. 'M', 'A-JUN').
    """

    df1 = df.copy()
    if type(df[ts_col].iloc[0]) is Timestamp:
        df1.set_index(ts_col, inplace=True)
        if type(grp_col) is list:
            grp_col.extend([Grouper(level=ts_col, freq=freq_code)])
        else:
            grp_col = [grp_col, Grouper(level=ts_col, freq=freq_code)]
        df_grp = df1.groupby(grp_col)
        return df_grp
    else:
        raise TypeError('The ts_col column must contain Timestamps.')
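
A hypothetical call with a small made-up frame (assumes pandas' Timestamp and Grouper are imported as in the snippet):

import pandas as pd

df = pd.DataFrame({
    "site": ["a", "a", "b", "b"],
    "time": pd.to_datetime(["2020-01-05", "2020-01-20",
                            "2020-01-07", "2020-02-03"]),
    "value": [1.0, 3.0, 5.0, 7.0],
})
df_grp = grp_ts_agg(df, "site", "time", "M")  # monthly groups per site
print(df_grp["value"].mean())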
Example #23
def df_weekly_generation(df):
    """
    ---What it does---
        + Aggregates the dataframe into weekly (7-day) means.
    ---What it needs---
        + df: dataframe with a datetime "date" column.
    ---What it returns---
        + new_df: weekly means indexed by week number.
    """
    new_df = df.groupby(Grouper(key="date", freq='7D')).mean()
    new_df["week"] = range(0, len(new_df))
    new_df.set_index("week", inplace=True)
    return new_df
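
A hypothetical call with made-up daily data (a datetime "date" column is the only requirement):

import pandas as pd

df = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=21, freq="D"),
    "cases": range(21),
})
weekly = df_weekly_generation(df)  # three 7-day means, indexed by week 0..2
print(weekly)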
Example #24
def get_all_multi_strengths(df_strength):
    pickle_path_str_agg_all = 'all_strengths'

    if os.path.isfile(pickle_path_str_agg_all):
        with open(pickle_path_str_agg_all, 'rb') as handle:
            df_multi_strength = pickle.load(handle)
    else:
        df_strength = process_timestamp_and_clean_data(df_strength)
        df_by_user_date = df_strength.groupby(
            ['user_id', Grouper(key='timestamp', freq='D')])

        df_multi_strength = df_by_user_date.apply(transform_to_multi_strength)
        df_multi_strength = df_multi_strength.drop_duplicates()

        with open(pickle_path_str_agg_all, "wb") as handle:
            pickle.dump(df_multi_strength,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    return df_multi_strength
Example #25
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'],format='%m/%d/%Y %H:%M')#2015-01-31 23:00:09
df['Lpep_dropoff_datetime'] = pd.to_datetime(df['Lpep_dropoff_datetime'],format='%m/%d/%Y %H:%M')
df['trip_duration']= df['Lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
data = df[~(df['lpep_pickup_datetime'] < '2014-01-01')]
data = data[~(data['lpep_pickup_datetime'] > '2014-02-01')]
data['count'] = 1
#print(data)

reqcolumns = ['Trip_distance', 'Total_amount']
reqcolumns2 = ['trip_duration', 'Total_amount']
data_distance=data[reqcolumns]
data_time = data[reqcolumns2]
#print(data_time)

by_days=data.groupby(Grouper(key='lpep_pickup_datetime', freq='d')).sum()
by_hour=data.groupby(Grouper(key='lpep_pickup_datetime', freq='H')).sum()

print(by_days)

# the data attribute holds the non-grouped data


X = data.iloc[:, 0].values
y = data.iloc[:, 1].values
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
Example #26
# create stacked line plots
from pandas import read_csv
from pandas import DataFrame
from pandas import Grouper
from matplotlib import pyplot

filename = 'daily-minimum-temperatures.csv'
series = read_csv(filename,
                  header=0,
                  index_col=0,
                  parse_dates=True,
                  squeeze=True)
groups = series.groupby(Grouper(freq='A'))
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
years.plot(subplots=True, legend=False)
pyplot.show()
Example #27
#mpl.style.use('fivethirtyeight')
os.chdir(r'C:\Users\Administrator\Documents\GitHub\DS\Hessel, TimeSeries')
series = read_csv(r'data\MAAND OPEN PRD 2014-2019.csv',
                  header=0,
                  index_col=0,
                  parse_dates=True,
                  squeeze=True)
pp.figure(figsize=(8, 3), dpi=100)
pp.title('Openstaande Incidenten Per Maand')
series.plot()
pp.xlabel('jaren')
pp.ylabel('incidenten')
pp.tight_layout(pad=3.0)
pp.show()

groups = series['2014':'2019'].groupby(Grouper(freq='A'))
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
# Box and Whisker Plots
pp.figure(figsize=(6, 4), dpi=100, edgecolor='k')
years.boxplot()
pp.title('Trend')
pp.tight_layout(pad=3.0)
pp.show()

years = years.transpose()
pp.figure(figsize=(6, 4), dpi=100, edgecolor='k')
years.boxplot()
pp.tight_layout(pad=3.0)
pp.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [
Example #28
airline.describe()

#### Histogram and Density Plots

# create a histogram plot
airline.hist()

# create a density plot
airline.plot(kind='kde')

#### Box and Whisker Plots by Interval

# create a boxplot of yearly data

from pandas import Grouper
groups = airline.groupby(Grouper(freq='A'))
years = pd.DataFrame()
for name, group in groups:
    years[name.year] = group.values
years.boxplot()

#### Lag plot

# create a scatter plot

from pandas.plotting import lag_plot
lag_plot(airline, lag=1)  # for the 1st lag; lag defaults to 1

# create an autocorrelation plot

from statsmodels.graphics.tsaplots import plot_acf
Example #29
list_of_people.columns = columns
# print(list_of_people.head())

list_of_people.to_csv('people.csv', index=False)
# print(list_of_people)

csvfile = pd.read_csv('people.csv')

rounded_age = csvfile.loc[:, 'age'].round(-1)
# print(csvfile.columns)
# print(rounded_age)

csvfile.loc[:, 'rounded_age'] = csvfile.loc[:, 'age'].round(-1)

grouped_age = csvfile.groupby(Grouper(key='rounded_age')).count()
# print(grouped_age)

rounded_weight = csvfile.loc[:, 'weight'].round(-1)
# print(rounded_weight)

csvfile.loc[:, 'rounded_weight'] = csvfile.loc[:, 'weight'].round(-1)

grouped_weight = csvfile.groupby(Grouper(key='rounded_weight')).count()
# print(rounded_weight)

rounded_height = csvfile.loc[:, 'height'].round(-1)
# print(rounded_height)

csvfile.loc[:, 'rounded_height'] = csvfile.loc[:, 'height'].round(-1)
Example #30
    def test_pivot_timegrouper(self):
        df = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1),
                datetime(2013, 1, 1),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 12, 2),
                datetime(2013, 12, 2),
            ]
        }).set_index('Date')

        expected = DataFrame(np.array([10, 18, 3],
                                      dtype='int64').reshape(1, 3),
                             index=[datetime(2013, 12, 31)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df,
                             index=Grouper(freq='A'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='A'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        expected = DataFrame(
            np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
            index=[datetime(2013, 1, 1),
                   datetime(2013, 7, 1)],
            columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df,
                             index=Grouper(freq='6MS'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(df,
                             index=Grouper(freq='6MS', key='Date'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS', key='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            KeyError, lambda: pivot_table(df,
                                          index=Grouper(freq='6MS', key='foo'),
                                          columns='Buyer',
                                          values='Quantity',
                                          aggfunc=np.sum))
        self.assertRaises(
            KeyError,
            lambda: pivot_table(df,
                                index='Buyer',
                                columns=Grouper(freq='6MS', key='foo'),
                                values='Quantity',
                                aggfunc=np.sum))

        # passing the level
        df = df.set_index('Date')
        result = pivot_table(df,
                             index=Grouper(freq='6MS', level='Date'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS', level='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            ValueError,
            lambda: pivot_table(df,
                                index=Grouper(freq='6MS', level='foo'),
                                columns='Buyer',
                                values='Quantity',
                                aggfunc=np.sum))
        self.assertRaises(
            ValueError,
            lambda: pivot_table(df,
                                index='Buyer',
                                columns=Grouper(freq='6MS', level='foo'),
                                values='Quantity',
                                aggfunc=np.sum))

        # double grouper
        df = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 11, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 11, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 12, 5, 14, 0)
            ],
            'PayDay': [
                datetime(2013, 10, 4, 0, 0),
                datetime(2013, 10, 15, 13, 5),
                datetime(2013, 9, 5, 20, 0),
                datetime(2013, 11, 2, 10, 0),
                datetime(2013, 10, 7, 20, 0),
                datetime(2013, 9, 5, 10, 0),
                datetime(2013, 12, 30, 12, 0),
                datetime(2013, 11, 20, 14, 0),
            ]
        })

        result = pivot_table(df,
                             index=Grouper(freq='M', key='Date'),
                             columns=Grouper(freq='M', key='PayDay'),
                             values='Quantity',
                             aggfunc=np.sum)
        expected = DataFrame(np.array([
            np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan,
            np.nan, np.nan, np.nan, 3, np.nan
        ]).reshape(4, 4),
                             index=[
                                 datetime(2013, 9, 30),
                                 datetime(2013, 10, 31),
                                 datetime(2013, 11, 30),
                                 datetime(2013, 12, 31)
                             ],
                             columns=[
                                 datetime(2013, 9, 30),
                                 datetime(2013, 10, 31),
                                 datetime(2013, 11, 30),
                                 datetime(2013, 12, 31)
                             ])
        expected.index.name = 'Date'
        expected.columns.name = 'PayDay'

        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index=Grouper(freq='M', key='PayDay'),
                             columns=Grouper(freq='M', key='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        tuples = [
            (datetime(2013, 9, 30), datetime(2013, 10, 31)),
            (datetime(2013, 10, 31), datetime(2013, 9, 30)),
            (datetime(2013, 10, 31), datetime(2013, 11, 30)),
            (datetime(2013, 10, 31), datetime(2013, 12, 31)),
            (datetime(2013, 11, 30), datetime(2013, 10, 31)),
            (datetime(2013, 12, 31), datetime(2013, 11, 30)),
        ]
        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
        expected = DataFrame(np.array(
            [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan,
             3]).reshape(6, 2),
                             index=idx,
                             columns=['A', 'B'])
        expected.columns.name = 'Branch'

        result = pivot_table(df,
                             index=[
                                 Grouper(freq='M', key='Date'),
                                 Grouper(freq='M', key='PayDay')
                             ],
                             columns=['Branch'],
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index=['Branch'],
                             columns=[
                                 Grouper(freq='M', key='Date'),
                                 Grouper(freq='M', key='PayDay')
                             ],
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)