def resample(self, indexer: Mapping[Hashable, str] = None, skipna=None,
             closed: str = None, label: str = None, base: int = 0,
             keep_attrs: bool = None, loffset=None,
             restore_coord_dims: bool = None, **indexer_kwargs: str):
    """Returns a Resample object for performing resampling operations.

    Handles both downsampling and upsampling. If any intervals contain no
    values from the original object, they will be given the value ``NaN``.

    Parameters
    ----------
    indexer : {dim: freq}, optional
        Mapping from the dimension name to resample frequency.
    skipna : bool, optional
        Whether to skip missing values when aggregating in downsampling.
    closed : 'left' or 'right', optional
        Side of each interval to treat as closed.
    label : 'left' or 'right', optional
        Side of each interval to use for labeling.
    base : int, optional
        For frequencies that evenly subdivide 1 day, the "origin" of the
        aggregated intervals. For example, for '24H' frequency, base could
        range from 0 through 23.
    loffset : timedelta or str, optional
        Offset used to adjust the resampled time labels. Some pandas date
        offset strings are supported.
    keep_attrs : bool, optional
        If True, the object's attributes (`attrs`) will be copied from the
        original object to the new one. If False (default), the new object
        will be returned without attributes.
    restore_coord_dims : bool, optional
        If True, also restore the dimension order of multi-dimensional
        coordinates.
    **indexer_kwargs : {dim: freq}
        The keyword arguments form of ``indexer``. One of indexer or
        indexer_kwargs must be provided.

    Returns
    -------
    resampled : same type as caller
        This object resampled.

    Examples
    --------
    Downsample monthly time-series data to seasonal data:

    >>> da = xr.DataArray(np.linspace(0, 11, num=12),
    ...                   coords=[pd.date_range('15/12/1999',
    ...                           periods=12, freq=pd.DateOffset(months=1))],
    ...                   dims='time')
    >>> da
    <xarray.DataArray (time: 12)>
    array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-15 2000-01-15 2000-02-15 ...
    >>> da.resample(time="QS-DEC").mean()
    <xarray.DataArray (time: 4)>
    array([ 1., 4., 7., 10.])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-01 2000-03-01 2000-06-01 2000-09-01

    Upsample monthly time-series data to daily data:

    >>> da.resample(time='1D').interpolate('linear')
    <xarray.DataArray (time: 337)>
    array([ 0.      ,  0.032258,  0.064516, ..., 10.935484, 10.967742, 11.      ])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-15 1999-12-16 1999-12-17 ...

    Limit scope of upsampling method:

    >>> da.resample(time='1D').nearest(tolerance='1D')
    <xarray.DataArray (time: 337)>
    array([ 0.,  0., nan, ..., nan, 11., 11.])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-15 1999-12-16 ... 2000-11-15

    References
    ----------
    .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    """  # noqa
    # TODO support non-string indexer after removing the old API.
    from .dataarray import DataArray
    from .resample import RESAMPLE_DIM
    from ..coding.cftimeindex import CFTimeIndex

    if keep_attrs is None:
        keep_attrs = _get_keep_attrs(default=False)

    # note: the second argument (now 'skipna') used to be 'dim'
    if ((skipna is not None and not isinstance(skipna, bool))
            or ("how" in indexer_kwargs and "how" not in self.dims)
            or ("dim" in indexer_kwargs and "dim" not in self.dims)):
        raise TypeError(
            "resample() no longer supports the `how` or "
            "`dim` arguments. Instead call methods on resample "
            "objects, e.g., data.resample(time='1D').mean()")

    indexer = either_dict_or_kwargs(indexer, indexer_kwargs, "resample")
    if len(indexer) != 1:
        raise ValueError(
            "Resampling only supported along single dimensions.")
    dim, freq = next(iter(indexer.items()))

    dim_name = dim
    dim_coord = self[dim]

    if isinstance(self.indexes[dim_name], CFTimeIndex):
        from .resample_cftime import CFTimeGrouper
        grouper = CFTimeGrouper(freq, closed, label, base, loffset)
    else:
        # TODO: to_offset() call required for pandas==0.19.2
        grouper = pd.Grouper(
            freq=freq,
            closed=closed,
            label=label,
            base=base,
            loffset=pd.tseries.frequencies.to_offset(loffset),
        )
    group = DataArray(dim_coord, coords=dim_coord.coords,
                      dims=dim_coord.dims, name=RESAMPLE_DIM)
    resampler = self._resample_cls(
        self,
        group=group,
        dim=dim_name,
        grouper=grouper,
        resample_dim=RESAMPLE_DIM,
        restore_coord_dims=restore_coord_dims,
    )

    return resampler
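For reference, a minimal usage sketch of the two indexer forms this method accepts (keyword and mapping), assuming a plain ``datetime64`` time coordinate; the toy array and frequency below are illustrative only:

```python
import numpy as np
import pandas as pd
import xarray as xr

# Toy monthly series; any datetime64-indexed DataArray behaves the same way.
da = xr.DataArray(
    np.arange(12.0),
    coords=[pd.date_range("2000-01-01", periods=12, freq="MS")],
    dims="time",
)

# The keyword form and the mapping form are merged by either_dict_or_kwargs,
# so these two calls build the same resample object.
kw = da.resample(time="QS").mean()
mp = da.resample({"time": "QS"}).mean()
assert kw.identical(mp)
```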
cfg = yaml.load(f, Loader=yaml.SafeLoader)
# if cfg["Server"]:
#     c = get_client(agent=cfg["Agent_IP"], entity=cfg["Entity_File"])
# else:
c = get_client()

now = datetime.datetime(year=2018, month=6, day=1).replace(tzinfo=pytz.timezone("UTC"))

dataManager = ThermalDataManager(cfg, c)
o = dataManager._get_outside_data(now - datetime.timedelta(days=10), now)
o = dataManager._preprocess_outside_data(o.values())
grouper = o.groupby([pd.Grouper(freq='1H')])
print(grouper['t_out'].mean())

# zo = o.values()[0]
# print(zo.iloc[zo.shape[0] // 2 - 5:])
# print(o)
# print(dataManager._preprocess_outside_data(o.values()))
# o.dropna()
# print("shape of outside data", o.shape)
# print("number of 32 temperatures", (o["t_out"] == 32).sum())
# t = dataManager.thermal_data(days_back=50)
# print(t)
#
# plots the data here.
def import_results_from_gleam( self, sims_dir, regions, *, allow_unfinished=False, resample=None, overwrite=False, info_level=logging.DEBUG, ): """ Import simulation result data from GLEAMViz data/sims dir into the HDF5 file. """ if "new_fraction" in self.hdf and not overwrite: raise Exception(f"Would overwrite existing `new_fraction` in {self}!") sims_df = self.hdf["simulations"] sims_dir = Path(sims_dir) for sid, sim in sims_df.iterrows(): path = sims_dir / f"{sid}.gvh5" / "results.h5" if not path.exists() and not allow_unfinished: raise Exception(f"No gleam result found for {sid} {sim.Name!r}") dfs = [] skipped = set() for sid, sim in sims_df.iterrows(): path = sims_dir / f"{sid}.gvh5" / "results.h5" if not path.exists() and allow_unfinished: log.log(info_level, "Skipping missing result file {} ..".format(path)) continue log.log(info_level, "Loading results from {} ..".format(path)) with tables.File(path) as f: for r in regions: if pd.isnull(r.GleamID): skipped.add(r.DisplayName) continue gtype = LEVEL_TO_GTYPE[r.Level] node = f.get_node(f"/population/new/{gtype}/median/dset") days = pd.date_range(sim.StartDate, periods=node.shape[3], tz="utc") dcols = {} for ci, cn in COMPARTMENTS.items(): new_fraction = node[ci, 0, int(r.GleamID), :] new_fraction = np.expand_dims(new_fraction, 0) idx = pd.MultiIndex.from_tuples( [(sid, r.Code)], names=["SimulationID", "Code"] ) dcols[cn] = pd.DataFrame( new_fraction.astype("float32"), index=idx, columns=pd.Index(days, name="Date"), ).stack() dfs.append(pd.DataFrame(dcols).sort_index()) if skipped: log.info(f"Skipped {len(skipped)} regions without GleamID: {skipped!r}") if not dfs: raise Exception("No GLEAM records loaded!") dfall = pd.concat(dfs) len0 = len(dfall) if resample is not None: dfall = dfall.groupby( [ pd.Grouper(level=0), pd.Grouper(level=1), pd.Grouper(freq=resample, level=2), ] ).mean() self.hdf.put( "new_fraction", dfall, format="table", complib="bzip2", complevel=9 ) log.info(f"Loaded {len0} GLEAM result rows into {self} (resampling {resample})")
def get_text_weekly(week, tweet):  # weekly
    tweet = tweet.groupby(pd.Grouper(key='TwittedAt', freq='W'))  # for monthly analysis, use freq='M'
    tweet = tweet.get_group(week)
    text = tweet['Tweet'].T.tolist()
    return text
def get_tweet_count_weekly(tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='TwittedAt', freq='W'))['Tweet'].count().reset_index().sort_values('TwittedAt')
    print(tweet)
weekly_data(tweet) #In[27]: tweet=cleaned_tweet() #location based analysis x=top_five(top_location(tweet)) print(x) print("\n\n\n") get_location_data(x) #In[101]: #overall analysis tweet=cleaned_tweet() tweet=tweet.groupby( pd.Grouper(key='UserName'))['Tweet'].count().reset_index() tweet.sort_values(["Tweet"], axis=0,ascending=False, inplace=True) print(tweet.head(10)) #top ten most active accounts #In[102]: tweet=cleaned_tweet() tweet.drop_duplicates(subset ="UserName", keep ="last", inplace = True) tweet.sort_values(["Reach"], axis=0,ascending=False, inplace=True) tweet.head(10) #highest contributers #In[103]: tweet=cleaned_tweet() tweet['Reach'].sum() #total reach #In[104]:
def get_tweet_count_location(loc1, tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='Location'))['Tweet'].count().reset_index()
    g = tweet.groupby('Location')
    return g.get_group(loc1)
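For reference, a toy frame showing how the two `pd.Grouper` usages in these helpers behave: with only `key` it is a plain column group-by, while adding `freq` bins the datetime column. Column names mirror the snippets; the data is invented:

```python
import pandas as pd

tweets = pd.DataFrame({
    "TwittedAt": pd.to_datetime(["2021-03-01", "2021-03-02", "2021-03-09"]),
    "Location": ["Delhi", "Mumbai", "Delhi"],
    "Tweet": ["a", "b", "c"],
    "Reach": [10, 20, 30],
})

# Plain column grouping (as in get_tweet_count_location / get_reach_location).
print(tweets.groupby(pd.Grouper(key="Location"))["Tweet"].count())

# Datetime binning by calendar week (as in the *_weekly helpers).
print(tweets.groupby(pd.Grouper(key="TwittedAt", freq="W"))["Reach"].sum())
```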
seasonal_periods=680).fit() # Let's predict 365 days into the future crime_forecast = crime_model.forecast(365) # Print original and then our prediction df2.plot(figsize=(12, 6)) crime_forecast.plot() #xlim=['2015-01-01','2021-10-10']) plt.ylabel('Number of Arrests:False') plt.xlabel('Years') plt.legend(['Actual', "Forecasted"]) # In[ ]: # In[181]: df_True_district = df[df["Arrest"] == True].groupby( [pd.Grouper('District')] + [pd.Grouper(level='timestamp', freq='M')]).size().to_frame() df_True_district.columns = ["Number of Crimes"] #df4['timestamp']= pd.to_datetime(df4['timestamp']) df_True_district # In[182]: df_True_plot = df_True_district.query("District <= 5.0") df_True_plot plt.figure(figsize=(17, 10)) sns.lineplot(data=df_True_plot, x=df_True_plot.index.get_level_values('timestamp'), y="Number of Crimes", hue=df_True_plot.index.get_level_values('District'),
def create_feature(self): n_train = self.train.shape[0] n_test = self.test.shape[0] # aggregate by day self.train.index = pd.to_datetime(self.train["timestamp"]) self.test.index = pd.to_datetime(self.test["timestamp"]) train_agg = self.train.groupby(\ ["building_id", "meter", pd.Grouper(freq="24h")])\ .agg(np.median).reset_index() test_agg = self.test.groupby(\ ["building_id", "meter", pd.Grouper(freq="24h")])\ .agg(np.median).reset_index() # KCPD for each building building_inds = train_agg["building_id"].unique() dfs_train = []; dfs_test = [] for bidx in tqdm(building_inds): df_train = train_agg.query(f"building_id == {bidx}") df_test = test_agg.query(f"building_id == {bidx}") df_train = pd.DataFrame(dict(timestamp=\ pd.date_range(train_agg["timestamp"][0], train_agg["timestamp"][train_agg["timestamp"].size - 1])\ ))\ .merge(df_train, on="timestamp", how="left") df_test = pd.DataFrame(dict(timestamp=\ pd.date_range(test_agg["timestamp"][0], test_agg["timestamp"][test_agg["timestamp"].size - 1])\ ))\ .merge(df_test, on="timestamp", how="left") dfp = df_train.pivot(index="timestamp", columns="meter", values=["meter_reading"]) X = dfp.values dfp_test = df_test.pivot(index="timestamp", columns="meter", values="building_id") dfp["timestamp"] = dfp.index dfp_test["timestamp"] = dfp_test.index # standardize sig = StandardScaler().fit_transform(X) # change-point detection kcpd = DynpKcpd(min_size=7, jump=1, max_n_bkps=50).fit(sig) bkps = kcpd.predict(sig, beta=0.1)["best_bkps"] bkps.insert(0, 0) # add segment label segment_label = np.repeat(range(len(bkps) - 1),\ np.diff(np.array(bkps))) # import pdb; pdb.set_trace() dfp["segment"] = segment_label # NOTE: 2016 is leap year segment_label = np.delete(segment_label, 31 + 29 - 1) dfp_test["segment"] = segment_label.tolist() * 2 df_train = df_train.merge(dfp["segment"], on="timestamp", how="left") df_test = df_test.merge(dfp_test["segment"], on="timestamp", how="left") dfs_train.append(df_train) dfs_test.append(df_test) del dfp del dfp_test del X del sig del kcpd del bkps del segment_label gc.collect() del df_train del df_test gc.collect() dfs_train = pd.concat(dfs_train, axis=0) dfs_train.index = dfs_train["timestamp"] dfs_train = dfs_train.groupby(["building_id", "meter"])\ .resample("H").ffill()["segment"].reset_index() dfs_test = pd.concat(dfs_test, axis=0) dfs_test.index = dfs_test["timestamp"] dfs_test = dfs_test.groupby(["building_id", "meter"])\ .resample("H").ffill()["segment"].reset_index() self.train.index.names = ["date"] self.test.index.names = ["date"] dfs_train["building_id"] = dfs_train["building_id"].astype(np.int16) dfs_train["meter"] = dfs_train["meter"].astype(np.int16) dfs_train["segment"] = dfs_train["segment"].fillna(method="ffill") dfs_test["building_id"] = dfs_test["building_id"].astype(np.int16) dfs_test["meter"] = dfs_test["meter"].astype(np.int16) dfs_test["segment"] = dfs_test["segment"].fillna(method="ffill") # merge segmentation label train = self.train.merge(dfs_train,\ on=["building_id", "timestamp", "meter"], how="left") test = self.test.merge(dfs_test,\ on=["building_id", "timestamp", "meter"], how="left") train["segment"] = train["segment"].fillna(method="ffill") test["segment"] = test["segment"].fillna(method="ffill") assert train.shape[0] == n_train, f"length must be the same. original:{n_train}, processed:{self.train.shape[0]}" assert test.shape[0] == n_test, f"length must be the same. original:{n_test}, processed:{self.test.shape[0]}" return train["segment"], test["segment"]
table_invokana, 'prov_prescribing_npi', 'New to Brand Providers of Invokana') frac_timeline, new_to_invokana_patients = unique_adopters_plot( table_invokana, 'hvid', 'New to Brand Patients of Invokana') frac_timeline, new_to_trulicity_providers = unique_adopters_plot( table_trulicity, 'prov_prescribing_npi', 'New to Brand Providers of Trulicity') frac_timeline, new_to_trulicity_patients = unique_adopters_plot( table_trulicity, 'hvid', 'New to Brand Patients of Trulicity') ### Analytics - Total volumne dispensed by month # The columns "dispensed_quantity" and "days_supply" are the same plt.figure(figsize=(8, 6)) table_invokana[['date_service', 'dispensed_quantity' ]].groupby([pd.Grouper(freq='1M', key='date_service') ]).sum().plot(figsize=(6, 4), title='Total Volume for Invokana', legend=False, fontsize=12) table_trulicity[['date_service', 'dispensed_quantity' ]].groupby([pd.Grouper(freq='1M', key='date_service') ]).sum().plot(figsize=(6, 4), title='Total Volume for Trulicity', legend=False, fontsize=12) ##### Analytics Counts of Refills authorized plt.figure(figsize=(8, 6)) refill_auth = table_invokana.refill_auth_amount.value_counts() plt.scatter(refill_auth.index, refill_auth.values, label='Drug I')
dayfirst=True) bila['Patient'] = bila['Patient'].map(pdict) ### milk composition according to Michaelsen etal (1990), Macronutrient (g/dL) and energy (kcal/dL) milk = pd.DataFrame({ 'Group': ['protein', 'fat', 'lactose', 'energy'], "Median": [.9, 3.6, 7.2, 67], '-2std': [.6, 1.8, 6.4, 17], '+2std': [1.4, 8.9, 7.6, 117] }) milk.set_index('Group', inplace=True) bila['Protein'] = bila.SummeEnteral * milk.loc['protein', 'Median'] / 10 bila['Fat'] = bila.SummeEnteral * milk.loc['fat', 'Median'] / 10 bila['Lactose'] = bila.SummeEnteral * milk.loc['lactose', 'Median'] / 10 bila['Energy'] = bila.SummeEnteral * milk.loc[ 'energy', "Median"] / 10 ## cause it is kcal/dl bila = bila.groupby(["Patient", pd.Grouper(key='Date', freq='D')]).sum().reset_index() bila['DoL'] = bila.groupby('Patient')['Date'].transform( lambda x: x - x.min() + pd.Timedelta(days=0)).dt.days #bila["Timepoint"] = pd.cut(anti.Age, bins=[0,2,5,9,16, np.inf], labels=False).apply(lambda x: x+1 ) bila = bila[bila.Energy > 0] #bila.to_csv('/media/christos/ssd/work/Infants/tmp/feeding.tsv',sep='\t', index=False) meta.reset_index(inplace=True) enter = meta.merge(nut, on=['Patient', 'Date'], how='left')[['Patient', 'Age', 'Ratio', 'Timepoint']] #enter['DoL'] = enter.groupby('Patient')['Date'].transform(lambda x: x-x.min()+pd.Timedelta(days=1)).dt.days enter['Enteral_feeding'] = enter.Ratio.astype(str).apply( lambda x: int(x.split(":")[1]) ) ### this is the percentage of enteral feeding received for this day enter.drop(columns=['Ratio'], inplace=True) meta = meta.merge(enter.drop_duplicates(['Patient', 'Age']),
###################################################################
# Group by patient and sum
agg = aux.groupby('patient').sum()

# Show
if TERMINAL:
    print("\nOut:")
    print(agg)
agg

###################################################################
# Group by patient (2 days) and aggregate
agg = aux.groupby(by=['patient', pd.Grouper(freq='2D')]) \
         .agg(['mean', 'max'])
#        .agg({'idx': ['first', 'last'],
#              0: [skew, kurtosis, own],
#              1: [skew, kurtosis, own],
#              '0_hr': [own],
#              '0_rr': [own]})

# Show
if TERMINAL:
    print("\nOut:")
    print(agg)
agg


def f(x):
fields=['CLOSE'], start_date=firstday, end_date=today, interval='weekly') #Uk # 3 months #short_term_tickers = ['US3MT=RR' ,'GB3MT=RR','TR1YT=RR','ZA3MT=RR','JP3MT=RR','MX3MT=RR','RU3MT=RR','BR1YT=RR','DE3MT=RR','HK3MT=RR'] #short_term = ek.get_timeseries( short_term_tickers, fields=['CLOSE'], start_date='2000-01-01', end_date='2019-12-11', interval='weekly') #Uk short_terms = pd.concat([ short_term_us, short_term_de, short_term_uk, short_term_jp, short_term_ch, short_term_tr, short_term_mx, short_term_br, short_term_ru, short_term_sa ], axis=1) short_terms_int = short_terms.interpolate(method='linear') short_terms_int_w = ((short_terms_int / 100) + 1)**(1 / 52) - 1 short_terms_int_w = short_terms_int_w[short_terms_int_w.index >= start] short_terms_int_w = short_terms_int_w.groupby(pd.Grouper(freq='W')).last() short_terms_int_w.columns = [ '2_US', '2_GER', '2_UK', '2_JP', '2_CH', '2_TR', '2_MX', '2_BR', '2_RU', '2_SA' ] short_terms_int_w.to_excel( 'C:/Users/sb0538/Desktop/15022020/excels/2_treasurybillrates.xlsx')
def create_alerts(self, anomalies, data, fitbit_oldProtocol_hr, k): """ # creates alerts at every 24 hours and send at 9PM. # visualise alerts """ # function to assign different alert names # summarize hourly alerts def alert_types(alert): if alert['alerts'] >= 6: return 'RED' elif alert['alerts'] >= 1: return 'YELLOW' else: return 'GREEN' # summarize hourly alerts #anomalies.columns = ['datetime', 'std.rhr', 'name'] anomalies = anomalies[['datetime']] anomalies['datetime'] = pd.to_datetime(anomalies['datetime'], errors='coerce') anomalies['alerts'] = 1 anomalies = anomalies.set_index('datetime') anomalies = anomalies[~anomalies.index.duplicated(keep='first')] anomalies = anomalies.sort_index() alerts = anomalies.groupby(pd.Grouper(freq='24H', base=21)).cumsum() # apply alert_types function alerts['alert_type'] = alerts.apply(alert_types, axis=1) alerts_reset = alerts.reset_index() #print(alerts_reset) # save alerts #alerts.to_csv(myphd_id_alerts, mode='a', header=True) # summarize hourly alerts to daily alerts daily_alerts = alerts_reset.resample('24H', on='datetime', base=21, label='right').count() daily_alerts = daily_alerts.drop(['datetime'], axis=1) #print(daily_alerts) # function to assign different alert names def alert_types(alert): if alert['alert_type'] >= 6: return 'RED' elif alert['alert_type'] >= 1: return 'YELLOW' else: return 'GREEN' # apply alert_types function daily_alerts['alert_type'] = daily_alerts.apply(alert_types, axis=1) # merge missing 'datetime' with 'alerts' as zero aka GREEN data1 = data[['index']] data1['alert_type'] = 0 data1 = data1.rename(columns={"index": "datetime"}) data1['datetime'] = pd.to_datetime(data1['datetime'], errors='coerce') data1 = data1.resample('24H', on='datetime', base=21, label='right').count() data1 = data1.drop(data1.columns[[0, 1]], axis=1) data1 = data1.reset_index() data1['alert_type'] = 0 data3 = pd.merge(data1, daily_alerts, on='datetime', how='outer') data4 = data3[['datetime', 'alert_type_y']] data4 = data4.rename(columns={"alert_type_y": "alert_type"}) daily_alerts = data4.fillna("GREEN") daily_alerts = daily_alerts.set_index('datetime') daily_alerts = daily_alerts.sort_index() # merge alerts with main data and pass 'NA' when there is a missing day instead of 'GREEN' df_hr = pd.read_csv(fitbit_oldProtocol_hr) df_hr['datetime'] = pd.to_datetime(df_hr['datetime'], errors='coerce') df_hr = df_hr.resample('24H', on='datetime', base=21, label='right').mean() df_hr = df_hr.reset_index() df_hr = df_hr.set_index('datetime') df_hr.index.name = None df_hr.index = pd.to_datetime(df_hr.index) df3 = pd.merge(df_hr, daily_alerts, how='outer', left_index=True, right_index=True) df3 = df3[df3.alert_type.notnull()] df3.loc[df3.heartrate.isna(), 'alert_type'] = pd.NA daily_alerts = df3.drop('heartrate', axis=1) daily_alerts = daily_alerts.reset_index() daily_alerts = daily_alerts.rename(columns={"index": "datetime"}) daily_alerts.to_csv("_" + str(round(lst[k], 1)) + "_" + myphd_id_alerts, na_rep='NA', header=True) # visualize hourly alerts #colors = {'RED': 'red', 'YELLOW': 'yellow', 'GREEN': ''} #ax = alerts['alerts'].plot(kind='bar', color=[colors[i] for i in alerts['alert_type']],figsize=(20,4)) #ax.set_ylabel('No.of Alerts \n', fontsize = 14) # Y label #ax.axvline(pd.to_datetime(symptom_date), color='grey', zorder=1, linestyle='--', marker="v" ) # Symptom date #ax.axvline(pd.to_datetime(diagnosis_date), color='purple',zorder=1, linestyle='--', marker="v") # Diagnosis date #plt.xticks(fontsize=4, rotation=90) #plt.tight_layout() 
        #ax.figure.savefig(myphd_id_figure2, bbox_inches = "tight")

        return daily_alerts
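A stripped-down view of the 24-hour bucketing that `create_alerts` relies on, with synthetic anomaly timestamps. The snippet above anchors each daily bin at 21:00 via the older `base=21` keyword; this sketch uses the newer `offset` spelling of the same idea:

```python
import pandas as pd

# Synthetic anomaly timestamps, one alert each.
anoms = pd.DataFrame(
    {"alerts": 1},
    index=pd.to_datetime(["2024-01-01 20:00", "2024-01-01 22:00",
                          "2024-01-02 03:00", "2024-01-02 22:30"]),
)

# Cumulative alert count within each 21:00-to-21:00 window.
print(anoms.groupby(pd.Grouper(freq="24H", offset="21h")).cumsum())
```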
def point_interp_ts(df, time_col, x_col, y_col, data_col, point_shp, point_site_col, from_crs, to_crs=None, interp_fun='cubic', agg_ts_fun=None, period=None, digits=2): """ Function to take a dataframe of z values and interate through and resample both in time and space. Returns a DataFrame structured like df. Parameters ---------- df: DataFrame DataFrame containing four columns as shown in the below parameters. time_col: str The time column name. x_col: str The x column name. y_col: str The y column name. data_col: str The data column name. point_shp: str or GeoDataFrame Path to shapefile of points to be interpolated or a GeoPandas GeoDataFrame. point_site_col: str The column name of the site names/numbers of the point_shp. grid_res: int The resulting grid resolution in meters (or the unit of the final projection). from_crs: int or str or None The projection info for the input data if the result should be reprojected to the to_crs projection (either a proj4 str or epsg int). to_crs: int or str The projection for the output data similar to from_crs. interp_fun: str The scipy Rbf interpolation function to be applied (see https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.interpolate.Rbf.html). agg_ts_fun: str or None The pandas time series resampling function to resample the data in time (either 'mean' or 'sum'). If None, then no time resampling. period: str or None The pandas time series code to resample the data in time (i.e. '2H' for two hours). digits: int the number of digits to round. Returns ------- DataFrame """ #### Read in points if isinstance(point_shp, str) & isinstance(point_site_col, str): points = gpd.read_file(point_shp)[[point_site_col, 'geometry']] to_crs1 = points.crs elif isinstance(point_shp, gpd.GeoDataFrame) & isinstance( point_site_col, str): points = point_shp[[point_site_col, 'geometry']] to_crs1 = points.crs else: raise ValueError( 'point_shp must be a str path to a shapefile or a GeoDataFrame and point_site_col must be a str.' 
        )

    #### Create the grids
    df1 = df.copy()

    #### Resample the time series data
    if agg_ts_fun is not None:
        df1a = df1.set_index(time_col)
        if agg_ts_fun == 'sum':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].sum().reset_index()
        elif agg_ts_fun == 'mean':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].mean().reset_index()
        else:
            raise ValueError("agg_ts_fun should be either 'sum' or 'mean'.")
        time = df2[time_col].unique()
    else:
        df2 = df1
        time = df2[time_col].sort_values().unique()

    #### Convert input data to crs of points shp and create input xy
    data1 = df2.loc[df2[time_col] == time[0]]
    from_crs1 = convert_crs(from_crs, pass_str=True)
    if to_crs is not None:
        to_crs1 = convert_crs(to_crs, pass_str=True)
        points = points.to_crs(to_crs1)
    geometry = [Point(xy) for xy in zip(data1[x_col], data1[y_col])]
    gpd1 = gpd.GeoDataFrame(data1.index, geometry=geometry, crs=from_crs1)
    gpd2 = gpd1.to_crs(crs=to_crs1)
    x = gpd2.geometry.apply(lambda p: p.x).round(digits).values
    y = gpd2.geometry.apply(lambda p: p.y).round(digits).values
    xy = np.column_stack((x, y))

    #### Prepare the x and y of the points geodataframe output
    x_int = points.geometry.apply(lambda p: p.x).round(digits).values
    y_int = points.geometry.apply(lambda p: p.y).round(digits).values
    sites = points[point_site_col]
    xy_int = np.column_stack((x_int, y_int))

    #### Create new df
    sites_ar = np.tile(sites, len(time))
    time_ar = np.repeat(time, len(xy_int))
    x_ar = np.tile(x_int, len(time))
    y_ar = np.tile(y_int, len(time))
    new_df = pd.DataFrame({
        'site': sites_ar,
        'time': time_ar,
        'x': x_ar,
        'y': y_ar,
        data_col: np.repeat(0, len(time) * len(xy_int))
    })

    new_lst = []
    for t in pd.to_datetime(time):
        set1 = df2.loc[df2[time_col] == t, data_col]
        new_z = griddata(xy, set1.values, xy_int,
                         method=interp_fun).round(digits)
        new_z[new_z < 0] = 0
        new_lst.extend(new_z.tolist())
        # print(t)
    new_df.loc[:, data_col] = new_lst

    #### Export results
    return new_df[new_df[data_col].notnull()]
def rotate_to_run(m, avp): """ Correct tilt and align with horizontal streamline over a single run Adapted from rotate_to_run.m by IMB July 2006 references: Wilczak et al. 2001: sonic anemometer tilt corrections. BLM, 99, 127-150 (but beware typos in equations) Kaimal & Finnigan, 1994: Atmospheric Boundary Layer Flows: Their Structure and Measurement. Oxford University Press van Dijk et al. 2004: The principles of surface flux physics: theory, practice and description of the ECPACK library www.met.wau.nl/projects/jep Parameters: m : Unrotated metek data structure. avp : Length of a single run (minutes). i.e. averaging period. Returns: m_out : Wind components in streamline oriented reference frame """ # First rotate to align x-axis with mean wind direction in sonic's # reference frame m_g = m.groupby(pd.Grouper( freq='%sMin' % avp)) # Split into single runs (of length avp minutes) m_out = pd.DataFrame( columns=['x', 'y', 'z', 'T', 'u', 'v', 'w', 'theta', 'phi']) # Loop through each run to perform correction: for group in m_g: m = group[1] # First rotate to align x-axis with mean wind direction in sonic's # reference frame theta = np.arctan2(np.mean(m['y']), np.mean(m['x'])) u1 = m['x'] * np.cos(theta) + m['y'] * np.sin(theta) v1 = -m['x'] * np.sin(theta) + m['y'] * np.cos(theta) w1 = m['z'] # Next rotate u and w so that x-axis lies along mean streamline and # mean(w) is zero phi = np.arctan2(np.mean(w1), np.mean(u1)) m['u'] = u1 * np.cos(phi) + w1 * np.sin(phi) m['v'] = v1 m['w'] = -u1 * np.sin(phi) + w1 * np.cos(phi) # Theta is angle of rotation um-to-vm (anticlockwise or righthanded) # to align u with mean wind (degrees) m['theta'] = theta * 180 / np.pi # phi is tilt angle (+ve tilts x-axis upwards) to align x-axis with # mean streamline and force <w>=0 m['phi'] = phi * 180 / np.pi m_out = m_out.append(m) return m_out
def grid_interp_ts(df, time_col, x_col, y_col, data_col, grid_res,
                   from_crs=None, to_crs=2193, interp_fun='cubic',
                   agg_ts_fun=None, period=None, digits=2):
    """
    Function to take a dataframe of z values and iterate through and resample
    both in time and space. Returns a DataFrame structured like df.

    Parameters
    ----------
    df: DataFrame
        DataFrame containing four columns as shown in the below parameters.
    time_col: str
        The time column name.
    x_col: str
        The x column name.
    y_col: str
        The y column name.
    data_col: str
        The data column name.
    grid_res: int
        The resulting grid resolution in meters (or the unit of the final
        projection).
    from_crs: int or str or None
        The projection info for the input data if the result should be
        reprojected to the to_crs projection (either a proj4 str or epsg int).
    to_crs: int or str
        The projection for the output data similar to from_crs.
    interp_fun: str
        The scipy Rbf interpolation function to be applied (see
        https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.interpolate.Rbf.html).
    agg_ts_fun: str or None
        The pandas time series resampling function to resample the data in
        time (either 'mean' or 'sum'). If None, then no time resampling.
    period: str or None
        The pandas time series code to resample the data in time (i.e. '2H'
        for two hours).
    digits: int
        The number of digits to round.

    Returns
    -------
    DataFrame
    """
    #### Create the grids
    df1 = df.copy()

    #### Resample the time series data
    if agg_ts_fun is not None:
        df1a = df1.set_index(time_col)
        if agg_ts_fun == 'sum':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].sum().reset_index()
        elif agg_ts_fun == 'mean':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].mean().reset_index()
        else:
            raise ValueError("agg_ts_fun should be either 'sum' or 'mean'.")
        time = df2[time_col].unique()
    else:
        df2 = df1
        time = df2[time_col].sort_values().unique()

    if from_crs is None:
        x = df2.loc[df2[time_col] == time[0], x_col].values
        y = df2.loc[df2[time_col] == time[0], y_col].values
    else:
        data1 = df2.loc[df2[time_col] == time[0]]
        from_crs1 = convert_crs(from_crs, pass_str=True)
        to_crs1 = convert_crs(to_crs, pass_str=True)
        geometry = [Point(xy) for xy in zip(data1[x_col], data1[y_col])]
        gpd1 = gpd.GeoDataFrame(data1.index, geometry=geometry, crs=from_crs1)
        gpd2 = gpd1.to_crs(crs=to_crs1)
        x = gpd2.geometry.apply(lambda p: p.x).round(digits).values
        y = gpd2.geometry.apply(lambda p: p.y).round(digits).values

    xy = np.column_stack((x, y))

    max_x = x.max()
    min_x = x.min()
    max_y = y.max()
    min_y = y.min()

    new_x = np.arange(min_x, max_x, grid_res)
    new_y = np.arange(min_y, max_y, grid_res)
    x_int, y_int = np.meshgrid(new_x, new_y)

    #### Create new df
    x_int2 = x_int.flatten()
    y_int2 = y_int.flatten()
    xy_int = np.column_stack((x_int2, y_int2))
    time_df = np.repeat(time, len(x_int2))
    x_df = np.tile(x_int2, len(time))
    y_df = np.tile(y_int2, len(time))
    new_df = pd.DataFrame({
        'time': time_df,
        'x': x_df,
        'y': y_df,
        data_col: np.repeat(0, len(time) * len(x_int2))
    })

    new_lst = []
    for t in pd.to_datetime(time):
        set1 = df2.loc[df2[time_col] == t, data_col]
        # index = new_df[new_df['time'] == t].index
        new_z = griddata(xy, set1.values, xy_int,
                         method=interp_fun).round(digits)
        new_z[new_z < 0] = 0
        new_lst.extend(new_z.tolist())
        # print(t)
    new_df.loc[:, data_col] = new_lst

    #### Export results
    return new_df[new_df[data_col].notnull()]
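A minimal smoke test of `grid_interp_ts` under the simplest settings (no reprojection, no time resampling), assuming the function and its numpy/scipy dependencies are already in scope; the toy frame and 500 m resolution are made up for illustration:

```python
import pandas as pd

# Hypothetical toy input: two time steps, four stations on a 1 km square.
df = pd.DataFrame({
    "time": pd.to_datetime(["2020-01-01"] * 4 + ["2020-01-02"] * 4),
    "x": [0, 1000, 0, 1000] * 2,
    "y": [0, 0, 1000, 1000] * 2,
    "precip": [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 5.0],
})

# from_crs=None skips the geopandas reprojection branch; agg_ts_fun=None
# skips time resampling, so only the scipy griddata interpolation runs.
out = grid_interp_ts(df, time_col="time", x_col="x", y_col="y",
                     data_col="precip", grid_res=500,
                     from_crs=None, agg_ts_fun=None)
print(out.head())
```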
con.start() index_tickers = ['NYA Index', 'SPX Index', 'CCMP Index','NDX Index','CDAX Index' ,'DAX Index', 'ASX Index','UKX Index', 'TPX Index','NKY Index', 'SHCOMP Index' , 'SZCOMP Index','XUTUM Index','XU100 Index', 'MEXBOL Index', 'IBOV Index', 'IMOEX Index' , 'JALSH Index'] #Gross Aggregate Dividend Yield dy = con.bdh(index_tickers,['GROSS AGGTE DVD YLD'], firstday, today) dy_int = dy.interpolate(method='cubic') #dy_temp = dy.interpolate(method='spline',order=2,limit=10, limit_direction='backward') #dy_int_temp = dy_int.copy() #dy_int_temp.update(dy_temp, overwrite=True) dy_w = dy_int.groupby(pd.Grouper(freq='W')).mean() dy_w = dy_w[dy_w.index>=start] dy_w.fillna(method='bfill', inplace=True) var_no = '1' dy_w.columns = [i[0] for i in dy_w.columns] dy_w = dy_w[index_tickers] dy_w.columns = [var_no+'_'+i for i in dy_w.columns] dy_w = dy_w[dy_w.index>=start] #dy_w.columns = ['1_US_NY','1_US_SPX','1_US_CCMP', '1_DE','1_UK','1_JP','1_CH_SH','1_CH_SZ', '1_TR','1_MX','1_BR','1_RU','1_SA'] dy_w.to_excel('C:/Users/sb0538/Desktop/15022020/excels/1_dividendyield.xlsx')
def get_reach_location(loc1, tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='Location'))['Reach'].sum().reset_index()
    g = tweet.groupby('Location')
    return g.get_group(loc1)
def downsample(time_series: pd.DataFrame, freq: str) -> pd.DataFrame: """ Downsample the given route, stop, or feed time series, (outputs of :func:`.routes.compute_route_time_series`, :func:`.stops.compute_stop_time_series`, or :func:`.miscellany.compute_feed_time_series`, respectively) to the given Pandas frequency string (e.g. '15Min'). Return the given time series unchanged if the given frequency is shorter than the original frequency. """ f = time_series.copy() # Can't downsample to a shorter frequency if f.empty or pd.tseries.frequencies.to_offset( freq) <= pd.tseries.frequencies.to_offset(pd.infer_freq(f.index)): return f result = None if "stop_id" in time_series.columns.names: # It's a stops time series result = f.resample(freq).sum(min_count=1) else: # It's a route or feed time series. inds = [ "num_trips", "num_trip_starts", "num_trip_ends", "service_distance", "service_duration", ] frames = [] # Resample num_trips in a custom way that depends on # num_trips and num_trip_ends def agg_num_trips(group): return group["num_trips"].iloc[-1] + group[ "num_trip_ends"].iloc[:-1].sum(min_count=1) num_trips = f.groupby(pd.Grouper(freq=freq)).apply(agg_num_trips) frames.append(num_trips) # Resample the rest of the indicators via summing, preserving all-NaNs frames.extend([ f[ind].resample(freq).agg(lambda x: x.sum(min_count=1)) for ind in inds[1:] ]) g = pd.concat(frames, axis=1, keys=inds) # Calculate speed and add it to f. Can't resample it. speed = (g.service_distance / g.service_duration).fillna( g.service_distance) speed = pd.concat({"service_speed": speed}, axis=1) result = pd.concat([g, speed], axis=1) # Reset column names and sort the hierarchical columns to allow slicing; # see http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex result.columns.names = f.columns.names result = result.sort_index(axis=1, sort_remaining=True) # Set frequency, which is not automatically set result.index.freq = freq return result
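The `min_count=1` idiom above is what keeps all-missing windows as NaN instead of 0; a standalone pandas illustration with a toy series (assumed half-hourly input frequency):

```python
import numpy as np
import pandas as pd

# Toy half-hourly series with one fully missing hour.
idx = pd.date_range("2020-01-01 00:00", periods=6, freq="30Min")
s = pd.Series([1.0, 2.0, np.nan, np.nan, 3.0, 4.0], index=idx)

# Plain sum() turns the all-NaN 01:00 bucket into 0.0 ...
print(s.resample("1H").sum())
# ... while min_count=1 keeps it as NaN, which downsample() relies on.
print(s.resample("1H").sum(min_count=1))
```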
def get_text_location(loc1, tweet):  # location-based
    tweet = tweet.groupby(pd.Grouper(key='Location'))
    tweet = tweet.get_group(loc1)
    text = tweet['Tweet'].T.tolist()
    return text
def render_body(startdate, enddate, aggregation): df = Body.load_df() mask = (df.date >= startdate) & (df.date <= enddate) df = df.loc[mask] df.index = df.date df = df.groupby(pd.Grouper(freq=aggregation)).mean() y1 = df.weight y2 = df.muscle_mass y3 = df.fat_mass_weight df2 = Total_Energy.load_df() mask = (df2.date >= startdate) & (df2.date <= enddate) df2 = df2.loc[mask] df2.index = df2.date df2 = df2.groupby(pd.Grouper(freq=aggregation)).mean() fig = make_subplots(rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.01) fig.append_trace(go.Scatter( x=df.index, y=y1, mode='lines+markers', line={ "shape": "spline", "color": "#6610f2" }, hoverinfo="text", hovertemplate="<b>Weight:</b> <br> %{y:.2f)} kg<extra></extra>", connectgaps=True), row=1, col=1) fig.append_trace(go.Scatter( x=df.index, y=y2, mode='lines+markers', line={ "shape": "spline", "color": "#20c997" }, hoverinfo="text", hovertemplate="<b>Muscle Mass:</b> <br> %{y:.2f)} kg<extra></extra>", connectgaps=True), row=2, col=1) fig.append_trace( go.Scatter(x=df.index, y=y3, mode='lines+markers', line={ "shape": "spline", "color": "#eb6864" }, hoverinfo="text", hovertemplate= "<b>Fat Mass Weight:</b> <br> %{y:.2f)} kg<extra></extra>", connectgaps=True), row=3, col=1) fig.append_trace(go.Bar(x=df2.index, y=df2.active_energy), row=4, col=1) fig.update_layout(showlegend=False, margin={ "t": 10, "l": 0, "r": 0, "b": 40 }, plot_bgcolor="white", hovermode="x unified", height=700) fig.update_traces(xaxis='x4', row=1, col=1) fig.update_traces(xaxis='x4', row=2, col=1) fig.update_traces(xaxis='x4', row=3, col=1) fig.update_traces(xaxis='x4', row=4, col=1) fig.update_yaxes(range=[y1.min() - 0.1, y1.max() + 1], row=1, col=1) fig.update_yaxes(range=[y2.min() - 0.1, y2.max() + 0.1], row=2, col=1) fig.update_yaxes(range=[y3.min() - 1, y3.max() + 0.1], row=3, col=1) if aggregation == "D" or aggregation == "W": fig.update_xaxes(tickformat="%d-%m-%y") elif aggregation == "M": fig.update_xaxes(tickformat="%m-%Y") elif aggregation == "Y": fig.update_xaxes(tickformat="%Y", ticklabelmode="period") return fig
def get_reach_weekly(tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='TwittedAt', freq='W'))['Reach'].sum().reset_index().sort_values('TwittedAt')
    print(tweet)
['Kg', 'kg', 'Kilo', 'Litre', 'Litres']), commands['CP_QuantiteTotale'] * 1000, np.where( np.logical_and( np.isin(commands['CP_QuantiteUnite'], ['Unités', 'Unité', 'unité', 'unités', 'pièces', 'Pcs']), np.isin(commands['Qty_unit'], ['g'])), commands['Qty_val'] * commands['CP_QuantiteTotale'], float('nan'))) end = time.clock() print('processing time: ', round(end - start), ' seconds') #aggregate df at day x EC x RC x food group level & drop line without quantity in grams print('----> group by ---->') start = time.clock() commands_agg = commands.groupby([ pd.Grouper(key='CO_DateCommande', freq='D'), "EC_Id", "RC_Id", "P_food_group" ]).agg({ "Qty_totale": "sum" }).dropna().reset_index().set_index(["CO_DateCommande", "EC_Id", "RC_Id"]) end = time.clock() print('processing time: ', round(end - start), ' seconds') #create a df with food groups and ids & replacing food groups in main df by ids print('----> food group dictionnary ---->') start = time.clock() food_groups = commands_agg.reset_index()["P_food_group"].drop_duplicates( ).reset_index(drop=True) food_groups_dict = dict(enumerate(food_groups.tolist())) food_groups_inv = {v: k for k, v in food_groups_dict.items()} commands_id = commands_agg.replace(
fubu_df = pd.DataFrame(fubu) # FUBU dates day_list = pd.Series(np.concatenate([ np.arange(1, 367) if calendar.isleap(year) else np.arange(1, 366) for year in fubu_df.index ]), index=pd.date_range('{}-01-01'.format(begin_year), '{}-12-31'.format(end_year), freq='D')) day_list = day_list.loc[sl.start:sl.stop] day_list[:] = np.nan # out = [] for dt, df in day_list.groupby(pd.Grouper(freq='Y')): year = dt.year for metric in metrics: val = fubu_df.loc[year, metric] - 1 ts = pd.Timestamp( datetime.datetime.strptime(str(year) + str(val), '%Y%j')) if ts > day_list.index[0] and ts < day_list.index[-1]: day_list.loc[ts] = annual_dat.loc[ts] plt.figure(figsize=(10, 6)) # plot the 'annual' data plt.plot(annual_dat.values) # plot extended climatology plt.plot(clim_mean) plt.plot(day_list.values, 'bo')
def get_data(data_path="../Dataset/household_power_consumption_data.zip", do_profile=False, with_app_stat=False): df = pd.read_csv(data_path, sep=';', parse_dates={'dt': ['Date', 'Time']}, infer_datetime_format=True, low_memory=False, na_values=['nan', '?'], index_col='dt') print("[Reading Data] : DONE") df.drop([ "Global_active_power", "Global_reactive_power", "Voltage", "Global_intensity" ], axis=1, inplace=True) if do_profile: profile = pandas_profiling.ProfileReport(df) profile.to_file("report.html") print("[Profiling data finished]") #fill nan values with column average for j in range(0, 3): df.iloc[:, j] = df.iloc[:, j].fillna(df.iloc[:, j].mean()) print("[Filling Missing Values] : DONE") df["consumption"] = df.iloc[:, :].sum( axis=1) #consumption -> Energy consumption if with_app_stat: df_with_app_status = appliances_status(df) grouped = df_with_app_status.groupby( pd.Grouper(freq='1h', base=0, label='right')).agg({ "consumption": lambda x: np.sum(x) / 60, "Set1": "any", "Set2": "any", "Set3": "any" }) data = grouped * 1 else: grouped = df["consumption"].groupby( pd.Grouper(freq='1h', base=0, label='right')).sum() data = pd.DataFrame(grouped / 60) data = merge_additional_features(data) print("[Extracting features from timestamp] : DONE") xtrain = data.loc["2006":"2010"] ytrain = xtrain.pop("consumption") xtest = data.loc["2010":] ytest = xtest.pop("consumption") print("[Train Test split] : DONE") return xtrain, ytrain, xtest, ytest
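A condensed illustration of the hourly aggregation used in `get_data`, with fabricated minute-level data; note the snippet above also passes `base=0`, a pandas-era keyword that newer pandas spells `offset`:

```python
import numpy as np
import pandas as pd

# Fabricated minute-level consumption plus a boolean appliance flag.
idx = pd.date_range("2020-01-01 00:00", periods=120, freq="T")
df = pd.DataFrame({"consumption": np.ones(120), "Set1": idx.minute == 0},
                  index=idx)

hourly = df.groupby(pd.Grouper(freq="1h", label="right")).agg({
    "consumption": lambda x: np.sum(x) / 60,  # hourly total divided by 60, as in get_data
    "Set1": "any",                            # was the appliance ever on during the hour?
})
print(hourly * 1)  # multiplying by 1 casts the booleans to 0/1, as in get_data
```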
def CalSimI(r, MultiSiteDict, Setting, Stat, Wth_gen):
    # MultiSiteDict = MultiSiteDict.copy()
    Var = Setting["Var"] + ["P_Occurrence"]  # Add precip event variable
    rSimYear = Setting["MultiSite"]["rSimYear"]
    Stns = Setting["StnID"]
    plot = Setting["Plot"]["Multi_ECDFFittingPlot"]
    DayInMonth = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    SpatialRnNum = {}

    # Simulate spatially correlated RN
    for v in Var:
        # Gen 40 years so the sample size is greater than 1000 for each month,
        # which we consider statistically robust
        Rn = 0
        for y in range(rSimYear):
            for m in range(12):
                day_in_month = DayInMonth[m]
                W = MultiSiteDict["Weight"][v][m + 1]
                # Gen Rn
                if v in ("PP01", "P_Occurrence"):
                    rn = GenMultiRN(r, W, Type="P", Size=day_in_month,
                                    TransformFunc=ECDFFitting(r, W, plot),
                                    HisGen=True)
                else:
                    rn = GenMultiRN(r, W, Type="T", Size=day_in_month,
                                    TransformFunc=Standardization(r, W),
                                    HisGen=True)
                # Add up
                if type(Rn) is int:
                    Rn = rn
                else:
                    Rn = np.concatenate((Rn, rn), axis=1)
        SpatialRnNum[v] = Rn
    MultiSiteDict["SpatialRnNum"] = SpatialRnNum

    # Re-organize the Rn to each stn
    for i, s in enumerate(Stns):
        RnNum = pd.DataFrame()
        for v in Var:
            RnNum[v] = SpatialRnNum[v][i]
        Stat[s]["RnNum"] = RnNum

    # Create the "Setting" dictionary for I-r curve simulation. We need to turn
    # off the leap year option to make sure the output can be iterated.
    Setting_Multi = Setting.copy()
    Setting_Multi["GenYear"] = rSimYear
    Setting_Multi["LeapYear"] = False
    Setting_Multi["Condition"] = True
    for k in list(Setting_Multi["Plot"].keys()):  # Turn off all plotting options
        Setting_Multi["Plot"][k] = False

    # Generate weather data and re-calculate the spatial autocorrelation index.
    # Use a single core here since we already distribute r into different cores/threads.
    Wth_gen, Stat = Generate(Wth_gen, Setting_Multi, Stat, Export=False,
                             ParalCores=1)
    SimI = HisI(MultiSiteDict, Setting, Wth_gen, ForGenWth=False)["HisI"]

    # Calculate monthly mean for establishing the I-r curve
    rng = pd.date_range(pd.Timestamp(2013, 1, 1), pd.Timestamp(2013, 12, 31))
    SimI.index = rng
    SimI = SimI.groupby(pd.Grouper(freq='M')).mean()
    SimI = SimI.reset_index().drop("index", axis=1)
    SimI.index = np.arange(1, 13)
    return SimI
def get_ping_count(conn, year, month, day):
    query = ('select * from testresults_pingresult where time between '
             '"{year}-{month:02}-{day:02} 00:00:00" and "{year}-{month:02}-{day:02} 23:59:59"')
    print('getting {}-{:02}-{:02}'.format(year, month, day))
    df = pd.read_sql_query(query.format(year=year, month=month, day=day), conn)
    df.loc[:, 'time'] = pd.to_datetime(df.loc[:, 'time'])
    df2 = (df.loc[:, ['id', 'nanopi_id', 'state', 'time']]
             .groupby(['nanopi_id', 'state', pd.Grouper(freq='1H', key='time')])
             .count()
             .unstack(level=1)
             .loc[:, 'id']
             .fillna(value=0))
    return df2
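The unstack step above pivots the `state` level into columns; a tiny self-contained version of the same chain with invented ping rows (no database needed):

```python
import pandas as pd

# Invented ping results: one row per ping, with a state label.
df = pd.DataFrame({
    "id": range(6),
    "nanopi_id": [1, 1, 1, 2, 2, 2],
    "state": ["up", "up", "down", "up", "down", "down"],
    "time": pd.to_datetime(["2020-01-01 00:05"] * 3 + ["2020-01-01 01:10"] * 3),
})

counts = (df.groupby(["nanopi_id", "state", pd.Grouper(freq="1H", key="time")])
            .count()
            .unstack(level=1)   # one column per state
            .loc[:, "id"]       # keep the id-count block only
            .fillna(value=0))
print(counts)
```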
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from datetime import datetime
from bokeh.palettes import Spectral3  # @UnresolvedImport
from bokeh.models import BoxAnnotation

output_file('eto_operations.html')

df = pd.read_csv('thor_wwii.csv')

# filter for the European Theater of Operations
filter = df['THEATER'] == 'ETO'
df = df[filter]

df['MSNDATE'] = pd.to_datetime(df['MSNDATE'], format='%m/%d/%Y')
group = df.groupby(pd.Grouper(key='MSNDATE', freq='M'))[
    ['TOTAL_TONS', 'TONS_IC', 'TONS_FRAG']].sum()
group = group / 1000

source = ColumnDataSource(group)

p = figure(x_axis_type="datetime")
p.line(x='MSNDATE', y='TOTAL_TONS', line_width=2, source=source,
       legend='All Munitions')
p.line(x='MSNDATE', y='TONS_FRAG', line_width=2, source=source,
       color=Spectral3[1], legend='Fragmentation')
p.line(x='MSNDATE', y='TONS_IC', line_width=2, source=source,
       color=Spectral3[2], legend='Incendiary')

p.title.text = 'European Theater of Operations'
p.yaxis.axis_label = 'Kilotons of Munitions Dropped'
p.legend.location = 'top_left'
data_show = [] weekend7 = pd.date_range(start='2015-09-05', end='2015-10-31', freq='7D') weekend6 = pd.date_range(start='2015-09-06', end='2015-10-31', freq='7D') low = 0 up = 0 avg_number = 0 median_number = 0 # plt.figure() lr_predict_data = weather_data[len(weather_data) - 61:] del lr_predict_data['leasetime'] for j, i in enumerate(sample_shedid.values, start=1): if i in shedid.values: each_data = data[data.SHEDID == i] each_data['show'] = 1 each_data = each_data[['leasetime', 'show']].set_index('leasetime') each_data = each_data.groupby(pd.Grouper(freq='1D')).sum().fillna(0) data_show.append([i, len(each_data)]) if len(each_data) < (2 * 7): data_merge['time'] = pd.date_range(start='2015-09-01', periods=61, freq='D') data_merge['SHEDID'] = i data_merge['LEASE'] = each_data.values.mean() * 1.25 data_merge.LEASE = data_merge[['time', 'LEASE']].apply( lambda x: 1 * x.LEASE if x.time in weekend6 or x.time in weekend7 else x.LEASE, axis=1) data_merge['time'] = data_merge.apply( lambda x: str(x['time'].year) + str('/') + str(x[ 'time'].month) + str('/') + str(x['time'].day), axis=1)