def resample(self, indexer: Mapping[Hashable, str] = None, skipna=None,
             closed: str = None, label: str = None, base: int = 0,
             keep_attrs: bool = None, loffset=None,
             restore_coord_dims: bool = None, **indexer_kwargs: str):
    """Returns a Resample object for performing resampling operations.

    Handles both downsampling and upsampling. If any intervals contain no
    values from the original object, they will be given the value ``NaN``.

    Parameters
    ----------
    indexer : {dim: freq}, optional
        Mapping from the dimension name to resample frequency.
    skipna : bool, optional
        Whether to skip missing values when aggregating in downsampling.
    closed : 'left' or 'right', optional
        Side of each interval to treat as closed.
    label : 'left' or 'right', optional
        Side of each interval to use for labeling.
    base : int, optional
        For frequencies that evenly subdivide 1 day, the "origin" of the
        aggregated intervals. For example, for '24H' frequency, base could
        range from 0 through 23.
    loffset : timedelta or str, optional
        Offset used to adjust the resampled time labels. Some pandas date
        offset strings are supported.
    keep_attrs : bool, optional
        If True, the object's attributes (`attrs`) will be copied from the
        original object to the new one. If False (default), the new object
        will be returned without attributes.
    restore_coord_dims : bool, optional
        If True, also restore the dimension order of multi-dimensional
        coordinates.
    **indexer_kwargs : {dim: freq}
        The keyword arguments form of ``indexer``. One of indexer or
        indexer_kwargs must be provided.

    Returns
    -------
    resampled : same type as caller
        This object resampled.

    Examples
    --------
    Downsample monthly time-series data to seasonal data:

    >>> da = xr.DataArray(np.linspace(0, 11, num=12),
    ...                   coords=[pd.date_range('15/12/1999',
    ...                           periods=12, freq=pd.DateOffset(months=1))],
    ...                   dims='time')
    >>> da
    <xarray.DataArray (time: 12)>
    array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-15 2000-01-15 2000-02-15 ...
    >>> da.resample(time="QS-DEC").mean()
    <xarray.DataArray (time: 4)>
    array([ 1., 4., 7., 10.])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-01 2000-03-01 2000-06-01 2000-09-01

    Upsample monthly time-series data to daily data:

    >>> da.resample(time='1D').interpolate('linear')
    <xarray.DataArray (time: 337)>
    array([ 0.      ,  0.032258,  0.064516, ..., 10.935484, 10.967742, 11.      ])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-15 1999-12-16 1999-12-17 ...

    Limit scope of upsampling method:

    >>> da.resample(time='1D').nearest(tolerance='1D')
    <xarray.DataArray (time: 337)>
    array([ 0.,  0., nan, ..., nan, 11., 11.])
    Coordinates:
      * time     (time) datetime64[ns] 1999-12-15 1999-12-16 ... 2000-11-15

    References
    ----------
    .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    """  # noqa
    # TODO support non-string indexer after removing the old API.
    from .dataarray import DataArray
    from .resample import RESAMPLE_DIM
    from ..coding.cftimeindex import CFTimeIndex

    if keep_attrs is None:
        keep_attrs = _get_keep_attrs(default=False)

    # note: the second argument (now 'skipna') used to be 'dim'
    if ((skipna is not None and not isinstance(skipna, bool))
            or ("how" in indexer_kwargs and "how" not in self.dims)
            or ("dim" in indexer_kwargs and "dim" not in self.dims)):
        raise TypeError(
            "resample() no longer supports the `how` or "
            "`dim` arguments. Instead call methods on resample "
            "objects, e.g., data.resample(time='1D').mean()")

    indexer = either_dict_or_kwargs(indexer, indexer_kwargs, "resample")
    if len(indexer) != 1:
        raise ValueError(
            "Resampling only supported along single dimensions.")
    dim, freq = next(iter(indexer.items()))

    dim_name = dim
    dim_coord = self[dim]

    if isinstance(self.indexes[dim_name], CFTimeIndex):
        from .resample_cftime import CFTimeGrouper
        grouper = CFTimeGrouper(freq, closed, label, base, loffset)
    else:
        # TODO: to_offset() call required for pandas==0.19.2
        grouper = pd.Grouper(
            freq=freq,
            closed=closed,
            label=label,
            base=base,
            loffset=pd.tseries.frequencies.to_offset(loffset),
        )
    group = DataArray(dim_coord, coords=dim_coord.coords,
                      dims=dim_coord.dims, name=RESAMPLE_DIM)
    resampler = self._resample_cls(
        self,
        group=group,
        dim=dim_name,
        grouper=grouper,
        resample_dim=RESAMPLE_DIM,
        restore_coord_dims=restore_coord_dims,
    )

    return resampler
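For reference, a minimal usage sketch of the two indexer forms this method accepts (keyword and mapping), assuming a plain ``datetime64`` time coordinate; the toy array and frequency below are illustrative only:

```python
import numpy as np
import pandas as pd
import xarray as xr

# Toy monthly series; any datetime64-indexed DataArray behaves the same way.
da = xr.DataArray(
    np.arange(12.0),
    coords=[pd.date_range("2000-01-01", periods=12, freq="MS")],
    dims="time",
)

# The keyword form and the mapping form are merged by either_dict_or_kwargs,
# so these two calls build the same resample object.
kw = da.resample(time="QS").mean()
mp = da.resample({"time": "QS"}).mean()
assert kw.identical(mp)
```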
cfg = yaml.load(f, Loader=yaml.SafeLoader)
# if cfg["Server"]:
#     c = get_client(agent=cfg["Agent_IP"], entity=cfg["Entity_File"])
# else:
c = get_client()

now = datetime.datetime(year=2018, month=6, day=1).replace(tzinfo=pytz.timezone("UTC"))

dataManager = ThermalDataManager(cfg, c)
o = dataManager._get_outside_data(now - datetime.timedelta(days=10), now)
o = dataManager._preprocess_outside_data(o.values())
grouper = o.groupby([pd.Grouper(freq='1H')])
print(grouper['t_out'].mean())

# zo = o.values()[0]
# print(zo.iloc[zo.shape[0] // 2 - 5:])
# print(o)
# print(dataManager._preprocess_outside_data(o.values()))
# o.dropna()
# print("shape of outside data", o.shape)
# print("number of 32 temperatures", (o["t_out"] == 32).sum())
# t = dataManager.thermal_data(days_back=50)
# print(t)
#
# plots the data here.
def import_results_from_gleam( self, sims_dir, regions, *, allow_unfinished=False, resample=None, overwrite=False, info_level=logging.DEBUG, ): """ Import simulation result data from GLEAMViz data/sims dir into the HDF5 file. """ if "new_fraction" in self.hdf and not overwrite: raise Exception(f"Would overwrite existing `new_fraction` in {self}!") sims_df = self.hdf["simulations"] sims_dir = Path(sims_dir) for sid, sim in sims_df.iterrows(): path = sims_dir / f"{sid}.gvh5" / "results.h5" if not path.exists() and not allow_unfinished: raise Exception(f"No gleam result found for {sid} {sim.Name!r}") dfs = [] skipped = set() for sid, sim in sims_df.iterrows(): path = sims_dir / f"{sid}.gvh5" / "results.h5" if not path.exists() and allow_unfinished: log.log(info_level, "Skipping missing result file {} ..".format(path)) continue log.log(info_level, "Loading results from {} ..".format(path)) with tables.File(path) as f: for r in regions: if pd.isnull(r.GleamID): skipped.add(r.DisplayName) continue gtype = LEVEL_TO_GTYPE[r.Level] node = f.get_node(f"/population/new/{gtype}/median/dset") days = pd.date_range(sim.StartDate, periods=node.shape[3], tz="utc") dcols = {} for ci, cn in COMPARTMENTS.items(): new_fraction = node[ci, 0, int(r.GleamID), :] new_fraction = np.expand_dims(new_fraction, 0) idx = pd.MultiIndex.from_tuples( [(sid, r.Code)], names=["SimulationID", "Code"] ) dcols[cn] = pd.DataFrame( new_fraction.astype("float32"), index=idx, columns=pd.Index(days, name="Date"), ).stack() dfs.append(pd.DataFrame(dcols).sort_index()) if skipped: log.info(f"Skipped {len(skipped)} regions without GleamID: {skipped!r}") if not dfs: raise Exception("No GLEAM records loaded!") dfall = pd.concat(dfs) len0 = len(dfall) if resample is not None: dfall = dfall.groupby( [ pd.Grouper(level=0), pd.Grouper(level=1), pd.Grouper(freq=resample, level=2), ] ).mean() self.hdf.put( "new_fraction", dfall, format="table", complib="bzip2", complevel=9 ) log.info(f"Loaded {len0} GLEAM result rows into {self} (resampling {resample})")
def get_text_weekly(week, tweet):  # weekly
    tweet = tweet.groupby(pd.Grouper(key='TwittedAt', freq='W'))  # for monthly analysis, use freq='M'
    tweet = tweet.get_group(week)
    text = tweet['Tweet'].T.tolist()
    return text
def get_tweet_count_weekly(tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='TwittedAt', freq='W'))['Tweet'].count().reset_index().sort_values('TwittedAt')
    print(tweet)
weekly_data(tweet) #In[27]: tweet=cleaned_tweet() #location based analysis x=top_five(top_location(tweet)) print(x) print("\n\n\n") get_location_data(x) #In[101]: #overall analysis tweet=cleaned_tweet() tweet=tweet.groupby( pd.Grouper(key='UserName'))['Tweet'].count().reset_index() tweet.sort_values(["Tweet"], axis=0,ascending=False, inplace=True) print(tweet.head(10)) #top ten most active accounts #In[102]: tweet=cleaned_tweet() tweet.drop_duplicates(subset ="UserName", keep ="last", inplace = True) tweet.sort_values(["Reach"], axis=0,ascending=False, inplace=True) tweet.head(10) #highest contributers #In[103]: tweet=cleaned_tweet() tweet['Reach'].sum() #total reach #In[104]:
def get_tweet_count_location(loc1, tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='Location'))['Tweet'].count().reset_index()
    g = tweet.groupby('Location')
    return g.get_group(loc1)
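For reference, a toy frame showing how the two `pd.Grouper` usages in these helpers behave: with only `key` it is a plain column group-by, while adding `freq` bins the datetime column. Column names mirror the snippets; the data is invented:

```python
import pandas as pd

tweets = pd.DataFrame({
    "TwittedAt": pd.to_datetime(["2021-03-01", "2021-03-02", "2021-03-09"]),
    "Location": ["Delhi", "Mumbai", "Delhi"],
    "Tweet": ["a", "b", "c"],
    "Reach": [10, 20, 30],
})

# Plain column grouping (as in get_tweet_count_location / get_reach_location).
print(tweets.groupby(pd.Grouper(key="Location"))["Tweet"].count())

# Datetime binning by calendar week (as in the *_weekly helpers).
print(tweets.groupby(pd.Grouper(key="TwittedAt", freq="W"))["Reach"].sum())
```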
seasonal_periods=680).fit() # Let's predict 365 days into the future crime_forecast = crime_model.forecast(365) # Print original and then our prediction df2.plot(figsize=(12, 6)) crime_forecast.plot() #xlim=['2015-01-01','2021-10-10']) plt.ylabel('Number of Arrests:False') plt.xlabel('Years') plt.legend(['Actual', "Forecasted"]) # In[ ]: # In[181]: df_True_district = df[df["Arrest"] == True].groupby( [pd.Grouper('District')] + [pd.Grouper(level='timestamp', freq='M')]).size().to_frame() df_True_district.columns = ["Number of Crimes"] #df4['timestamp']= pd.to_datetime(df4['timestamp']) df_True_district # In[182]: df_True_plot = df_True_district.query("District <= 5.0") df_True_plot plt.figure(figsize=(17, 10)) sns.lineplot(data=df_True_plot, x=df_True_plot.index.get_level_values('timestamp'), y="Number of Crimes", hue=df_True_plot.index.get_level_values('District'),
def create_feature(self): n_train = self.train.shape[0] n_test = self.test.shape[0] # aggregate by day self.train.index = pd.to_datetime(self.train["timestamp"]) self.test.index = pd.to_datetime(self.test["timestamp"]) train_agg = self.train.groupby(\ ["building_id", "meter", pd.Grouper(freq="24h")])\ .agg(np.median).reset_index() test_agg = self.test.groupby(\ ["building_id", "meter", pd.Grouper(freq="24h")])\ .agg(np.median).reset_index() # KCPD for each building building_inds = train_agg["building_id"].unique() dfs_train = []; dfs_test = [] for bidx in tqdm(building_inds): df_train = train_agg.query(f"building_id == {bidx}") df_test = test_agg.query(f"building_id == {bidx}") df_train = pd.DataFrame(dict(timestamp=\ pd.date_range(train_agg["timestamp"][0], train_agg["timestamp"][train_agg["timestamp"].size - 1])\ ))\ .merge(df_train, on="timestamp", how="left") df_test = pd.DataFrame(dict(timestamp=\ pd.date_range(test_agg["timestamp"][0], test_agg["timestamp"][test_agg["timestamp"].size - 1])\ ))\ .merge(df_test, on="timestamp", how="left") dfp = df_train.pivot(index="timestamp", columns="meter", values=["meter_reading"]) X = dfp.values dfp_test = df_test.pivot(index="timestamp", columns="meter", values="building_id") dfp["timestamp"] = dfp.index dfp_test["timestamp"] = dfp_test.index # standardize sig = StandardScaler().fit_transform(X) # change-point detection kcpd = DynpKcpd(min_size=7, jump=1, max_n_bkps=50).fit(sig) bkps = kcpd.predict(sig, beta=0.1)["best_bkps"] bkps.insert(0, 0) # add segment label segment_label = np.repeat(range(len(bkps) - 1),\ np.diff(np.array(bkps))) # import pdb; pdb.set_trace() dfp["segment"] = segment_label # NOTE: 2016 is leap year segment_label = np.delete(segment_label, 31 + 29 - 1) dfp_test["segment"] = segment_label.tolist() * 2 df_train = df_train.merge(dfp["segment"], on="timestamp", how="left") df_test = df_test.merge(dfp_test["segment"], on="timestamp", how="left") dfs_train.append(df_train) dfs_test.append(df_test) del dfp del dfp_test del X del sig del kcpd del bkps del segment_label gc.collect() del df_train del df_test gc.collect() dfs_train = pd.concat(dfs_train, axis=0) dfs_train.index = dfs_train["timestamp"] dfs_train = dfs_train.groupby(["building_id", "meter"])\ .resample("H").ffill()["segment"].reset_index() dfs_test = pd.concat(dfs_test, axis=0) dfs_test.index = dfs_test["timestamp"] dfs_test = dfs_test.groupby(["building_id", "meter"])\ .resample("H").ffill()["segment"].reset_index() self.train.index.names = ["date"] self.test.index.names = ["date"] dfs_train["building_id"] = dfs_train["building_id"].astype(np.int16) dfs_train["meter"] = dfs_train["meter"].astype(np.int16) dfs_train["segment"] = dfs_train["segment"].fillna(method="ffill") dfs_test["building_id"] = dfs_test["building_id"].astype(np.int16) dfs_test["meter"] = dfs_test["meter"].astype(np.int16) dfs_test["segment"] = dfs_test["segment"].fillna(method="ffill") # merge segmentation label train = self.train.merge(dfs_train,\ on=["building_id", "timestamp", "meter"], how="left") test = self.test.merge(dfs_test,\ on=["building_id", "timestamp", "meter"], how="left") train["segment"] = train["segment"].fillna(method="ffill") test["segment"] = test["segment"].fillna(method="ffill") assert train.shape[0] == n_train, f"length must be the same. original:{n_train}, processed:{self.train.shape[0]}" assert test.shape[0] == n_test, f"length must be the same. original:{n_test}, processed:{self.test.shape[0]}" return train["segment"], test["segment"]
table_invokana, 'prov_prescribing_npi', 'New to Brand Providers of Invokana') frac_timeline, new_to_invokana_patients = unique_adopters_plot( table_invokana, 'hvid', 'New to Brand Patients of Invokana') frac_timeline, new_to_trulicity_providers = unique_adopters_plot( table_trulicity, 'prov_prescribing_npi', 'New to Brand Providers of Trulicity') frac_timeline, new_to_trulicity_patients = unique_adopters_plot( table_trulicity, 'hvid', 'New to Brand Patients of Trulicity') ### Analytics - Total volumne dispensed by month # The columns "dispensed_quantity" and "days_supply" are the same plt.figure(figsize=(8, 6)) table_invokana[['date_service', 'dispensed_quantity' ]].groupby([pd.Grouper(freq='1M', key='date_service') ]).sum().plot(figsize=(6, 4), title='Total Volume for Invokana', legend=False, fontsize=12) table_trulicity[['date_service', 'dispensed_quantity' ]].groupby([pd.Grouper(freq='1M', key='date_service') ]).sum().plot(figsize=(6, 4), title='Total Volume for Trulicity', legend=False, fontsize=12) ##### Analytics Counts of Refills authorized plt.figure(figsize=(8, 6)) refill_auth = table_invokana.refill_auth_amount.value_counts() plt.scatter(refill_auth.index, refill_auth.values, label='Drug I')
dayfirst=True) bila['Patient'] = bila['Patient'].map(pdict) ### milk composition according to Michaelsen etal (1990), Macronutrient (g/dL) and energy (kcal/dL) milk = pd.DataFrame({ 'Group': ['protein', 'fat', 'lactose', 'energy'], "Median": [.9, 3.6, 7.2, 67], '-2std': [.6, 1.8, 6.4, 17], '+2std': [1.4, 8.9, 7.6, 117] }) milk.set_index('Group', inplace=True) bila['Protein'] = bila.SummeEnteral * milk.loc['protein', 'Median'] / 10 bila['Fat'] = bila.SummeEnteral * milk.loc['fat', 'Median'] / 10 bila['Lactose'] = bila.SummeEnteral * milk.loc['lactose', 'Median'] / 10 bila['Energy'] = bila.SummeEnteral * milk.loc[ 'energy', "Median"] / 10 ## cause it is kcal/dl bila = bila.groupby(["Patient", pd.Grouper(key='Date', freq='D')]).sum().reset_index() bila['DoL'] = bila.groupby('Patient')['Date'].transform( lambda x: x - x.min() + pd.Timedelta(days=0)).dt.days #bila["Timepoint"] = pd.cut(anti.Age, bins=[0,2,5,9,16, np.inf], labels=False).apply(lambda x: x+1 ) bila = bila[bila.Energy > 0] #bila.to_csv('/media/christos/ssd/work/Infants/tmp/feeding.tsv',sep='\t', index=False) meta.reset_index(inplace=True) enter = meta.merge(nut, on=['Patient', 'Date'], how='left')[['Patient', 'Age', 'Ratio', 'Timepoint']] #enter['DoL'] = enter.groupby('Patient')['Date'].transform(lambda x: x-x.min()+pd.Timedelta(days=1)).dt.days enter['Enteral_feeding'] = enter.Ratio.astype(str).apply( lambda x: int(x.split(":")[1]) ) ### this is the percentage of enteral feeding received for this day enter.drop(columns=['Ratio'], inplace=True) meta = meta.merge(enter.drop_duplicates(['Patient', 'Age']),
###################################################################
# Group by patient and sum
agg = aux.groupby('patient').sum()

# Show
if TERMINAL:
    print("\nOut:")
    print(agg)
agg

###################################################################
# Group by patient (2 days) and aggregate
agg = aux.groupby(by=['patient', pd.Grouper(freq='2D')]) \
         .agg(['mean', 'max'])
#        .agg({'idx': ['first', 'last'],
#              0: [skew, kurtosis, own],
#              1: [skew, kurtosis, own],
#              '0_hr': [own],
#              '0_rr': [own]})

# Show
if TERMINAL:
    print("\nOut:")
    print(agg)
agg


def f(x):
fields=['CLOSE'], start_date=firstday, end_date=today, interval='weekly') #Uk # 3 months #short_term_tickers = ['US3MT=RR' ,'GB3MT=RR','TR1YT=RR','ZA3MT=RR','JP3MT=RR','MX3MT=RR','RU3MT=RR','BR1YT=RR','DE3MT=RR','HK3MT=RR'] #short_term = ek.get_timeseries( short_term_tickers, fields=['CLOSE'], start_date='2000-01-01', end_date='2019-12-11', interval='weekly') #Uk short_terms = pd.concat([ short_term_us, short_term_de, short_term_uk, short_term_jp, short_term_ch, short_term_tr, short_term_mx, short_term_br, short_term_ru, short_term_sa ], axis=1) short_terms_int = short_terms.interpolate(method='linear') short_terms_int_w = ((short_terms_int / 100) + 1)**(1 / 52) - 1 short_terms_int_w = short_terms_int_w[short_terms_int_w.index >= start] short_terms_int_w = short_terms_int_w.groupby(pd.Grouper(freq='W')).last() short_terms_int_w.columns = [ '2_US', '2_GER', '2_UK', '2_JP', '2_CH', '2_TR', '2_MX', '2_BR', '2_RU', '2_SA' ] short_terms_int_w.to_excel( 'C:/Users/sb0538/Desktop/15022020/excels/2_treasurybillrates.xlsx')
def create_alerts(self, anomalies, data, fitbit_oldProtocol_hr, k): """ # creates alerts at every 24 hours and send at 9PM. # visualise alerts """ # function to assign different alert names # summarize hourly alerts def alert_types(alert): if alert['alerts'] >= 6: return 'RED' elif alert['alerts'] >= 1: return 'YELLOW' else: return 'GREEN' # summarize hourly alerts #anomalies.columns = ['datetime', 'std.rhr', 'name'] anomalies = anomalies[['datetime']] anomalies['datetime'] = pd.to_datetime(anomalies['datetime'], errors='coerce') anomalies['alerts'] = 1 anomalies = anomalies.set_index('datetime') anomalies = anomalies[~anomalies.index.duplicated(keep='first')] anomalies = anomalies.sort_index() alerts = anomalies.groupby(pd.Grouper(freq='24H', base=21)).cumsum() # apply alert_types function alerts['alert_type'] = alerts.apply(alert_types, axis=1) alerts_reset = alerts.reset_index() #print(alerts_reset) # save alerts #alerts.to_csv(myphd_id_alerts, mode='a', header=True) # summarize hourly alerts to daily alerts daily_alerts = alerts_reset.resample('24H', on='datetime', base=21, label='right').count() daily_alerts = daily_alerts.drop(['datetime'], axis=1) #print(daily_alerts) # function to assign different alert names def alert_types(alert): if alert['alert_type'] >= 6: return 'RED' elif alert['alert_type'] >= 1: return 'YELLOW' else: return 'GREEN' # apply alert_types function daily_alerts['alert_type'] = daily_alerts.apply(alert_types, axis=1) # merge missing 'datetime' with 'alerts' as zero aka GREEN data1 = data[['index']] data1['alert_type'] = 0 data1 = data1.rename(columns={"index": "datetime"}) data1['datetime'] = pd.to_datetime(data1['datetime'], errors='coerce') data1 = data1.resample('24H', on='datetime', base=21, label='right').count() data1 = data1.drop(data1.columns[[0, 1]], axis=1) data1 = data1.reset_index() data1['alert_type'] = 0 data3 = pd.merge(data1, daily_alerts, on='datetime', how='outer') data4 = data3[['datetime', 'alert_type_y']] data4 = data4.rename(columns={"alert_type_y": "alert_type"}) daily_alerts = data4.fillna("GREEN") daily_alerts = daily_alerts.set_index('datetime') daily_alerts = daily_alerts.sort_index() # merge alerts with main data and pass 'NA' when there is a missing day instead of 'GREEN' df_hr = pd.read_csv(fitbit_oldProtocol_hr) df_hr['datetime'] = pd.to_datetime(df_hr['datetime'], errors='coerce') df_hr = df_hr.resample('24H', on='datetime', base=21, label='right').mean() df_hr = df_hr.reset_index() df_hr = df_hr.set_index('datetime') df_hr.index.name = None df_hr.index = pd.to_datetime(df_hr.index) df3 = pd.merge(df_hr, daily_alerts, how='outer', left_index=True, right_index=True) df3 = df3[df3.alert_type.notnull()] df3.loc[df3.heartrate.isna(), 'alert_type'] = pd.NA daily_alerts = df3.drop('heartrate', axis=1) daily_alerts = daily_alerts.reset_index() daily_alerts = daily_alerts.rename(columns={"index": "datetime"}) daily_alerts.to_csv("_" + str(round(lst[k], 1)) + "_" + myphd_id_alerts, na_rep='NA', header=True) # visualize hourly alerts #colors = {'RED': 'red', 'YELLOW': 'yellow', 'GREEN': ''} #ax = alerts['alerts'].plot(kind='bar', color=[colors[i] for i in alerts['alert_type']],figsize=(20,4)) #ax.set_ylabel('No.of Alerts \n', fontsize = 14) # Y label #ax.axvline(pd.to_datetime(symptom_date), color='grey', zorder=1, linestyle='--', marker="v" ) # Symptom date #ax.axvline(pd.to_datetime(diagnosis_date), color='purple',zorder=1, linestyle='--', marker="v") # Diagnosis date #plt.xticks(fontsize=4, rotation=90) #plt.tight_layout() 
        #ax.figure.savefig(myphd_id_figure2, bbox_inches = "tight")

        return daily_alerts
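A stripped-down view of the 24-hour bucketing that `create_alerts` relies on, with synthetic anomaly timestamps. The snippet above anchors each daily bin at 21:00 via the older `base=21` keyword; this sketch uses the newer `offset` spelling of the same idea:

```python
import pandas as pd

# Synthetic anomaly timestamps, one alert each.
anoms = pd.DataFrame(
    {"alerts": 1},
    index=pd.to_datetime(["2024-01-01 20:00", "2024-01-01 22:00",
                          "2024-01-02 03:00", "2024-01-02 22:30"]),
)

# Cumulative alert count within each 21:00-to-21:00 window.
print(anoms.groupby(pd.Grouper(freq="24H", offset="21h")).cumsum())
```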
def point_interp_ts(df, time_col, x_col, y_col, data_col, point_shp, point_site_col, from_crs, to_crs=None, interp_fun='cubic', agg_ts_fun=None, period=None, digits=2): """ Function to take a dataframe of z values and interate through and resample both in time and space. Returns a DataFrame structured like df. Parameters ---------- df: DataFrame DataFrame containing four columns as shown in the below parameters. time_col: str The time column name. x_col: str The x column name. y_col: str The y column name. data_col: str The data column name. point_shp: str or GeoDataFrame Path to shapefile of points to be interpolated or a GeoPandas GeoDataFrame. point_site_col: str The column name of the site names/numbers of the point_shp. grid_res: int The resulting grid resolution in meters (or the unit of the final projection). from_crs: int or str or None The projection info for the input data if the result should be reprojected to the to_crs projection (either a proj4 str or epsg int). to_crs: int or str The projection for the output data similar to from_crs. interp_fun: str The scipy Rbf interpolation function to be applied (see https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.interpolate.Rbf.html). agg_ts_fun: str or None The pandas time series resampling function to resample the data in time (either 'mean' or 'sum'). If None, then no time resampling. period: str or None The pandas time series code to resample the data in time (i.e. '2H' for two hours). digits: int the number of digits to round. Returns ------- DataFrame """ #### Read in points if isinstance(point_shp, str) & isinstance(point_site_col, str): points = gpd.read_file(point_shp)[[point_site_col, 'geometry']] to_crs1 = points.crs elif isinstance(point_shp, gpd.GeoDataFrame) & isinstance( point_site_col, str): points = point_shp[[point_site_col, 'geometry']] to_crs1 = points.crs else: raise ValueError( 'point_shp must be a str path to a shapefile or a GeoDataFrame and point_site_col must be a str.' 
        )

    #### Create the grids
    df1 = df.copy()

    #### Resample the time series data
    if agg_ts_fun is not None:
        df1a = df1.set_index(time_col)
        if agg_ts_fun == 'sum':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].sum().reset_index()
        elif agg_ts_fun == 'mean':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].mean().reset_index()
        else:
            raise ValueError("agg_ts_fun should be either 'sum' or 'mean'.")
        time = df2[time_col].unique()
    else:
        df2 = df1
        time = df2[time_col].sort_values().unique()

    #### Convert input data to crs of points shp and create input xy
    data1 = df2.loc[df2[time_col] == time[0]]
    from_crs1 = convert_crs(from_crs, pass_str=True)
    if to_crs is not None:
        to_crs1 = convert_crs(to_crs, pass_str=True)
        points = points.to_crs(to_crs1)
    geometry = [Point(xy) for xy in zip(data1[x_col], data1[y_col])]
    gpd1 = gpd.GeoDataFrame(data1.index, geometry=geometry, crs=from_crs1)
    gpd2 = gpd1.to_crs(crs=to_crs1)
    x = gpd2.geometry.apply(lambda p: p.x).round(digits).values
    y = gpd2.geometry.apply(lambda p: p.y).round(digits).values
    xy = np.column_stack((x, y))

    #### Prepare the x and y of the points geodataframe output
    x_int = points.geometry.apply(lambda p: p.x).round(digits).values
    y_int = points.geometry.apply(lambda p: p.y).round(digits).values
    sites = points[point_site_col]
    xy_int = np.column_stack((x_int, y_int))

    #### Create new df
    sites_ar = np.tile(sites, len(time))
    time_ar = np.repeat(time, len(xy_int))
    x_ar = np.tile(x_int, len(time))
    y_ar = np.tile(y_int, len(time))
    new_df = pd.DataFrame({
        'site': sites_ar,
        'time': time_ar,
        'x': x_ar,
        'y': y_ar,
        data_col: np.repeat(0, len(time) * len(xy_int))
    })

    new_lst = []
    for t in pd.to_datetime(time):
        set1 = df2.loc[df2[time_col] == t, data_col]
        new_z = griddata(xy, set1.values, xy_int,
                         method=interp_fun).round(digits)
        new_z[new_z < 0] = 0
        new_lst.extend(new_z.tolist())
        # print(t)
    new_df.loc[:, data_col] = new_lst

    #### Export results
    return new_df[new_df[data_col].notnull()]
def rotate_to_run(m, avp): """ Correct tilt and align with horizontal streamline over a single run Adapted from rotate_to_run.m by IMB July 2006 references: Wilczak et al. 2001: sonic anemometer tilt corrections. BLM, 99, 127-150 (but beware typos in equations) Kaimal & Finnigan, 1994: Atmospheric Boundary Layer Flows: Their Structure and Measurement. Oxford University Press van Dijk et al. 2004: The principles of surface flux physics: theory, practice and description of the ECPACK library www.met.wau.nl/projects/jep Parameters: m : Unrotated metek data structure. avp : Length of a single run (minutes). i.e. averaging period. Returns: m_out : Wind components in streamline oriented reference frame """ # First rotate to align x-axis with mean wind direction in sonic's # reference frame m_g = m.groupby(pd.Grouper( freq='%sMin' % avp)) # Split into single runs (of length avp minutes) m_out = pd.DataFrame( columns=['x', 'y', 'z', 'T', 'u', 'v', 'w', 'theta', 'phi']) # Loop through each run to perform correction: for group in m_g: m = group[1] # First rotate to align x-axis with mean wind direction in sonic's # reference frame theta = np.arctan2(np.mean(m['y']), np.mean(m['x'])) u1 = m['x'] * np.cos(theta) + m['y'] * np.sin(theta) v1 = -m['x'] * np.sin(theta) + m['y'] * np.cos(theta) w1 = m['z'] # Next rotate u and w so that x-axis lies along mean streamline and # mean(w) is zero phi = np.arctan2(np.mean(w1), np.mean(u1)) m['u'] = u1 * np.cos(phi) + w1 * np.sin(phi) m['v'] = v1 m['w'] = -u1 * np.sin(phi) + w1 * np.cos(phi) # Theta is angle of rotation um-to-vm (anticlockwise or righthanded) # to align u with mean wind (degrees) m['theta'] = theta * 180 / np.pi # phi is tilt angle (+ve tilts x-axis upwards) to align x-axis with # mean streamline and force <w>=0 m['phi'] = phi * 180 / np.pi m_out = m_out.append(m) return m_out
def grid_interp_ts(df, time_col, x_col, y_col, data_col, grid_res,
                   from_crs=None, to_crs=2193, interp_fun='cubic',
                   agg_ts_fun=None, period=None, digits=2):
    """
    Function to take a dataframe of z values and iterate through and resample
    both in time and space. Returns a DataFrame structured like df.

    Parameters
    ----------
    df: DataFrame
        DataFrame containing four columns as shown in the below parameters.
    time_col: str
        The time column name.
    x_col: str
        The x column name.
    y_col: str
        The y column name.
    data_col: str
        The data column name.
    grid_res: int
        The resulting grid resolution in meters (or the unit of the final
        projection).
    from_crs: int or str or None
        The projection info for the input data if the result should be
        reprojected to the to_crs projection (either a proj4 str or epsg int).
    to_crs: int or str
        The projection for the output data similar to from_crs.
    interp_fun: str
        The scipy Rbf interpolation function to be applied (see
        https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.interpolate.Rbf.html).
    agg_ts_fun: str or None
        The pandas time series resampling function to resample the data in
        time (either 'mean' or 'sum'). If None, then no time resampling.
    period: str or None
        The pandas time series code to resample the data in time (i.e. '2H'
        for two hours).
    digits: int
        The number of digits to round.

    Returns
    -------
    DataFrame
    """
    #### Create the grids
    df1 = df.copy()

    #### Resample the time series data
    if agg_ts_fun is not None:
        df1a = df1.set_index(time_col)
        if agg_ts_fun == 'sum':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].sum().reset_index()
        elif agg_ts_fun == 'mean':
            df2 = df1a.groupby(
                [pd.Grouper(freq=period), pd.Grouper(y_col), pd.Grouper(x_col)])[data_col].mean().reset_index()
        else:
            raise ValueError("agg_ts_fun should be either 'sum' or 'mean'.")
        time = df2[time_col].unique()
    else:
        df2 = df1
        time = df2[time_col].sort_values().unique()

    if from_crs is None:
        x = df2.loc[df2[time_col] == time[0], x_col].values
        y = df2.loc[df2[time_col] == time[0], y_col].values
    else:
        data1 = df2.loc[df2[time_col] == time[0]]
        from_crs1 = convert_crs(from_crs, pass_str=True)
        to_crs1 = convert_crs(to_crs, pass_str=True)
        geometry = [Point(xy) for xy in zip(data1[x_col], data1[y_col])]
        gpd1 = gpd.GeoDataFrame(data1.index, geometry=geometry, crs=from_crs1)
        gpd2 = gpd1.to_crs(crs=to_crs1)
        x = gpd2.geometry.apply(lambda p: p.x).round(digits).values
        y = gpd2.geometry.apply(lambda p: p.y).round(digits).values

    xy = np.column_stack((x, y))

    max_x = x.max()
    min_x = x.min()
    max_y = y.max()
    min_y = y.min()

    new_x = np.arange(min_x, max_x, grid_res)
    new_y = np.arange(min_y, max_y, grid_res)
    x_int, y_int = np.meshgrid(new_x, new_y)

    #### Create new df
    x_int2 = x_int.flatten()
    y_int2 = y_int.flatten()
    xy_int = np.column_stack((x_int2, y_int2))
    time_df = np.repeat(time, len(x_int2))
    x_df = np.tile(x_int2, len(time))
    y_df = np.tile(y_int2, len(time))
    new_df = pd.DataFrame({
        'time': time_df,
        'x': x_df,
        'y': y_df,
        data_col: np.repeat(0, len(time) * len(x_int2))
    })

    new_lst = []
    for t in pd.to_datetime(time):
        set1 = df2.loc[df2[time_col] == t, data_col]
        # index = new_df[new_df['time'] == t].index
        new_z = griddata(xy, set1.values, xy_int,
                         method=interp_fun).round(digits)
        new_z[new_z < 0] = 0
        new_lst.extend(new_z.tolist())
        # print(t)
    new_df.loc[:, data_col] = new_lst

    #### Export results
    return new_df[new_df[data_col].notnull()]
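A minimal smoke test of `grid_interp_ts` under the simplest settings (no reprojection, no time resampling), assuming the function and its numpy/scipy dependencies are already in scope; the toy frame and 500 m resolution are made up for illustration:

```python
import pandas as pd

# Hypothetical toy input: two time steps, four stations on a 1 km square.
df = pd.DataFrame({
    "time": pd.to_datetime(["2020-01-01"] * 4 + ["2020-01-02"] * 4),
    "x": [0, 1000, 0, 1000] * 2,
    "y": [0, 0, 1000, 1000] * 2,
    "precip": [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 5.0],
})

# from_crs=None skips the geopandas reprojection branch; agg_ts_fun=None
# skips time resampling, so only the scipy griddata interpolation runs.
out = grid_interp_ts(df, time_col="time", x_col="x", y_col="y",
                     data_col="precip", grid_res=500,
                     from_crs=None, agg_ts_fun=None)
print(out.head())
```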
con.start() index_tickers = ['NYA Index', 'SPX Index', 'CCMP Index','NDX Index','CDAX Index' ,'DAX Index', 'ASX Index','UKX Index', 'TPX Index','NKY Index', 'SHCOMP Index' , 'SZCOMP Index','XUTUM Index','XU100 Index', 'MEXBOL Index', 'IBOV Index', 'IMOEX Index' , 'JALSH Index'] #Gross Aggregate Dividend Yield dy = con.bdh(index_tickers,['GROSS AGGTE DVD YLD'], firstday, today) dy_int = dy.interpolate(method='cubic') #dy_temp = dy.interpolate(method='spline',order=2,limit=10, limit_direction='backward') #dy_int_temp = dy_int.copy() #dy_int_temp.update(dy_temp, overwrite=True) dy_w = dy_int.groupby(pd.Grouper(freq='W')).mean() dy_w = dy_w[dy_w.index>=start] dy_w.fillna(method='bfill', inplace=True) var_no = '1' dy_w.columns = [i[0] for i in dy_w.columns] dy_w = dy_w[index_tickers] dy_w.columns = [var_no+'_'+i for i in dy_w.columns] dy_w = dy_w[dy_w.index>=start] #dy_w.columns = ['1_US_NY','1_US_SPX','1_US_CCMP', '1_DE','1_UK','1_JP','1_CH_SH','1_CH_SZ', '1_TR','1_MX','1_BR','1_RU','1_SA'] dy_w.to_excel('C:/Users/sb0538/Desktop/15022020/excels/1_dividendyield.xlsx')
def get_reach_location(loc1, tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='Location'))['Reach'].sum().reset_index()
    g = tweet.groupby('Location')
    return g.get_group(loc1)
def downsample(time_series: pd.DataFrame, freq: str) -> pd.DataFrame: """ Downsample the given route, stop, or feed time series, (outputs of :func:`.routes.compute_route_time_series`, :func:`.stops.compute_stop_time_series`, or :func:`.miscellany.compute_feed_time_series`, respectively) to the given Pandas frequency string (e.g. '15Min'). Return the given time series unchanged if the given frequency is shorter than the original frequency. """ f = time_series.copy() # Can't downsample to a shorter frequency if f.empty or pd.tseries.frequencies.to_offset( freq) <= pd.tseries.frequencies.to_offset(pd.infer_freq(f.index)): return f result = None if "stop_id" in time_series.columns.names: # It's a stops time series result = f.resample(freq).sum(min_count=1) else: # It's a route or feed time series. inds = [ "num_trips", "num_trip_starts", "num_trip_ends", "service_distance", "service_duration", ] frames = [] # Resample num_trips in a custom way that depends on # num_trips and num_trip_ends def agg_num_trips(group): return group["num_trips"].iloc[-1] + group[ "num_trip_ends"].iloc[:-1].sum(min_count=1) num_trips = f.groupby(pd.Grouper(freq=freq)).apply(agg_num_trips) frames.append(num_trips) # Resample the rest of the indicators via summing, preserving all-NaNs frames.extend([ f[ind].resample(freq).agg(lambda x: x.sum(min_count=1)) for ind in inds[1:] ]) g = pd.concat(frames, axis=1, keys=inds) # Calculate speed and add it to f. Can't resample it. speed = (g.service_distance / g.service_duration).fillna( g.service_distance) speed = pd.concat({"service_speed": speed}, axis=1) result = pd.concat([g, speed], axis=1) # Reset column names and sort the hierarchical columns to allow slicing; # see http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex result.columns.names = f.columns.names result = result.sort_index(axis=1, sort_remaining=True) # Set frequency, which is not automatically set result.index.freq = freq return result
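The `min_count=1` idiom above is what keeps all-missing windows as NaN instead of 0; a standalone pandas illustration with a toy series (assumed half-hourly input frequency):

```python
import numpy as np
import pandas as pd

# Toy half-hourly series with one fully missing hour.
idx = pd.date_range("2020-01-01 00:00", periods=6, freq="30Min")
s = pd.Series([1.0, 2.0, np.nan, np.nan, 3.0, 4.0], index=idx)

# Plain sum() turns the all-NaN 01:00 bucket into 0.0 ...
print(s.resample("1H").sum())
# ... while min_count=1 keeps it as NaN, which downsample() relies on.
print(s.resample("1H").sum(min_count=1))
```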
def get_text_location(loc1, tweet):  # location-based
    tweet = tweet.groupby(pd.Grouper(key='Location'))
    tweet = tweet.get_group(loc1)
    text = tweet['Tweet'].T.tolist()
    return text
def render_body(startdate, enddate, aggregation): df = Body.load_df() mask = (df.date >= startdate) & (df.date <= enddate) df = df.loc[mask] df.index = df.date df = df.groupby(pd.Grouper(freq=aggregation)).mean() y1 = df.weight y2 = df.muscle_mass y3 = df.fat_mass_weight df2 = Total_Energy.load_df() mask = (df2.date >= startdate) & (df2.date <= enddate) df2 = df2.loc[mask] df2.index = df2.date df2 = df2.groupby(pd.Grouper(freq=aggregation)).mean() fig = make_subplots(rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.01) fig.append_trace(go.Scatter( x=df.index, y=y1, mode='lines+markers', line={ "shape": "spline", "color": "#6610f2" }, hoverinfo="text", hovertemplate="<b>Weight:</b> <br> %{y:.2f)} kg<extra></extra>", connectgaps=True), row=1, col=1) fig.append_trace(go.Scatter( x=df.index, y=y2, mode='lines+markers', line={ "shape": "spline", "color": "#20c997" }, hoverinfo="text", hovertemplate="<b>Muscle Mass:</b> <br> %{y:.2f)} kg<extra></extra>", connectgaps=True), row=2, col=1) fig.append_trace( go.Scatter(x=df.index, y=y3, mode='lines+markers', line={ "shape": "spline", "color": "#eb6864" }, hoverinfo="text", hovertemplate= "<b>Fat Mass Weight:</b> <br> %{y:.2f)} kg<extra></extra>", connectgaps=True), row=3, col=1) fig.append_trace(go.Bar(x=df2.index, y=df2.active_energy), row=4, col=1) fig.update_layout(showlegend=False, margin={ "t": 10, "l": 0, "r": 0, "b": 40 }, plot_bgcolor="white", hovermode="x unified", height=700) fig.update_traces(xaxis='x4', row=1, col=1) fig.update_traces(xaxis='x4', row=2, col=1) fig.update_traces(xaxis='x4', row=3, col=1) fig.update_traces(xaxis='x4', row=4, col=1) fig.update_yaxes(range=[y1.min() - 0.1, y1.max() + 1], row=1, col=1) fig.update_yaxes(range=[y2.min() - 0.1, y2.max() + 0.1], row=2, col=1) fig.update_yaxes(range=[y3.min() - 1, y3.max() + 0.1], row=3, col=1) if aggregation == "D" or aggregation == "W": fig.update_xaxes(tickformat="%d-%m-%y") elif aggregation == "M": fig.update_xaxes(tickformat="%m-%Y") elif aggregation == "Y": fig.update_xaxes(tickformat="%Y", ticklabelmode="period") return fig
def get_reach_weekly(tweet):
    tweet = tweet.groupby(
        pd.Grouper(key='TwittedAt', freq='W'))['Reach'].sum().reset_index().sort_values('TwittedAt')
    print(tweet)
['Kg', 'kg', 'Kilo', 'Litre', 'Litres']), commands['CP_QuantiteTotale'] * 1000, np.where( np.logical_and( np.isin(commands['CP_QuantiteUnite'], ['Unités', 'Unité', 'unité', 'unités', 'pièces', 'Pcs']), np.isin(commands['Qty_unit'], ['g'])), commands['Qty_val'] * commands['CP_QuantiteTotale'], float('nan'))) end = time.clock() print('processing time: ', round(end - start), ' seconds') #aggregate df at day x EC x RC x food group level & drop line without quantity in grams print('----> group by ---->') start = time.clock() commands_agg = commands.groupby([ pd.Grouper(key='CO_DateCommande', freq='D'), "EC_Id", "RC_Id", "P_food_group" ]).agg({ "Qty_totale": "sum" }).dropna().reset_index().set_index(["CO_DateCommande", "EC_Id", "RC_Id"]) end = time.clock() print('processing time: ', round(end - start), ' seconds') #create a df with food groups and ids & replacing food groups in main df by ids print('----> food group dictionnary ---->') start = time.clock() food_groups = commands_agg.reset_index()["P_food_group"].drop_duplicates( ).reset_index(drop=True) food_groups_dict = dict(enumerate(food_groups.tolist())) food_groups_inv = {v: k for k, v in food_groups_dict.items()} commands_id = commands_agg.replace(
fubu_df = pd.DataFrame(fubu) # FUBU dates day_list = pd.Series(np.concatenate([ np.arange(1, 367) if calendar.isleap(year) else np.arange(1, 366) for year in fubu_df.index ]), index=pd.date_range('{}-01-01'.format(begin_year), '{}-12-31'.format(end_year), freq='D')) day_list = day_list.loc[sl.start:sl.stop] day_list[:] = np.nan # out = [] for dt, df in day_list.groupby(pd.Grouper(freq='Y')): year = dt.year for metric in metrics: val = fubu_df.loc[year, metric] - 1 ts = pd.Timestamp( datetime.datetime.strptime(str(year) + str(val), '%Y%j')) if ts > day_list.index[0] and ts < day_list.index[-1]: day_list.loc[ts] = annual_dat.loc[ts] plt.figure(figsize=(10, 6)) # plot the 'annual' data plt.plot(annual_dat.values) # plot extended climatology plt.plot(clim_mean) plt.plot(day_list.values, 'bo')
def get_data(data_path="../Dataset/household_power_consumption_data.zip", do_profile=False, with_app_stat=False): df = pd.read_csv(data_path, sep=';', parse_dates={'dt': ['Date', 'Time']}, infer_datetime_format=True, low_memory=False, na_values=['nan', '?'], index_col='dt') print("[Reading Data] : DONE") df.drop([ "Global_active_power", "Global_reactive_power", "Voltage", "Global_intensity" ], axis=1, inplace=True) if do_profile: profile = pandas_profiling.ProfileReport(df) profile.to_file("report.html") print("[Profiling data finished]") #fill nan values with column average for j in range(0, 3): df.iloc[:, j] = df.iloc[:, j].fillna(df.iloc[:, j].mean()) print("[Filling Missing Values] : DONE") df["consumption"] = df.iloc[:, :].sum( axis=1) #consumption -> Energy consumption if with_app_stat: df_with_app_status = appliances_status(df) grouped = df_with_app_status.groupby( pd.Grouper(freq='1h', base=0, label='right')).agg({ "consumption": lambda x: np.sum(x) / 60, "Set1": "any", "Set2": "any", "Set3": "any" }) data = grouped * 1 else: grouped = df["consumption"].groupby( pd.Grouper(freq='1h', base=0, label='right')).sum() data = pd.DataFrame(grouped / 60) data = merge_additional_features(data) print("[Extracting features from timestamp] : DONE") xtrain = data.loc["2006":"2010"] ytrain = xtrain.pop("consumption") xtest = data.loc["2010":] ytest = xtest.pop("consumption") print("[Train Test split] : DONE") return xtrain, ytrain, xtest, ytest
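A condensed illustration of the hourly aggregation used in `get_data`, with fabricated minute-level data; note the snippet above also passes `base=0`, a pandas-era keyword that newer pandas spells `offset`:

```python
import numpy as np
import pandas as pd

# Fabricated minute-level consumption plus a boolean appliance flag.
idx = pd.date_range("2020-01-01 00:00", periods=120, freq="T")
df = pd.DataFrame({"consumption": np.ones(120), "Set1": idx.minute == 0},
                  index=idx)

hourly = df.groupby(pd.Grouper(freq="1h", label="right")).agg({
    "consumption": lambda x: np.sum(x) / 60,  # hourly total divided by 60, as in get_data
    "Set1": "any",                            # was the appliance ever on during the hour?
})
print(hourly * 1)  # multiplying by 1 casts the booleans to 0/1, as in get_data
```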
def CalSimI(r, MultiSiteDict, Setting, Stat, Wth_gen):
    # MultiSiteDict = MultiSiteDict.copy()
    Var = Setting["Var"] + ["P_Occurrence"]  # Add precip event variable
    rSimYear = Setting["MultiSite"]["rSimYear"]
    Stns = Setting["StnID"]
    plot = Setting["Plot"]["Multi_ECDFFittingPlot"]
    DayInMonth = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    SpatialRnNum = {}

    # Simulate spatially correlated RN
    for v in Var:
        # Gen 40 years so the sample size is greater than 1000 for each month,
        # which we consider statistically robust
        Rn = 0
        for y in range(rSimYear):
            for m in range(12):
                day_in_month = DayInMonth[m]
                W = MultiSiteDict["Weight"][v][m + 1]
                # Gen Rn
                if v in ("PP01", "P_Occurrence"):
                    rn = GenMultiRN(r, W, Type="P", Size=day_in_month,
                                    TransformFunc=ECDFFitting(r, W, plot),
                                    HisGen=True)
                else:
                    rn = GenMultiRN(r, W, Type="T", Size=day_in_month,
                                    TransformFunc=Standardization(r, W),
                                    HisGen=True)
                # Add up
                if type(Rn) is int:
                    Rn = rn
                else:
                    Rn = np.concatenate((Rn, rn), axis=1)
        SpatialRnNum[v] = Rn
    MultiSiteDict["SpatialRnNum"] = SpatialRnNum

    # Re-organize the Rn to each stn
    for i, s in enumerate(Stns):
        RnNum = pd.DataFrame()
        for v in Var:
            RnNum[v] = SpatialRnNum[v][i]
        Stat[s]["RnNum"] = RnNum

    # Create the "Setting" dictionary for I-r curve simulation. We need to turn
    # off the leap year option to make sure the output can be iterated.
    Setting_Multi = Setting.copy()
    Setting_Multi["GenYear"] = rSimYear
    Setting_Multi["LeapYear"] = False
    Setting_Multi["Condition"] = True
    for k in list(Setting_Multi["Plot"].keys()):  # Turn off all plotting options
        Setting_Multi["Plot"][k] = False

    # Generate weather data and re-calculate the spatial autocorrelation index.
    # Use a single core here since we already distribute r into different cores/threads.
    Wth_gen, Stat = Generate(Wth_gen, Setting_Multi, Stat, Export=False,
                             ParalCores=1)
    SimI = HisI(MultiSiteDict, Setting, Wth_gen, ForGenWth=False)["HisI"]

    # Calculate monthly mean for establishing the I-r curve
    rng = pd.date_range(pd.Timestamp(2013, 1, 1), pd.Timestamp(2013, 12, 31))
    SimI.index = rng
    SimI = SimI.groupby(pd.Grouper(freq='M')).mean()
    SimI = SimI.reset_index().drop("index", axis=1)
    SimI.index = np.arange(1, 13)
    return SimI
def get_ping_count(conn, year, month, day):
    query = ('select * from testresults_pingresult where time between '
             '"{year}-{month:02}-{day:02} 00:00:00" and "{year}-{month:02}-{day:02} 23:59:59"')
    print('getting {}-{:02}-{:02}'.format(year, month, day))
    df = pd.read_sql_query(query.format(year=year, month=month, day=day), conn)
    df.loc[:, 'time'] = pd.to_datetime(df.loc[:, 'time'])
    df2 = (df.loc[:, ['id', 'nanopi_id', 'state', 'time']]
             .groupby(['nanopi_id', 'state', pd.Grouper(freq='1H', key='time')])
             .count()
             .unstack(level=1)
             .loc[:, 'id']
             .fillna(value=0))
    return df2
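The unstack step above pivots the `state` level into columns; a tiny self-contained version of the same chain with invented ping rows (no database needed):

```python
import pandas as pd

# Invented ping results: one row per ping, with a state label.
df = pd.DataFrame({
    "id": range(6),
    "nanopi_id": [1, 1, 1, 2, 2, 2],
    "state": ["up", "up", "down", "up", "down", "down"],
    "time": pd.to_datetime(["2020-01-01 00:05"] * 3 + ["2020-01-01 01:10"] * 3),
})

counts = (df.groupby(["nanopi_id", "state", pd.Grouper(freq="1H", key="time")])
            .count()
            .unstack(level=1)   # one column per state
            .loc[:, "id"]       # keep the id-count block only
            .fillna(value=0))
print(counts)
```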
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from datetime import datetime
from bokeh.palettes import Spectral3  # @UnresolvedImport
from bokeh.models import BoxAnnotation

output_file('eto_operations.html')

df = pd.read_csv('thor_wwii.csv')

# filter for the European Theater of Operations
filter = df['THEATER'] == 'ETO'
df = df[filter]

df['MSNDATE'] = pd.to_datetime(df['MSNDATE'], format='%m/%d/%Y')
group = df.groupby(pd.Grouper(key='MSNDATE', freq='M'))[
    ['TOTAL_TONS', 'TONS_IC', 'TONS_FRAG']].sum()
group = group / 1000

source = ColumnDataSource(group)

p = figure(x_axis_type="datetime")
p.line(x='MSNDATE', y='TOTAL_TONS', line_width=2, source=source,
       legend='All Munitions')
p.line(x='MSNDATE', y='TONS_FRAG', line_width=2, source=source,
       color=Spectral3[1], legend='Fragmentation')
p.line(x='MSNDATE', y='TONS_IC', line_width=2, source=source,
       color=Spectral3[2], legend='Incendiary')

p.title.text = 'European Theater of Operations'
p.yaxis.axis_label = 'Kilotons of Munitions Dropped'
p.legend.location = 'top_left'
data_show = [] weekend7 = pd.date_range(start='2015-09-05', end='2015-10-31', freq='7D') weekend6 = pd.date_range(start='2015-09-06', end='2015-10-31', freq='7D') low = 0 up = 0 avg_number = 0 median_number = 0 # plt.figure() lr_predict_data = weather_data[len(weather_data) - 61:] del lr_predict_data['leasetime'] for j, i in enumerate(sample_shedid.values, start=1): if i in shedid.values: each_data = data[data.SHEDID == i] each_data['show'] = 1 each_data = each_data[['leasetime', 'show']].set_index('leasetime') each_data = each_data.groupby(pd.Grouper(freq='1D')).sum().fillna(0) data_show.append([i, len(each_data)]) if len(each_data) < (2 * 7): data_merge['time'] = pd.date_range(start='2015-09-01', periods=61, freq='D') data_merge['SHEDID'] = i data_merge['LEASE'] = each_data.values.mean() * 1.25 data_merge.LEASE = data_merge[['time', 'LEASE']].apply( lambda x: 1 * x.LEASE if x.time in weekend6 or x.time in weekend7 else x.LEASE, axis=1) data_merge['time'] = data_merge.apply( lambda x: str(x['time'].year) + str('/') + str(x[ 'time'].month) + str('/') + str(x['time'].day), axis=1)