def preprocessing_walmart(dataset):
    # Second preprocessing function: prepare the Walmart sales dataset.
    dataset['Day'] = pd.to_datetime(dataset['Date'])
    # pdpipe pipeline: derive week-of-month and month from the date, then drop the raw date columns.
    panda_pipe = pdp.ApplyByCols('Day', lambda x: (x.day // 7) + 1, 'Week_no', drop=False)  # week of the month
    panda_pipe += pdp.ApplyByCols('Day', lambda x: x.month, 'month', drop=False)  # month of the year
    panda_pipe += pdp.ColDrop(['Date', 'Day'])
    dataset = panda_pipe(dataset)
    # Lag features on weekly sales.
    dataset['Lag2'] = dataset['Weekly_Sales'].shift(2)
    dataset['Lag3'] = dataset['Weekly_Sales'].shift(3)
    dataset['Lag4'] = dataset['Weekly_Sales'].shift(4)
    dataset['Lag5'] = dataset['Weekly_Sales'].shift(5)
    dataset['Lag6'] = dataset['Weekly_Sales'].shift(6)
    to_be_predicted = dataset['Weekly_Sales']
    dataset = dataset.drop(columns=['Weekly_Sales'])
    X_train, X_test, Y_train, Y_test = train_test_split(
        dataset, to_be_predicted, random_state=42, test_size=0.3)
    return (X_train, Y_train, X_test)
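# A usage sketch for preprocessing_walmart() above. The CSV path is a
# placeholder, and the assumption that the file carries 'Date' and
# 'Weekly_Sales' columns is based on how the function reads the frame;
# the imports are the ones the function body relies on.
import pandas as pd
import pdpipe as pdp
from sklearn.model_selection import train_test_split

walmart_df = pd.read_csv('walmart_sales.csv')  # hypothetical input file
X_train, Y_train, X_test = preprocessing_walmart(walmart_df)
print(X_train.shape, X_test.shape)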
def drop_host_location():
    start_time = time.time()
    result = pdp.ColDrop('host_location')
    time_elapsed = time.time() - start_time
    print("drop_host_location:", time_elapsed)
    return result
def data_pipeline(self, df):
    pipeline = pdp.ColDrop(self.drop_cols)
    pipeline += pdp.DropNa()
    df = pipeline(df)
    df = self.encoder(df)
    df = self.scaler(df)
    return df
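# A minimal sketch of the object data_pipeline() above could belong to. The
# attribute names (drop_cols, encoder, scaler) mirror how the method uses
# them; the concrete encoder and scaler stages chosen here are assumptions.
import pdpipe as pdp

class Preprocessor:
    def __init__(self, drop_cols):
        self.drop_cols = drop_cols                  # columns to discard up front
        self.encoder = pdp.OneHotEncode()           # one-hot encode object columns
        self.scaler = pdp.Scale('StandardScaler')   # standardize numeric columns

Preprocessor.data_pipeline = data_pipeline          # attach the method defined above
# clean_df = Preprocessor(drop_cols=['id']).data_pipeline(raw_df)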
def drop_datetime_columns():
    start_time = time.time()
    time_columns = ['first_review', 'last_review', 'host_since']
    result = pdp.ColDrop(time_columns)
    time_elapsed = time.time() - start_time
    print("drop_datetime_columns:", time_elapsed)
    return result
def test_attribute_stage():
    """Testing attribute pipeline stages."""
    pipeline = pdp.ColDrop('name').Bin({'speed': [5]}, drop=True)
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline[0], ColDrop)
    assert isinstance(pipeline[1], Bin)
    df = _some_df()
    res_df = pipeline(df)
    assert 'speed' in res_df.columns
    assert 'name' not in res_df.columns
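# A plausible _some_df() fixture for the test above -- hypothetical, since the
# real helper is defined elsewhere in the test module. It only needs a 'name'
# column for ColDrop and a numeric 'speed' column for Bin({'speed': [5]}).
import pandas as pd

def _some_df():
    return pd.DataFrame(
        data=[[3, 'alice'], [7, 'bob'], [5, 'carol']],
        columns=['speed', 'name'],
    )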
def drop_calendar_updated():
    start_time = time.time()
    column_name = ['calendar_updated']
    result = pdp.ColDrop(column_name)
    time_elapsed = time.time() - start_time
    print("drop_calendar_updated:", time_elapsed)
    return result
def drop_geometrical_columns():
    start_time = time.time()
    geo_columns = ['geometry', 'latitude', 'longitude']
    result = pdp.ColDrop(geo_columns)
    time_elapsed = time.time() - start_time
    print("drop_geometrical_columns:", time_elapsed)
    return result
def drop_useless():
    start_time = time.time()
    useless_column = [
        'experiences_offered', 'has_availability', 'requires_license',
        'is_business_travel_ready'
    ]
    result = pdp.ColDrop(useless_column)
    time_elapsed = time.time() - start_time
    print("drop_useless:", time_elapsed)
    return result
def _original_code():
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")
    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        pdp.ApplyByCols("category_group",
                        lambda x: "tops" if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"], pd.to_datetime),
        pdp.ApplyToRows(lambda row: pd.Series({
            "standard_amount": row["original_price"] * row["sales"],
            "sales_discount": 0 if (row["original_price"] * row["sales"] <= 0)
            else row["sales_amount"] / (row["original_price"] * row["sales"]),
            "week": int(row["date"].strftime('%W')),
            "days_on_counter":
                (row["date"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "life_cycle":
                (row["end_of_season"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "C1": 1 if row["category_group"] == "tops" else 0,
            "C2": 1 if row["category_group"] == "other" else 0,
            "sales": 0 if row["sales"] < 0 else row["sales"],
            "passenger_flow": 0 if row["passenger_flow"] < 0 else row["passenger_flow"],
            "plus_purchase": 0 if row["plus_purchase"] < 0 else row["plus_purchase"],
        })),
        pdp.AdHocStage(lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level"),
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)
    salesdata_cumitems = salesdata[[
        "SKC", "date", "sales", "passenger_flow", "plus_purchase"
    ]].sort_values(by=["SKC", "date"]).groupby(['SKC']).cumsum()
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    salesdata["total_sales"] = salesdata_cumitems["total_sales"]
    salesdata["total_passenger_flow"] = salesdata_cumitems["total_passenger_flow"]
    salesdata["total_plus_purchase"] = salesdata_cumitems["total_plus_purchase"]
    print("consumed time(s)=", time.time() - start)
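# _original_code() assumes a module-level COLUMNS constant for pdp.Schematize.
# A plausible definition covering every column the pipeline touches (the order
# and any additional columns in the real constant are unknown):
COLUMNS = [
    'SKC', 'date', 'shelf_date', 'end_of_season', 'category_group',
    'original_price', 'sales', 'sales_amount', 'passenger_flow',
    'plus_purchase', 'activity_level',
]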
def drop_textual_columns():
    start_time = time.time()
    text_columns = [
        'name', 'summary', 'listing_url', 'space', 'description',
        'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
        'house_rules', 'picture_url', 'host_url', 'host_name',
        'host_picture_url'
    ]
    result = pdp.ColDrop(text_columns)
    time_elapsed = time.time() - start_time
    print("drop_textual_columns:", time_elapsed)
    return result
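# A sketch of chaining the drop_* stage factories above into a single pdpipe
# pipeline; 'listings.csv' is a placeholder for the Airbnb-style listings data
# these column names point to.
import pandas as pd

listings = pd.read_csv('listings.csv')
cleanup = (drop_textual_columns() + drop_datetime_columns() +
           drop_geometrical_columns() + drop_calendar_updated() +
           drop_host_location() + drop_useless())
listings = cleanup(listings)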
cnxn, cursor = sql_addresses.connect()
df_addresses = sql_addresses.executeQueryFromFile(cnxn)
geodesic(df_subwaystations[['Latitude', 'Longitude']],
         df_addresses[['Latitude', 'Longitude']])

# Cleaning pipeline: transform District and BuiltYear, drop rows with missing
# or zero-valued key fields, then one-hot encode the categoricals.
pipeline = pdp.ApplyByCols('District', District_transformation, 'District')
pipeline += pdp.ApplyByCols('BuiltYear', builtYear_transformation, 'BuiltYear')
pipeline += pdp.RowDrop({'District': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'OperatingCostInSek': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'NumberOfRooms': lambda x: x == 0})
pipeline += pdp.RowDrop({'FloorNumber': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'BuiltYear': lambda x: pd.isnull(x)})
pipeline += pdp.OneHotEncode('District')
pipeline += pdp.OneHotEncode('BuiltYear')
pipeline += pdp.ColDrop(['Address'])
df_pipeline = pipeline(df)

variables = GoMining(df_pipeline)
MiningReport(variables)

formula = ('SoldPricePerSquaredMeterInSek ~ MonthlyChargeInSek + '
           'PricePerSquaredMeterInSek + '
           'District_östermalm + '
           'BuiltYear_1950_1999 + '
           'District_kungsholmen + '
           'BuiltYear_2000 + '
           'District_södermalm + '
           'District_vasastan + '
           'FloorNumber')
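# A hedged sketch of fitting the formula above with statsmodels; whether the
# original analysis fits it inside GoMining/MiningReport or in a separate OLS
# call is not shown in this excerpt.
import statsmodels.formula.api as smf

results = smf.ols(formula, data=df_pipeline).fit()
print(results.summary())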
data = pd.read_csv('C:\\Users\\13810\\tmdb_5000_movies.csv')

# In[8]:

import pdpipe as pdp  # import the pdpipe package

# In[13]:

# Build a pdp.PdPipeline; ColDrop, ApplyByCols and RowDrop are commonly used
# pdpipe stages, each with a fixed calling convention.
first_pipeline = pdp.PdPipeline([
    pdp.ColDrop("original_title"),  # drop one or more named columns
    pdp.ApplyByCols(columns=['title'], func=lambda x: x.lower()),  # apply a function element-wise to a column
    pdp.RowDrop({'vote_average': lambda x: x <= 7,
                 'original_language': lambda x: x != 'en'}),  # drop rows matching either condition
    pdp.ApplyByCols(columns=['genres'],
                    func=lambda x: len([item['name'] for item in eval(x)]),
                    result_columns=['genres_num']),  # count genres into a new column
    pdp.RowDrop({'genres_num': lambda x: x <= 5}),
])

# In[19]:

data1 = first_pipeline(data, verbose=True).reset_index(drop=True)
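# Quick inspection of the pipeline output -- a sketch; the column names follow
# the TMDB 5000 movies dataset loaded above ('genres_num' is created by the
# pipeline, 'title' is lower-cased, 'original_title' is dropped).
print(data1[['title', 'vote_average', 'original_language', 'genres_num']].head())
print(data1.shape)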
    pdp.ValKeep([dt.date(2020, 3, 16)], columns=['tweet_date']),
])
neg_tweets_df = pipeline.apply(tweets_df).sort_values(by=['tweet_date'])
# neg_tweets_df = neg_tweets_df[neg_tweets_df['tweet_date'] == dt.date(2020,3,16)]
neg_tweets_df.head()

"""## WordCloud: Negative Tweets, Mar 16, 2020"""

# Word cloud: negative tweets, Mar 16, 2020
# convert cleaned tweet texts to lower case
neg_tweets_df['tweet'] = neg_tweets_df['tweet'].str.lower()

pipeline = pdp.PdPipeline([
    pdp.ColDrop(['sentiment', 'is_neutral', 'is_negative', 'is_positive']),
    pdp.ApplyByCols('tweet', drop_tweeet_user_name),
    pdp.ApplyByCols('tweet', clean_text),
    pdp.ApplyByCols('tweet', wordfilter),
])
neg_tweets_df = pipeline.apply(neg_tweets_df)

text_base = ' '.join(neg_tweets_df['tweet'].tolist())
wordcloud = WordCloud().generate(text_base)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Positive tweets: Mar 16, 2020