Example #1
import pandas as pd
import pdpipe as pdp
from sklearn.model_selection import train_test_split


def preprocessing_walmart(dataset):  # second preprocessing function
    dataset['Day'] = pd.to_datetime(dataset['Date'])

    # build a pdpipe pipeline:
    # convert the day of the month into the week of the month
    panda_pipe = pdp.ApplyByCols('Day', lambda x: (x.day // 7) + 1, 'Week_no', drop=False)

    # extract the month from the date
    panda_pipe += pdp.ApplyByCols('Day', lambda x: x.month, 'month', drop=False)

    panda_pipe += pdp.ColDrop(['Date', 'Day'])
    dataset = panda_pipe(dataset)
   
    dataset['Lag2'] = dataset['Weekly_Sales'].shift(2)
    dataset['Lag3'] = dataset['Weekly_Sales'].shift(3)
    dataset['Lag4'] = dataset['Weekly_Sales'].shift(4)
    dataset['Lag5'] = dataset['Weekly_Sales'].shift(5)
    dataset['Lag6'] = dataset['Weekly_Sales'].shift(6)
    

    to_be_predicted = dataset['Weekly_Sales']
    dataset = dataset.drop(columns=['Weekly_Sales'])
    X_train, X_test, Y_train, Y_test = train_test_split(
        dataset, to_be_predicted, random_state=42, test_size=0.3)

    # note: Y_test is computed but not returned, so it is lost to callers
    return X_train, Y_train, X_test
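
A quick usage sketch for this function (the file name and its 'Date'/'Weekly_Sales' columns are assumptions, not part of the original snippet):

import pandas as pd

walmart_df = pd.read_csv('walmart_sales.csv')  # hypothetical input file
X_train, Y_train, X_test = preprocessing_walmart(walmart_df)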
Example #2
def drop_host_location():
    start_time = time.time()

    result = pdp.ColDrop('host_location')

    time_elapsed = time.time() - start_time
    print("drop_textual_columns:", time_elapsed)

    return result
Example #3
    def data_pipeline(self, df):
        pipeline = pdp.ColDrop(self.drop_cols)
        pipeline += pdp.DropNa()
        df = pipeline(df)

        df = self.encoder(df)
        df = self.scaler(df)

        return df
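
This method is an excerpt; its enclosing class is not shown. A minimal sketch of a wrapper it could belong to, assuming drop_cols, encoder, and scaler are supplied at construction time (the class name and constructor are hypothetical):

import pdpipe as pdp

class Preprocessor:  # hypothetical class, not from the original source
    def __init__(self, drop_cols, encoder, scaler):
        self.drop_cols = drop_cols  # columns to remove up front
        self.encoder = encoder      # callable that encodes categorical columns
        self.scaler = scaler        # callable that scales numeric columns

    def data_pipeline(self, df):
        pipeline = pdp.ColDrop(self.drop_cols)
        pipeline += pdp.DropNa()
        df = pipeline(df)
        df = self.encoder(df)
        df = self.scaler(df)
        return df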
Example #4
def drop_datetime_columns():
    start_time = time.time()
    time_columns = ['first_review', 'last_review', 'host_since']

    result = pdp.ColDrop(time_columns)

    time_elapsed = time.time() - start_time
    print("drop_datetime_columns:", time_elapsed)

    return result
Example #5
def test_attribute_stage():
    """Testing attribute pipeline stages."""
    pipeline = pdp.ColDrop('name').Bin({'speed': [5]}, drop=True)
    assert isinstance(pipeline, Pipeline)
    assert isinstance(pipeline[0], ColDrop)
    assert isinstance(pipeline[1], Bin)
    df = _some_df()
    res_df = pipeline(df)
    assert 'speed' in res_df.columns
    assert 'name' not in res_df.columns
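
The _some_df fixture is not shown in this excerpt. A minimal stand-in consistent with the assertions (a droppable 'name' column plus a numeric 'speed' column to bin) might look like:

import pandas as pd

def _some_df():
    # hypothetical fixture: any frame with 'name' and a numeric 'speed' works
    return pd.DataFrame(
        data=[[3, 'alice'], [7, 'bob']],
        columns=['speed', 'name'],
    )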
Example #6
def drop_calendar_updated():
    start_time = time.time()

    column_name = ['calendar_updated']

    result = pdp.ColDrop(column_name)

    time_elapsed = time.time() - start_time
    print("drop_calendar_updated:", time_elapsed)

    return result
Example #7
def drop_geometrical_columns():
    start_time = time.time()

    geo_columns = ['geometry', 'latitude', 'longitude']

    result = pdp.ColDrop(geo_columns)

    time_elapsed = time.time() - start_time
    print("drop_geometrical_columns:", time_elapsed)

    return result
Example #8
def drop_useless():
    start_time = time.time()

    useless_column = [
        'experiences_offered', 'has_availability', 'requires_license',
        'is_business_travel_ready'
    ]

    result = pdp.ColDrop(useless_column)

    time_elapsed = time.time() - start_time
    print("drop_useless:", time_elapsed)

    return result
Example #9
def _original_code():
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")
    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        pdp.ApplyByCols("category_group", lambda x: "tops"
                        if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"],
                        pd.to_datetime),
        pdp.ApplyToRows(lambda row: pd.Series({
            "standard_amount": row["original_price"] * row["sales"],
            "sales_discount": (
                0 if row["original_price"] * row["sales"] <= 0
                else row["sales_amount"] / (row["original_price"] * row["sales"])
            ),
            "week": int(row["date"].strftime('%W')),
            "days_on_counter":
                (row["date"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "life_cycle":
                (row["end_of_season"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "C1": 1 if row["category_group"] == "tops" else 0,
            "C2": 1 if row["category_group"] == "other" else 0,
            "sales": 0 if row["sales"] < 0 else row["sales"],
            "passenger_flow": 0 if row["passenger_flow"] < 0 else row["passenger_flow"],
            "plus_purchase": 0 if row["plus_purchase"] < 0 else row["plus_purchase"],
        })),
        pdp.AdHocStage(
            lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level")
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)

    salesdata_cumitems = salesdata[[
        "SKC", "date", "sales", "passenger_flow", "plus_purchase"
    ]].sort_values(by=["SKC", "date"]).groupby(['SKC']).cumsum()
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    salesdata["total_sales"] = salesdata_cumitems["total_sales"]
    salesdata["total_passenger_flow"] = salesdata_cumitems[
        "total_passenger_flow"]
    salesdata["total_plus_purchase"] = salesdata_cumitems[
        "total_plus_purchase"]
    print("consumed time(s)=", time.time() - start)
Example #10
def drop_textual_columns():
    start_time = time.time()
    text_columns = [
        'name', 'summary', 'listing_url', 'space', 'description',
        'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
        'house_rules', 'picture_url', 'host_url', 'host_name',
        'host_picture_url'
    ]

    result = pdp.ColDrop(text_columns)

    time_elapsed = time.time() - start_time
    print("drop_textual_columns:", time_elapsed)

    return result
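
Examples #2, #4, #6, #7, #8 and #10 are factory functions that each return a single pdpipe stage. Since pdpipe stages compose with +, one plausible way to assemble them into a full cleaning pipeline (a sketch; listings_df is a hypothetical Airbnb listings DataFrame, not defined in the snippets) is:

pipeline = (drop_textual_columns()
            + drop_datetime_columns()
            + drop_geometrical_columns()
            + drop_calendar_updated()
            + drop_useless()
            + drop_host_location())
clean_df = pipeline(listings_df)  # listings_df is assumed, not defined above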
Example #11
cnxn, cursor = sql_addresses.connect()
df_addresses = sql_addresses.executeQueryFromFile(cnxn)

geodesic(df_subwaystations[['Latitude', 'Longitude']],
         df_addresses[['Latitude', 'Longitude']])

pipeline = pdp.ApplyByCols('District', District_transformation, 'District')
pipeline += pdp.ApplyByCols('BuiltYear', builtYear_transformation, 'BuiltYear')
pipeline += pdp.RowDrop({'District': lambda x: x is None})
pipeline += pdp.RowDrop({'OperatingCostInSek': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'NumberOfRooms': lambda x: x == 0})
pipeline += pdp.RowDrop({'FloorNumber': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'BuiltYear': lambda x: pd.isnull(x)})
pipeline += pdp.OneHotEncode('District')
pipeline += pdp.OneHotEncode('BuiltYear')
pipeline += pdp.ColDrop(['Address'])

df_pipeline = pipeline(df)
variables = GoMining(df_pipeline)
MiningReport(variables)

formula = 'SoldPricePerSquaredMeterInSek ~ MonthlyChargeInSek + \
            PricePerSquaredMeterInSek + \
            District_östermalm + \
            BuiltYear_1950_1999 + \
            District_kungsholmen + \
            BuiltYear_2000 + \
            District_södermalm + \
            District_vasastan + \
            FloorNumber'
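
The formula string uses patsy syntax; the snippet ends before it is used. A minimal sketch of how such a formula is typically passed to a model (statsmodels is an assumption, not shown in the original):

import statsmodels.formula.api as smf

# assumed usage: fit an OLS regression on the pipeline output
model = smf.ols(formula=formula, data=df_pipeline).fit()
print(model.summary())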
Example #12
data = pd.read_csv('C:\\Users\\13810\\tmdb_5000_movies.csv')




import pdpipe as pdp  # import the pdpipe package




# Build a pdp.PdPipeline; pdp.ColDrop, pdp.ApplyByCols and pdp.RowDrop are commonly used pdpipe APIs, each with a fixed call pattern
first_pipeline = pdp.PdPipeline(
[pdp.ColDrop("original_title"),#ColDrop用于对指定单个或多个列进行丢弃
 pdp.ApplyByCols(columns=['title'], func=lambda x: x.lower()),#对每一列进行应用
 pdp.RowDrop({'vote_average': lambda x: x <= 7, 'original_language': lambda x: x != 'en'}),#对行进行删除
 pdp.ApplyByCols(columns=['genres'], func=lambda x: [item['name'] for item in eval(x)].__len__(), result_columns=['genres_num']),
 pdp.RowDrop({'genres_num': lambda x: x <= 5})]
)




data1 = first_pipeline(data, verbose=True).reset_index(drop=True)

Example #13
    pdp.ValKeep([dt.date(2020,3,16)], columns=['tweet_date']),
])

neg_tweets_df = pipeline.apply(tweets_df).sort_values(by=['tweet_date'])
# neg_tweets_df = neg_tweets_df[neg_tweets_df['tweet_date'] == dt.date(2020,3,16) ]  
neg_tweets_df.head()

"""## WordCloud: Negative Tweets, Mar 16, 2020"""

# Word cloud: negative tweets, Mar 16, 2020

# convert cleaned tweet texts to lower case
neg_tweets_df['tweet'] = neg_tweets_df['tweet'].str.lower()

pipeline = pdp.PdPipeline([
    pdp.ColDrop(['sentiment', 'is_neutral', 'is_negative', 'is_positive']),
    pdp.ApplyByCols('tweet', drop_tweeet_user_name),
    pdp.ApplyByCols('tweet', clean_text),
    pdp.ApplyByCols('tweet', wordfilter)
])

neg_tweets_df = pipeline.apply(neg_tweets_df)

text_base = ' '.join(neg_tweets_df['tweet'].tolist())
wordcloud = WordCloud().generate(text_base)
plt.imshow(wordcloud)
plt.axis("off")

plt.show()

# Positive tweets: Mar 16, 2020