Example #1
import pandas as pd
import pdpipe as pdp
from sklearn.model_selection import train_test_split


def preprocessing_walmart(dataset):
    # Parse the raw date string into a proper datetime column.
    dataset['Day'] = pd.to_datetime(dataset['Date'])

    # Build a pdpipe pipeline:
    # derive the week of the month (1-5) from the day of the month,
    panda_pipe = pdp.ApplyByCols('Day', lambda x: (x.day // 7) + 1, 'Week_no', drop=False)
    # extract the month from the date,
    panda_pipe += pdp.ApplyByCols('Day', lambda x: x.month, 'month', drop=False)
    # and drop the original date columns.
    panda_pipe += pdp.ColDrop(['Date', 'Day'])
    dataset = panda_pipe(dataset)
   
    # Lagged target features: weekly sales from 2-6 periods back.
    dataset['Lag2'] = dataset['Weekly_Sales'].shift(2)
    dataset['Lag3'] = dataset['Weekly_Sales'].shift(3)
    dataset['Lag4'] = dataset['Weekly_Sales'].shift(4)
    dataset['Lag5'] = dataset['Weekly_Sales'].shift(5)
    dataset['Lag6'] = dataset['Weekly_Sales'].shift(6)
    

    to_be_predicted = dataset['Weekly_Sales']
    dataset = dataset.drop(columns=['Weekly_Sales'])
    X_train, X_test, Y_train, Y_test = train_test_split(
        dataset, to_be_predicted, random_state=42, test_size=0.3)

    # Note: Y_test is discarded; only the training target is returned.
    return (X_train, Y_train, X_test)
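A minimal usage sketch for the function above (the toy frame and its values are made up; only the Date and Weekly_Sales column names are taken from the function):

sample = pd.DataFrame({
    'Date': pd.date_range('2012-02-03', periods=12, freq='W').astype(str),
    'Weekly_Sales': [float(i * 100) for i in range(12)],
})
# Returns the train split, its target, and the test features.
X_train, Y_train, X_test = preprocessing_walmart(sample)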
Example #2
def main():
    print("Qual modelo quer treinar?")
    value = input(
        "1: Regressão Logistica, 2: Multinomial Naive Bayes, 3: CNN, digite o numero correspondente ao modelo:"
    )

    # load and merge the train/test datasets
    train, test = common_modules.merge_files()

    if value == '1':
        pipeline = pdp.ApplyByCols("texto",
                                   common_modules.tag_remove,
                                   "clean_texto",
                                   drop=False)
        pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto)
        train = pipeline(train)
        test = pipeline(test)
        print("Treinando modelo de regressão logistica")
        regressao_logistica.train_model(train.texto, train.label, test.texto,
                                        test.label)
    elif value == '2':
        pipeline = pdp.ApplyByCols("texto",
                                   common_modules.tag_remove,
                                   "clean_texto",
                                   drop=False)
        pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto)
        train = pipeline(train)
        test = pipeline(test)
        print("treinando modelo de Multinomial Naive Bayes")
        naive_bayes.train_model(train.texto, train.label, test.texto,
                                test.label)
    elif value == '3':
        import CNN
        print("treinando modelo de CNN")
        CNN.train_model(train, test)
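Examples #2, #4 and #10 all rely on common_modules.tag_remove and common_modules.trat_texto, which are not shown on this page. A plausible sketch of such helpers, assuming tag_remove strips HTML tags and trat_texto normalizes text (the regexes and steps below are guesses, not the project's actual code):

import re
import unicodedata

def tag_remove(text):
    # Assumed behavior: strip HTML/XML tags from a single text cell.
    return re.sub(r'<[^>]+>', ' ', text)

def trat_texto(text):
    # Assumed behavior: lowercase, drop accents, collapse whitespace.
    text = unicodedata.normalize('NFKD', text.lower())
    text = text.encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'\s+', ' ', text).strip()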
Example #3
def make_conversion_pipe():
    """
    Create the pdpipe pipeline that transforms the UN Geoscheme DataFrame.
    """
    pipeline = pdp.ColRename({i: str(i) for i in range(0, 5)})
    pipeline += pdp.ApplyByCols(
        ['country/region', 'numeric', '0', '1', '2', '3', '4'],
        func=replace_new_lines)
    pipeline += pdp.ApplyByCols(['numeric', '0', '1', '2', '3', '4'],
                                func=clean_out_world)
    pipeline += pdp.DropNa(axis=1, how='all')
    return pipeline
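A usage sketch for this factory. replace_new_lines and clean_out_world are defined elsewhere in that project; the stand-ins below only illustrate plausible behavior (newline removal, and treating the 'World' level as missing):

import pandas as pd
import pdpipe as pdp

def replace_new_lines(x):
    # Assumed helper: collapse embedded newlines in string cells.
    return x.replace('\n', ' ') if isinstance(x, str) else x

def clean_out_world(x):
    # Assumed helper: treat the all-encompassing 'World' level as missing.
    return None if x == 'World' else x

raw = pd.DataFrame(
    [['Algeria\n', '012', 'World', 'Africa', 'Northern Africa', None, None]],
    columns=['country/region', 'numeric'] + list(range(5)))
clean = make_conversion_pipe()(raw)  # renames 0-4 to '0'-'4', cleans, drops all-empty columns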
Example #4
def main():
    train, test = common_modules.merge_files()

    pipeline = pdp.ApplyByCols("texto",
                               common_modules.tag_remove,
                               "clean_texto",
                               drop=False)
    pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto)
    train = pipeline(train)
    test = pipeline(test)
    reg_log(train.texto, train.label, test.texto, test.label)
    naive_bayes(train.texto, train.label, test.texto, test.label)
Example #5
def _original_code():
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")
    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        pdp.ApplyByCols("category_group", lambda x: "tops"
                        if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"],
                        pd.to_datetime),
        pdp.ApplyToRows(lambda row: pd.Series({
            "standard_amount":
            row["original_price"] * row["sales"],
            "sales_discount":
            0 if (row["original_price"] * row["sales"] <= 0) else row[
                "sales_amount"] / ((row["original_price"] * row["sales"])),
            "week":
            int(row["date"].strftime('%W')),
            "days_on_counter":
            (row["date"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "life_cycle": (row["end_of_season"] - row["shelf_date"]) /
            (np.timedelta64(1, 'D')),
            "C1":
            1 if row["category_group"] == "tops" else 0,
            "C2":
            1 if row["category_group"] == "other" else 0,
            "sales":
            0 if row["sales"] < 0 else row["sales"],
            "passenger_flow":
            0 if row["passenger_flow"] < 0 else (row["passenger_flow"]),
            "plus_purchase":
            0 if row["plus_purchase"] < 0 else (row["plus_purchase"]),
        })),
        pdp.AdHocStage(
            lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level")
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)

    salesdata_cumitems = salesdata[[
        "SKC", "date", "sales", "passenger_flow", "plus_purchase"
    ]].sort_values(by=["SKC", "date"]).groupby(['SKC']).cumsum()
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    salesdata["total_sales"] = salesdata_cumitems["total_sales"]
    salesdata["total_passenger_flow"] = salesdata_cumitems[
        "total_passenger_flow"]
    salesdata["total_plus_purchase"] = salesdata_cumitems[
        "total_plus_purchase"]
    print("consumed time(s)=", time.time() - start)
Example #6
def uniformize_boolean(columns):
    start_time = time.time()
    true_strings = ['t', 'true', 'yes', 'y', True]
    false_strings = ['f', 'false', 'n', 'no', False]

    func_true = lambda x: 1.0 if x in true_strings else x
    func_false = lambda x: 0.0 if x in false_strings else x

    result = pdp.ApplyByCols(columns, func_true) + pdp.ApplyByCols(
        columns, func_false)

    time_elapsed = time.time() - start_time
    print("uniformize_boolean:", time_elapsed)

    return result
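A quick check of the stage pair returned above (the flag column and its values are made up):

import pandas as pd

df = pd.DataFrame({'flag': ['t', 'no', True, 'false', 'y']})
print(uniformize_boolean(['flag'])(df))
# flag becomes 1.0, 0.0, 1.0, 0.0, 1.0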
Example #7
def check_sk_pipeline():
    pline = pdp.make_pdpipeline(
        pdp.ApplyByCols("ph", lambda x: x - 1),
        # pdp.Bin({"ph": [0, 3, 5, 12]}),
        pdp.Encode(["type", "lbl"]),
    )
    print(pline)

    model_pline = make_pipeline(
        pdp.FreqDrop(2, "lbl"),
        LogisticRegression(),
    )
    print(model_pline)

    train = _train_df()
    res_train = pline(train)
    print("Processed train set: {}".format(res_train))
    x_train, y_train = x_y_by_col_lbl(res_train, "lbl")
    model_pline = model_pline.fit(x_train, y_train)
    print("Fitted model pipeline: {}".format(model_pline))

    test = _test_df()
    res_test = pline(test)
    print("Processed test set: {}".format(res_test))
    x_test, y_test = x_y_by_col_lbl(res_test, "lbl")
    predictions = model_pline.predict(x_test)
    print("predictions: {}".format(predictions))
Example #8
def uniformize_missing(columns):
    start_time = time.time()
    missing_values_strings = ['NaN', '??', '*', 'UNK', '-', '###']
    func = lambda x: np.nan if x in missing_values_strings else x

    result = pdp.ApplyByCols(columns, func)

    time_elapsed = time.time() - start_time
    print("uniformize_missing:", time_elapsed)

    return result
Example #9
def uniformize_percentage():
    start_time = time.time()

    percentage_columns = ['host_response_rate']

    func = lambda x: float(x[:-1]) if isinstance(x, str) else x

    result = pdp.ApplyByCols(percentage_columns, func)

    time_elapsed = time.time() - start_time
    print("uniformize_percentage:", time_elapsed)

    return result
Example #10
def main():
    train, test = common_modules.merge_files()

    pipeline = pdp.ApplyByCols("texto",
                               common_modules.tag_remove,
                               "clean_texto",
                               drop=False)
    pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto)
    train = pipeline(train)

    print("Fazendo gridSearch da Regressão logistica")
    score_log_reg, param_log_reg = reg_log(train.texto, train.label)
    print(
        f"Regressão linear best score {score_log_reg}, com parâmetros {param_log_reg} "
    )
    print("Fazendo gridSearch da SVM")
    score_svm, param_svm = sup_vec(train.texto, train.label)
    print(f"suport vector best score {score_svm}, com parâmetros {param_svm} ")
    print("Fazendo gridSearch da Naive bayes")
    score_nb, param_nb = naive_bayes(train.texto, train.label)
    print(
        f"Multinomial Naive bayes best score {score_nb}, com parâmetros {param_nb} "
    )
Example #11
def uniformize_monetary():
    start_time = time.time()

    monetary_columns = [
        'price', 'weekly_price', 'monthly_price', 'security_deposit',
        'cleaning_fee', 'extra_people'
    ]

    func = lambda x: float(x[1:].replace(',', '')) if isinstance(x, str) else x

    result = pdp.ApplyByCols(monetary_columns, func)

    time_elapsed = time.time() - start_time
    print("uniformize_monetary:", time_elapsed)

    return result
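The stages returned by these uniformize_* helpers compose with +, so a combined cleanup pass over an Airbnb-style listings frame might look like this sketch (the single toy row is made up; note that the monetary stage expects all six of its columns to be present):

import pandas as pd

listings = pd.DataFrame({
    'host_response_rate': ['98%'],
    'price': ['$1,250.00'],
    'weekly_price': ['$7,000.00'],
    'monthly_price': ['$24,000.00'],
    'security_deposit': ['$500.00'],
    'cleaning_fee': ['$85.00'],
    'extra_people': ['$20.00'],
})
cleanup = uniformize_percentage() + uniformize_monetary()
print(cleanup(listings))  # every cell becomes a plain float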
Example #12
cnxn, cursor = sql_data.connect()
df = sql_data.executeQueryFromFile(cnxn)

sql_subwaystations = bsql.SQL(sql_file_name_coordinates_subwaystations,
                              sql_file_path)
cnxn, cursor = sql_subwaystations.connect()
df_subwaystations = sql_subwaystations.executeQueryFromFile(cnxn)

sql_addresses = bsql.SQL(sql_file_name_coordinates_addresses, sql_file_path)
cnxn, cursor = sql_addresses.connect()
df_addresses = sql_addresses.executeQueryFromFile(cnxn)

geodesic(df_subwaystations[['Latitude', 'Longitude']],
         df_addresses[['Latitude', 'Longitude']])

pipeline = pdp.ApplyByCols('District', District_transformation, 'District')
pipeline += pdp.ApplyByCols('BuiltYear', builtYear_transformation, 'BuiltYear')
pipeline += pdp.RowDrop({'District': lambda x: x is None})
pipeline += pdp.RowDrop({'OperatingCostInSek': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'NumberOfRooms': lambda x: x == 0})
pipeline += pdp.RowDrop({'FloorNumber': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'BuiltYear': lambda x: pd.isnull(x)})
pipeline += pdp.OneHotEncode('District')
pipeline += pdp.OneHotEncode('BuiltYear')
pipeline += pdp.ColDrop(['Address'])

df_pipeline = pipeline(df)
variables = GoMining(df_pipeline)
MiningReport(variables)

formula = 'SoldPricePerSquaredMeterInSek ~ MonthlyChargeInSek + \
Example #13
data = pd.read_csv('C:\\Users\\13810\\tmdb_5000_movies.csv')


# In[8]:


import pdpipe as pdp  # import the pdpipe package


# In[13]:


# Create a pdp.PdPipeline; pdp.ColDrop, pdp.ApplyByCols and pdp.RowDrop are
# commonly used pdpipe stages, each with a fixed calling convention.
first_pipeline = pdp.PdPipeline([
    pdp.ColDrop("original_title"),  # drop one or more named columns
    pdp.ApplyByCols(columns=['title'], func=lambda x: x.lower()),  # apply the function to every cell of each listed column
    pdp.RowDrop({'vote_average': lambda x: x <= 7,
                 'original_language': lambda x: x != 'en'}),  # drop rows matching any condition
    pdp.ApplyByCols(columns=['genres'],
                    func=lambda x: len([item['name'] for item in eval(x)]),
                    result_columns=['genres_num']),
    pdp.RowDrop({'genres_num': lambda x: x <= 5}),
])


# In[19]:


data1 = first_pipeline(data, verbose=True).reset_index(drop=True)
Example #14
!gsutil -m cp -r gs://{bucket_name}/* /content/drive/My\ Drive/CoronaTweets/

base_db_folder = '/content/drive/My Drive/CoronaTweets'
tweet_db_paths = [
    # incomplete data - '/corona_tweets_1M.db/corona_tweets_1M.db',   # 27.02.2020 10:36 01.03.2020 18:24 1578957
    # malformed - '/corona_tweets_2M_2/corona_tweets_2M_2.db',  # 02.03.2020 17:27	07.03.2020 4:57	2268665
    '/corona_tweets_3M/tweets.db',  # 07.03.2020 5:06	14.03.2020 4:46	7472368
    '/corona_tweets_1M/tweets.db',  # 14.03.2020 5:23	15.03.2020 3:16	1903768
    '/corona_tweets_2M_3/tweets.db',  # 15.03.2020 3:28	16.03.2020 4:31	2081576
    '/corona_tweets_1M_2/tweets.db',  # 16.03.2020 4:38	17.03.2020 3:08	1889781
    '/corona_tweets_2L/tweets.db'  # 17.03.2020 3:12	17.03.2020 6:10	280304
]

# Derive boolean sentiment flags while keeping the original sentiment column.
pipeline = pdp.PdPipeline([
    pdp.ColRename({'unix': 'tweet_date'}),
    pdp.ApplyByCols('sentiment', is_positive, 'is_positive', drop=False),
    pdp.ApplyByCols('sentiment', is_negative, 'is_negative', drop=False),
    pdp.ApplyByCols('sentiment', is_neutral, 'is_neutral', drop=False),
])

tweets_df = pd.DataFrame()

for tweets_db in tweet_db_paths:
    full_tweet_db_path = base_db_folder + tweets_db
    print(dt.datetime.now(), "Processing started: ", full_tweet_db_path)
    conn = sqlite3.connect(full_tweet_db_path)
    c = conn.cursor()
    df_pie = pd.read_sql("SELECT * FROM sentiment", conn)
    df_pie['unix'] = pd.to_datetime(df_pie['unix'], unit='ms').dt.date  # cast to date
    df = pipeline.apply(df_pie).sort_values(by=['tweet_date'])
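is_positive, is_negative and is_neutral come from earlier notebook cells. For a compound sentiment score in [-1, 1] they would plausibly be simple thresholds like this sketch (the 0.05 cutoff is an assumption, not the notebook's actual value):

def is_positive(score):
    return 1 if score > 0.05 else 0

def is_negative(score):
    return 1 if score < -0.05 else 0

def is_neutral(score):
    return 1 if -0.05 <= score <= 0.05 else 0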