def run_tests(IC_DATA_DIR):
    """Smoke-test ``MainFeatureModel`` on the Instacart data in ``IC_DATA_DIR``.

    The expected shapes below are hard-coded for the testing data directory.

    Args:
        IC_DATA_DIR: directory handed to ``instacart_process`` as ``data_dir``.

    Raises:
        AssertionError: if any observed shape differs from the expected one.
    """
    from lib.process_data import instacart_process
    from lib.data_class import DataSet

    # fixed seed keeps repeated runs consistent
    np.random.seed(42)

    # read the instacart csv files and wrap them in a DataSet
    orders, products = instacart_process(data_dir=IC_DATA_DIR)
    dataset = DataSet(order_df=orders, product_df=products)

    # sanity-check the raw dataframes
    assert dataset.order_df.shape == (31032, 4)
    assert dataset.product_df.shape == (6126, 5)

    # each case: keyword arguments for transform() -> expected output shape;
    # a fresh feature model is built for every case
    transform_cases = (
        ({}, (12214, 8)),
        ({"return_df": True, "drop_categorical": False}, (12214, 10)),
    )
    for transform_kwargs, expected_shape in transform_cases:
        features = MainFeatureModel().transform(dataset, **transform_kwargs)
        assert features.shape == expected_shape

    log.info("feature_models tests passed!")
def run_tests(IC_DATA_DIR):
    """Smoke-test ``DataSet`` on the Instacart data in ``IC_DATA_DIR``.

    Covers dataset construction, the user/product id attributes, the
    train-test split, the adversarial copy, the prior/last order split and
    the user-product matrix. Expected sizes are hard-coded for the testing
    data directory.

    Args:
        IC_DATA_DIR: directory handed to ``instacart_process`` as ``data_dir``.

    Raises:
        AssertionError: if any check fails.
    """
    from lib.process_data import instacart_process

    # fixed seed keeps repeated runs consistent
    np.random.seed(42)

    # expected sizes for the testing directory
    n_users = 206
    n_products = 6126
    n_user_prod_pairs = 12214
    full_order_shape = (31032, 4)
    product_shape = (n_products, 5)

    # read the instacart csv files and wrap them in a DataSet
    orders, products = instacart_process(data_dir=IC_DATA_DIR)
    full = DataSet(order_df=orders, product_df=products)

    # dataframes created correctly
    assert full.order_df.shape == full_order_shape
    assert full.product_df.shape == product_shape

    # user and product ids created correctly
    assert full.user_ids.shape == (n_users, )
    assert len(full.user_idx) == n_users
    assert full.prod_ids.shape == (n_products, )
    assert len(full.prod_idx) == n_products

    # train-test split
    train_split, test_split = full.train_test_split()
    assert train_split.order_df.shape == (24939, 4)
    assert train_split.product_df.shape == product_shape
    assert test_split.order_df.shape == (6093, 4)
    assert test_split.product_df.shape == product_shape

    # both splits must inherit prod_ids from the full dataset
    for split in (train_split, test_split):
        assert (split.prod_ids == full.prod_ids).all()

    # adversarial dataset
    adversarial = full.make_adversarial()
    assert adversarial.order_df.shape == full_order_shape
    assert adversarial.product_df.shape == product_shape
    assert (adversarial.user_ids == full.user_ids).all()

    # number of places old and new dfs differ, should be number of users
    # (unless a product is swapped for itself; very unlikely)
    changed = (full.order_df.product_id.values
               != adversarial.order_df.product_id.values)
    assert changed.sum() == n_users

    # prior-last order split
    assert full.prior_order_df.shape == (28906, 4)
    assert full.prior_user_prod.shape == (n_user_prod_pairs, 2)
    assert full.labels.shape == (n_user_prod_pairs, )
    assert full.size == n_user_prod_pairs

    # user-product matrix
    assert full.user_prod_matrix.shape == (n_users, n_products)

    log.info("data_class tests passed!")
def run_tests(IC_DATA_DIR):
    """Smoke-test every baseline model on the Instacart data in ``IC_DATA_DIR``.

    Each baseline model is fitted on the train split and asked to predict on
    the test split; the only property checked is that ``model.preds`` has the
    same shape as ``test_dataset.labels``. Expected dataframe shapes are
    hard-coded for the testing data directory.

    Args:
        IC_DATA_DIR: directory handed to ``instacart_process`` as ``data_dir``.

    Raises:
        AssertionError: if a dataframe shape is unexpected or a model's
            prediction shape does not match the label shape (the message
            names the offending model).
    """
    from lib.process_data import instacart_process
    from lib.data_class import DataSet

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert test_dataset.order_df.shape == (6093, 4)

    # Same fit -> predict -> shape check for every baseline, in the original
    # order (the order matters because the models share the seeded RNG).
    baseline_model_classes = (
        GetAllModel,
        RandomModel,
        LogisticModel,
        RandomForestModel,
        LGBoostModel,
    )
    for model_cls in baseline_model_classes:
        model = model_cls()
        model.fit(train_dataset)
        model.predict(test_dataset)
        # The loop hides which model failed in the traceback line, so the
        # message carries the class name explicitly.
        assert model.preds.shape == test_dataset.labels.shape, (
            "{}: preds shape {} != labels shape {}".format(
                model_cls.__name__,
                model.preds.shape,
                test_dataset.labels.shape,
            )
        )

    log.info("baseline_models tests passed!")
def run_tests(IC_DATA_DIR):
    """Smoke-test the latent models on the Instacart data in ``IC_DATA_DIR``.

    Fits ``UserModel``, ``TopUserModel``, ``word2vecModel``, ``TFIDFModel``
    and ``ProductModel`` on the train split and checks the shapes of their
    transformed outputs. For ``UserModel`` it additionally checks that
    decoding the encoder output matches the autoencoder output. Expected
    shapes are hard-coded for the testing data directory.

    Args:
        IC_DATA_DIR: directory handed to ``instacart_process`` as ``data_dir``.

    Raises:
        AssertionError: if any check fails.
    """
    from lib.process_data import instacart_process
    from lib.data_class import DataSet

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert test_dataset.order_df.shape == (6093, 4)

    # create user latent model and fit to train_dataset
    user_latent = UserModel()
    user_latent.fit(train_dataset, epochs=2)
    assert user_latent.transform(train_dataset).shape == (9447, 32)

    # test encoding and decoding works as expected
    encoded_upm = user_latent.encoder.predict(train_dataset.user_prod_matrix)
    decoded_upm = user_latent.decoder.predict(encoded_upm)
    autoencoded_upm = user_latent.autoencoder.predict(
        train_dataset.user_prod_matrix)
    assert encoded_upm.shape == (154, 32)
    assert decoded_upm.shape == (154, 6126)
    # Check the shapes first so np.allclose cannot silently broadcast.
    assert autoencoded_upm.shape == decoded_upm.shape
    # Two-step (encode, then decode) and one-step (autoencoder) predictions
    # are floating-point outputs; compare within a tight tolerance instead of
    # bit-for-bit, which can fail spuriously on last-bit differences.
    assert np.allclose(decoded_upm, autoencoded_upm)
    # For manual inspection, compare train_dataset.user_prod_matrix[:5, :6]
    # against autoencoded_upm[:5, :6].

    # create topological user latent model and fit to train_dataset
    top_user_latent = TopUserModel()
    top_user_latent.fit_transform(train_dataset, epochs=2)
    assert top_user_latent.transform(test_dataset).shape == (2767, 219)

    # transform test_dataset
    assert user_latent.transform(test_dataset).shape == (2767, 32)

    # create word2vec latent model and fit to train_dataset and transform
    # test_dataset
    # TODO: not retraining correctly...
    w2v_model = word2vecModel()
    w2v_model.fit(train_dataset)
    log.debug(w2v_model.model)
    assert w2v_model.transform(test_dataset).shape == (2767, 100)

    # create TFIDF latent model and fit to train_dataset and transform
    # test_dataset
    tfidf_model = TFIDFModel()
    tfidf_model.fit(train_dataset)
    assert tfidf_model.transform(test_dataset).shape == (2767, 20)

    # create product latent model (combination of previous two) and fit to
    # train_dataset and transform test_dataset
    product_latent = ProductModel()
    product_latent.fit(train_dataset)
    assert product_latent.transform(test_dataset).shape == (2767, 10)

    log.info("latent_models tests passed!")
def run_tests(IC_DATA_DIR):
    """Smoke-test ``NonTopModel`` and ``TopModel`` on the Instacart data.

    Pre-fits a ``UserModel`` and a ``ProductModel`` on the train split, builds
    a ``MainFeatureModel``, then fits each main model with
    ``fit_latent=False`` and checks the shape of its predictions on the test
    split. Expected shapes are hard-coded for the testing data directory.

    Args:
        IC_DATA_DIR: directory handed to ``instacart_process`` as ``data_dir``.

    Raises:
        AssertionError: if any check fails.
    """
    from lib.process_data import instacart_process
    from lib.data_class import DataSet
    from models.latent_models import UserModel, ProductModel
    from models.feature_models import MainFeatureModel

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert test_dataset.order_df.shape == (6093, 4)

    # create user latent model, fit and transform
    user_latent = UserModel()
    user_latent.fit(train_dataset, epochs=2)
    assert user_latent.transform(train_dataset).shape == (9447, 32)

    # create product latent model, fit and transform
    product_latent = ProductModel()
    product_latent.fit(train_dataset)
    assert product_latent.transform(train_dataset).shape == (9447, 10)

    # create feature model
    feature_model = MainFeatureModel()
    X = feature_model.transform(train_dataset)
    assert X.shape == (9447, 8)

    # fit non-top model to train_dataset
    model = NonTopModel(user_latent_model=user_latent,
                        product_latent_model=product_latent,
                        feature_model=feature_model)
    model.fit(train_dataset, fit_latent=False, epochs=2)
    # presumably 32 (user) + 10 (product) + 8 (features); verify against
    # NonTopModel if the component widths change
    assert model.input_dim == 50

    # predict on test_dataset
    model.predict(test_dataset)
    assert model.preds.shape == (2767, )
    assert model.preds.shape == test_dataset.labels.shape

    # fit top model to train_dataset
    model = TopModel(user_latent_model=user_latent,
                     product_latent_model=product_latent,
                     feature_model=feature_model)
    model.fit(train_dataset, fit_latent=False, epochs=2)
    # No expected value is pinned for the top model's input width; report it
    # through the logger rather than a bare print to stdout.
    log.debug("TopModel input_dim: %s", model.input_dim)

    # predict on test_dataset
    model.predict(test_dataset)
    assert model.preds.shape == (2767, )
    assert model.preds.shape == test_dataset.labels.shape

    log.info("main_models tests passed!")