def test_winsorize(self):
    # Exercise winsorize_quantTrans on a single store / single SKU frame.
    # Each case is (input quantities, keyword arguments for the call,
    # quantities expected on rate.quantity afterwards). Cases run in order
    # and the first failing assertion aborts the test.
    cases = (
        # Winsorizing at the top and bottom 20%
        ([1, 2, 3, 4, 5],
         {'lower': 0.2, 'upper': 0.2},
         [2, 2, 3, 4, 4]),
        # Winsorizing at the top and bottom 50% with zeros being ignored
        ([0, 0, 3, 4, 5],
         {'lower': 0.5, 'upper': 0.5},
         [0, 0, 4, 4, 4]),
        # Winsorizing at the top and bottom 20% with zeros being included
        ([0, 2, 3, 4, 5],
         {'lower': 0.2, 'upper': 0.2, 'ignore_zero': False},
         [2, 2, 3, 4, 4]),
    )
    for quantities, kwargs, expected in cases:
        frame = pd.DataFrame({
            'TDLINX_STORE_CD': ['A1234'] * 5,
            'MASTER_PKG_SKU_CD': ['1234'] * 5,
            'L90_TY_QTY': quantities,
        })
        rate = ratings(frame)
        rate.winsorize_quantTrans(**kwargs)
        # Count element-wise matches; all five positions must agree.
        matches = rate.quantity == np.array(expected)
        self.assertEqual(sum(matches), 5)
def test_sparseMatrix(self):
    # Check that sparse_matrix() pivots store/SKU/quantity rows into a
    # 2 x 3 matrix, both when SKUs appear in the same order for every
    # store and when the order differs between stores.
    def pivot(sku_codes):
        # Build the fixture frame for the given SKU column and pivot it.
        frame = pd.DataFrame({
            'TDLINX_STORE_CD': ['A1234', 'A1234', 'B1234', 'B1234', 'B1234'],
            'MASTER_PKG_SKU_CD': sku_codes,
            'L90_TY_QTY': [10, 20, 30, 40, 50],
        })
        rate = ratings(frame)
        return rate.sparse_matrix()

    # Check that data is being properly pivoted
    sp_matrix = pivot(['1234', '5678', '1234', '5678', '9012'])
    self.assertTrue(sp_matrix.shape == (2, 3))
    expected = np.array([[10., 20., 0.], [30., 40., 50.]])
    cell_matches = (sp_matrix.toarray() == expected).ravel()
    self.assertEqual(sum(cell_matches), 6)

    # Check that data is properly pivoted when the SKUs are not listed
    # in the same order for each store
    sp_matrix = pivot(['1234', '5678', '5678', '1234', '9012'])
    expected = np.array([[10., 20., 0.], [40., 30., 50.]])
    cell_matches = (sp_matrix.toarray() == expected).ravel()
    self.assertEqual(sum(cell_matches), 6)
def __init__(self):
    # Build (or load from cache) self.matrix: a dict mapping each movie id
    # found in movies.gen_mat to one flat feature row made of
    #   genre row + year row + tag row + rating row.
    # A movie missing from the year, tag or rating table is padded with
    # zeros of that table's row width.
    #
    # The result is cached in 'movie_matrix.p' in the current working
    # directory; when that file exists it is loaded and nothing is rebuilt.
    fname = 'movie_matrix.p'
    if os.path.isfile(fname):
        # NOTE: pickle.load can execute arbitrary code from the file. This
        # is only safe while the cache is produced locally by this class.
        with open(fname, "rb") as cache_file:
            self.matrix = pickle.load(cache_file)
        return

    def row_width(table):
        # Length of one row of `table`, or 0 when the table is empty.
        return len(next(iter(table.values()), ()))

    # os.path.join yields 'ds\\movies.csv' on Windows (as before) and a
    # valid 'ds/movies.csv' elsewhere.
    mvs = movies(os.path.join('ds', 'movies.csv'))
    rats = ratings(os.path.join('ds', 'ratings.csv'))
    tgs = tags(os.path.join('ds', 'tags.csv'))

    # Zero padding is the same for every movie, so build it once. Sharing
    # these lists is safe: `+` below always creates a new list.
    no_year = [0] * row_width(mvs.year_mat)
    no_tags = [0] * row_width(tgs.tag_mov_mat)
    no_ratings = [0] * row_width(rats.mov_rat_mat)

    matrix = {}
    for movie_id, genre_row in mvs.gen_mat.items():
        year_row = mvs.year_mat[movie_id] if movie_id in mvs.year_mat else no_year
        tag_row = tgs.tag_mov_mat[movie_id] if movie_id in tgs.tag_mov_mat else no_tags
        rating_row = rats.mov_rat_mat[movie_id] if movie_id in rats.mov_rat_mat else no_ratings
        matrix[movie_id] = genre_row + year_row + tag_row + rating_row

    # Close the cache file deterministically so the pickle is fully flushed.
    with open(fname, "wb") as cache_file:
        pickle.dump(matrix, cache_file)
    self.matrix = matrix
def test_natLog(self):
    # Check that natLog_rateTrans puts the ratings on a natural-log scale.
    # The expected total 3.178 equals ln(1) + ln(1) + ln(2) + ln(3) + ln(4)
    # rounded to three places, i.e. it is consistent with 1 being added to
    # each quantity before the log is taken.
    # NOTE(review): confirm against natLog_rateTrans itself.
    quantities = [0, 0, 1, 2, 3]
    frame = pd.DataFrame({
        'TDLINX_STORE_CD': ['A1234'] * 5,
        'MASTER_PKG_SKU_CD': ['1234'] * 5,
        'L90_TY_QTY': quantities,
    })
    rate = ratings(frame)
    rate.natLog_rateTrans()
    total = sum(rate.rating)
    self.assertEqual(round(total, 3), 3.178)
def test_percStore(self):
    # Check that percStore_rateTrans is working properly: for a single
    # store whose quantities total 100, each rating is expected to be the
    # quantity divided by 100.
    # NOTE(review): presumably the rating is the share of the store's total
    # quantity; confirm against percStore_rateTrans.
    frame = pd.DataFrame({
        'TDLINX_STORE_CD': ['A1234'] * 5,
        'MASTER_PKG_SKU_CD': ['1234'] * 5,
        'L90_TY_QTY': [20, 50, 15, 10, 5],
    })
    rate = ratings(frame)
    rate.percStore_rateTrans()
    expected = np.array([0.20, 0.50, 0.15, 0.10, 0.05])
    # Compare with a tolerance instead of exact float equality, so the test
    # does not depend on how the division is carried out. A failure reports
    # the mismatching values rather than just a match count.
    np.testing.assert_allclose(rate.rating, expected)
def test_binary(self):
    # Exercise binary_rateTrans on the same quantities with two settings.
    # Each case is (keyword arguments for the call, expected rate.rating).
    quantities = [-1, -2, 0, 4, 5]
    cases = (
        # Values are converted to 0,1 based on a 0 threshold (defaults)
        ({}, [0, 0, 0, 1, 1]),
        # Values are converted to -2,2 based on a 2 threshold
        ({'p': 2, 'n': -2, 'thresh': 2}, [-2, -2, -2, 2, 2]),
    )
    for kwargs, expected in cases:
        frame = pd.DataFrame({
            'TDLINX_STORE_CD': ['A1234'] * 5,
            'MASTER_PKG_SKU_CD': ['1234'] * 5,
            'L90_TY_QTY': list(quantities),
        })
        rate = ratings(frame)
        rate.binary_rateTrans(**kwargs)
        # All five positions must match the expected labels.
        matches = rate.rating == np.array(expected)
        self.assertEqual(sum(matches), 5)
def test_lte(self):
    # Exercise lte_quantTrans with its defaults and with an explicit
    # threshold and replacement value. Each case is
    # (input quantities, keyword arguments, expected rate.quantity).
    cases = (
        # Negatives are being converted and assigned to zero
        ([0, -1, 1, -0.1, -10], {}, [0, 0, 1, 0, 0]),
        # Values at or below a threshold of 1 are being converted to -1
        ([1, -1, 10, -0.1, -10],
         {'thresh': 1, 'value': -1},
         [-1, -1, 10, -1, -1]),
    )
    for quantities, kwargs, expected in cases:
        frame = pd.DataFrame({
            'TDLINX_STORE_CD': ['A1234'] * 5,
            'MASTER_PKG_SKU_CD': ['1234'] * 5,
            'L90_TY_QTY': quantities,
        })
        rate = ratings(frame)
        rate.lte_quantTrans(**kwargs)
        # All five positions must match the expected quantities.
        matches = rate.quantity == np.array(expected)
        self.assertEqual(sum(matches), 5)
import inputdata
import ratings

# Build a ratings object from the raw scores shipped in `inputdata`,
# compute its recommendations and print them.
data = inputdata.raw_scores
r = ratings.ratings(data)
result = r.recommendations()
# print() with a single argument behaves the same on Python 2 and is
# required on Python 3, where the `print result` statement is a SyntaxError.
print(result)
# Keep only products whose stock type code is 'S' or 'D'.
prod_info = prod_info[prod_info['STOCK_TYPE_CD'].isin(['S', 'D'])]
# Distinct master SKU codes (as strings) and distinct master package codes.
master_sku = [str(sku) for sku in prod_info.MASTER_SKU_CD.unique()]
master_pkg = list(prod_info.MASTER_PKG_CD.unique())

#Load depletions data
data = civis.io.read_civis(
    table="cbi.IL_AL_AK_OFF_L90",
    database="Constellation Brands",
    use_pandas=True,
)

mkts = ['AL', 'AK', 'IL']
for m in mkts:
    # NOTE(review): the source was flattened onto one line; every statement
    # after the `for` is assumed to belong to the loop body -- confirm.
    # Select this market's rows and drop the market column before building
    # the ratings object.
    market_rows = data[data['mkt_cd'] == m]
    data_ratings = ratings(
        market_rows.drop('mkt_cd', axis=1),
        quantCol='l90_ty_qty',
        storeCol='tdlinx_store_cd',
        productCol='master_pkg_sku_cd',
    )

    #Data preprocessing
    data_ratings.lte_quantTrans()  #Remove negatives
    data_ratings.winsorize_quantTrans()  #Winsorize
    data_ratings.natLog_rateTrans()  #Convert to natural log scale
    data_sparse = data_ratings.sparse_matrix()

    #Prediction
    os.environ['OPENBLAS_NUM_THREADS'] = '1'
    # Hyper-parameter grid: candidate values per parameter name.
    param = {
        'alpha': [1, 10, 100],
        'factors': [10, 20, 40, 80],
        'regularization': [0.001, 0.1],
    }
key = 'Rec_Eng/Product_info.csv' path = 'Product_info.csv' cb_s3.pull_file_from_s3(key, path) prod_info = pd.read_csv(path, encoding="ISO-8859-1") prod_info = prod_info[prod_info['STOCK_TYPE_CD'].isin(['S', 'D'])] master_sku = list(prod_info.MASTER_SKU_CD.unique()) master_sku = [str(sku) for sku in master_sku] master_pkg = list(prod_info.MASTER_PKG_CD.unique()) #Load depletions data data = civis.io.read_civis(table="cbi.IL_OFF_L90", database="Constellation Brands", use_pandas=True) il_off_data = ratings(data, quantCol='l90_ty_qty', storeCol='tdlinx_store_cd', productCol='master_pkg_sku_cd') #Data preprocessing il_off_data.lte_quantTrans() #Remove negatives il_off_data.winsorize_quantTrans() #Winsorize il_off_data.natLog_rateTrans() #Convert to natural log scale il_sparse = il_off_data.sparse_matrix() #Prediction import os os.environ['OPENBLAS_NUM_THREADS'] = '1' param = {'alpha': [1, 100], 'factors': [10, 80], 'regularization': [0.001]} d_test, opt_model, pred = implicit2.grid_search(il_sparse, param, itera=1,