def fit_and_predict(self, KPM, user_info=0, method="Naive"):
    """
    Score every customer/product pair of ``KPM`` with a Bayes-rule estimate.

    The score written for customer ``k`` and product ``p`` is
    ``P(x | y) * P(y) / P(x)``. ``x`` is the set of products customer ``k``
    bought (entries of ``KPM[k]`` equal to 1), ``P(y)`` is taken from
    ``self.get_prod_occ(KPM, n_customers)[p]``, ``P(x)`` is estimated on all
    rows of ``KPM`` and ``P(x | y)`` on the rows that contain a 1 for
    product ``p``. Pairs with ``P(x) == 0`` and products without any buyer
    keep the score 0 (the latter matches the guard in ``predict``).

    NOTE(review): ``get_prod_occ`` is defined outside this block; it is
    presumably the per-product purchase frequency - confirm.

    :param KPM: 2-D array indexed ``[customer, product]``
    :param user_info: not used by this method
    :param method: estimator for the pattern probabilities, one of
                   "Naive", "Approx", "Squared" or "Empirical"
    :return: float array with the shape of ``KPM`` holding the scores
    :raises ValueError: if ``method`` is not one of the supported names
    """

    def pattern_probability(matrix, occurrences, n_rows, buy_list):
        # Probability estimate of the purchase pattern `buy_list` among the
        # `n_rows` rows of `matrix`, according to the selected `method`.
        if method == "Naive":
            probability = 1
            for index in buy_list:
                probability *= occurrences[index]
            return probability
        if method == "Approx":
            return np.sum(matrix[:, buy_list],
                          axis=None) / (len(buy_list) * n_rows)
        if method == "Squared":
            return sum((np.sum(matrix[:, buy_list], axis=1) /
                        len(buy_list))**2) / n_rows
        if method == "Empirical":
            return len(
                np.argwhere((np.sum(matrix[:, buy_list], axis=1) /
                             len(buy_list)) == 1)) / n_rows
        raise ValueError("unknown method: " + repr(method))

    n_Kunden, n_Produkte = np.shape(KPM)
    prod_occ = self.get_prod_occ(KPM, n_Kunden)
    # Scores are fractions; an integer KPM must not dictate the result dtype,
    # otherwise every score would be truncated on assignment.
    predictions = np.zeros_like(KPM, dtype=float)

    for kunden_index in range(n_Kunden):
        if kunden_index == 0:
            load = loader(full=n_Kunden, message="predict")
        load.print_progress(kunden_index)

        kunden_buy_list = np.argwhere(KPM[kunden_index] == 1)[:, 0]
        # P(x) depends only on the customer, so it is computed once per row.
        P_x = pattern_probability(KPM, prod_occ, n_Kunden, kunden_buy_list)
        if P_x == 0:
            continue  # the whole row keeps its initial score of 0

        for prod_index in range(n_Produkte):
            item_buy_list = np.argwhere(KPM[:, prod_index] == 1)[:, 0]
            reduced_KPM = KPM[item_buy_list]
            n_reduced_Kunden = len(reduced_KPM)
            if n_reduced_Kunden == 0:
                # Nobody bought this product: P(x | y) is undefined and would
                # divide by zero, so the score stays 0.
                continue
            reduced_prod_occ = self.get_prod_occ(reduced_KPM,
                                                 n_reduced_Kunden)
            P_x_if_y = pattern_probability(reduced_KPM, reduced_prod_occ,
                                           n_reduced_Kunden, kunden_buy_list)
            predictions[kunden_index,
                        prod_index] = P_x_if_y * prod_occ[prod_index] / P_x
    return predictions
def predict(self, ds=0):
    """
    Predict scores for the configured prediction set and persist them.

    With approach "multi" one stored model (``<dataset>_model``) scores all
    products at once. With approach "binary" one stored model per product
    (``<dataset>_model_no_<index>``) is applied to the matrix with that
    product's column removed, and column 1 of its output is stored as the
    product's score.

    The result is saved to
    ``<dataset>/npy_files/<model_name>_predictions.npy``, split into batches
    through ``ds.save_batches`` (the batch count is stored in
    ``self.config["n_pred_batches"]``), and the configuration is written to
    ``<dataset>/json_files/config.json``.

    :param ds: object providing ``get_value(name, set)`` and
               ``save_batches(data=..., names=..., batch_size=...)``
    :return: None
    :raises ValueError: if the configured approach or model is not supported
    """
    approach = self.config["approach"]
    model_kind = self.config["model"]
    if model_kind not in ("DeepLearning", "NaiveBayes"):
        raise ValueError("unsupported model: " + repr(model_kind))

    def class_scores(model, features):
        # The two model families expose their scores under different names.
        if model_kind == "DeepLearning":
            return model.predict(features)
        return model.predict_proba(features)

    pred_set = self.config["pred_set"]
    if approach == "multi":
        x = ds.get_value("KPM", pred_set)
        if self.config["use_user_info"]:
            x = np.hstack((x, ds.get_value("info", pred_set)))
        model = self.load_model(self.config["dataset"] + "_model")
        prediction = class_scores(model, x)
    elif approach == "binary":
        KPM = ds.get_value("KPM", pred_set)
        # Explicit float dtype: with an integer KPM, zeros_like would create
        # an integer array and truncate every stored probability.
        prediction = np.zeros_like(KPM, dtype=float)
        if self.config["use_user_info"]:
            KPM = np.hstack((KPM, ds.get_value("info", pred_set)))
        n_products = self.config["n_Produkte"]
        for index in range(n_products):
            if self.config["show_progress"]:
                if index == 0:
                    load = loader(n_products, "predict")
                load.print_progress(index)
            x = np.delete(KPM, index, axis=1)
            model = self.load_model(self.config["dataset"] + "_model_no_" +
                                    str(index))
            prediction[:, index] = class_scores(model, x)[:, 1]
    else:
        raise ValueError("unsupported approach: " + repr(approach))

    title = self.config["model_name"] + "_predictions"
    np.save(self.config["dataset"] + "/npy_files/" + title, prediction)
    self.config["n_pred_batches"] = ds.save_batches(
        data=[prediction],
        names=["prediction"],
        batch_size=self.config["pred_batch_size"])
    with open(self.config["dataset"] + "/json_files/config.json", "w") as fp:
        json.dump(self.config, fp, indent=5)
def save_batches(self, data=[], names=[], batch_size=100):
    """
    Split every entry of ``data`` into consecutive batches and save them.

    Batch ``i`` of ``data[k]`` is written with ``np.save`` to
    ``self.config["dataset"] + "/batches/" + names[k] + "_batch_no_" + str(i) + ".npy"``.
    All entries of ``data`` must have the same length; otherwise a message
    is printed and the process exits with a non-zero status.

    :param data: sequence of equally long, sliceable collections
    :param names: file-name stem for each entry of ``data`` (same order)
    :param batch_size: maximum number of items per batch
    :return: number of batches written per entry of ``data``
    """
    len_check = len(data[0])
    for dat in data[1:]:
        if len(dat) != len_check:
            print(
                "shapes dont fit for method save_batches. data should be a tuple of iterables of the same length "
            )
            # This is a failure, so do not report success to the caller's shell.
            sys.exit(1)

    # Ceiling division. round(x + 0.5) is not a ceiling: Python rounds halves
    # to the nearest even number, which produced a spurious empty batch
    # whenever len_check / batch_size was an odd whole number.
    n_batches = -(-len_check // batch_size)

    path_prefix = self.config["dataset"] + "/batches/"
    for batch_index in range(n_batches):
        if self.config["show_progress"]:
            if batch_index == 0:
                load = loader(n_batches, "save_batches")
            load.print_progress(batch_index)

        start = batch_index * batch_size
        stop = start + batch_size  # slicing clips this for the last batch
        for dat, name in zip(data, names[:len(data)] if len(names) >= len(data) else [names[i] for i in range(len(data))]):
            np.save(
                path_prefix + name + "_batch_no_" + str(batch_index) + ".npy",
                dat[start:stop])
    return n_batches
def export_as_csv_in_tableau_format(data_server, config={}):
    """
    Export the batched predictions in long format to CSV and/or a database.

    For every prediction batch the files ``prediction``, ``KPM`` and
    ``indexes`` ``_batch_no_<i>.npy`` are loaded from ``<dataset>/batches/``
    and flattened into one row per (client, content) pair with the columns
    ``client``, ``content``, ``propability`` and ``already_bought``.

    * ``config["save_result_as_csv"]``: rows are written to
      ``Tableau_exports/<model_name>_predictions.csv``. The first batch
      creates the file, later batches are appended and ``Row_index``
      continues across batches (previously each batch overwrote the file, so
      only the last batch survived).
    * ``config["save_result_to_db"]``: rows, prefixed with a run id, are
      passed to ``insert_to_table``.

    The target table is dropped (if present) and re-created on every call.

    :param data_server: not used by this function
    :param config: run configuration; reads ``model_name``,
                   ``n_pred_batches``, ``show_progress``, ``dataset``,
                   ``pred_batch_size``, ``split``, ``save_result_as_csv``
                   and ``save_result_to_db``. ``pred_batch_size`` is
                   overwritten when a non-final batch is shorter than it.
    :return: None
    """
    # SECURITY(review): the connection credentials are hard-coded in source.
    # They are kept unchanged here to preserve behaviour; move them to a
    # secret store or environment configuration.
    db_config = {
        "database": {
            "host": "192.168.178.14",
            "user": "******",
            "passwd": "$moothOperat0r",
            "database": "dsaas"
        },
        "schema_name": "dbt_recommender_system_predictions",
        "table_name": "tb_b_recommendations_" + config["model_name"],
        "key_list":
        ['run_id', 'client', 'content', 'propability', 'already_bought'],
        "dtype_list": ["int8", "int", "int", "float", "int"],
        "primary_key": None,
        "auto_increment": None
    }
    run_id = to_integer(datetime.datetime.now())

    if table_exists(db_config) == 1:
        delete_table(db_config)
    create_table(db_config)

    n_batches = config["n_pred_batches"]
    batch_dir = config["dataset"] + "/batches/"
    title = config["model_name"] + "_predictions"
    rows_exported = 0  # running row count, keeps Row_index unique in the CSV

    for batch_index in range(n_batches):
        if config["show_progress"] and n_batches != 1:
            if batch_index == 0:
                load = loader(n_batches, "export")
            load.print_progress(batch_index)

        suffix = "_batch_no_" + str(batch_index) + ".npy"
        predictions = np.load(batch_dir + "prediction" + suffix)
        KPM = np.load(batch_dir + "KPM" + suffix)
        indexes = np.load(batch_dir + "indexes" + suffix)

        if len(predictions) < config[
                "pred_batch_size"] and batch_index != n_batches - 1:
            print((' prediction batch size ("pred_batch_size" = %d) is too high.'
                   ' change value in config file.'
                   ' For now it has been set to the length of the prediction. \n')
                  % config["pred_batch_size"])
            config["pred_batch_size"] = len(predictions)

        n_Kunden, n_Produkte = predictions.shape
        rows = {
            "client": [],
            "content": [],
            "propability": [],
            "already_bought": []
        }
        use_client_ids = config["split"] == "clients"
        for k in range(n_Kunden):
            if config["show_progress"] and n_batches == 1:
                if k == 0:
                    load = loader(
                        n_Kunden, "export batch: " + str(batch_index + 1) +
                        " von" + str(n_batches))
                load.print_progress(k)
            # NOTE(review): without the "clients" split the client id is the
            # row position inside the current batch, so ids repeat across
            # batches - confirm whether a batch offset is intended.
            client = indexes[k] if use_client_ids else k
            for p in range(n_Produkte):
                rows["client"].append(client)
                rows["already_bought"].append(KPM[k, p])
                rows["content"].append(p)
                rows["propability"].append(predictions[k, p])

        n_rows = n_Kunden * n_Produkte
        df = pd.DataFrame(rows,
                          index=pd.RangeIndex(start=rows_exported,
                                              stop=rows_exported + n_rows))
        rows_exported += n_rows

        if config["save_result_as_csv"]:
            first_batch = batch_index == 0
            df.to_csv("Tableau_exports/" + title + ".csv",
                      index_label="Row_index",
                      sep=";",
                      mode="w" if first_batch else "a",
                      header=first_batch)
        if config["save_result_to_db"]:
            r = df.values.shape[0]
            values = np.hstack((np.ones((r, 1)) * run_id, df.values))
            insert_to_table(db_config, values, config["show_progress"])
def predict(self, test_KPM, method):
    """
    Score every customer/product pair of ``test_KPM`` with a Bayes-rule estimate.

    The score written for test customer ``k`` and product ``p`` is
    ``P(x | y) * P(y) / P(x)``. ``x`` is the set of products the customer
    bought (entries of ``test_KPM[k]`` equal to 1), ``P(y)`` is
    ``self.prod_occ[p]``, ``P(x)`` is estimated on ``self.train_KPM`` and
    ``P(x | y)`` on the training rows that contain a 1 for product ``p``.
    Pairs with ``P(x) == 0`` and products without any training buyer keep
    the score 0.

    NOTE(review): ``get_prod_occ`` is defined outside this block; it is
    presumably the per-product purchase frequency - confirm.

    :param test_KPM: 2-D array indexed ``[customer, product]``
    :param method: estimator for the pattern probabilities, one of
                   "Naive", "Approx", "Squared" or "Empirical"
    :return: float array with the shape of ``test_KPM`` holding the scores
    :raises ValueError: if ``method`` is not one of the supported names
    """

    def pattern_probability(matrix, occurrences, n_rows, buy_list):
        # Probability estimate of the purchase pattern `buy_list` among the
        # `n_rows` rows of `matrix`, according to the selected `method`.
        if method == "Naive":
            probability = 1
            for index in buy_list:
                probability *= occurrences[index]
            return probability
        if method == "Approx":
            return np.sum(matrix[:, buy_list],
                          axis=None) / (len(buy_list) * n_rows)
        if method == "Squared":
            return sum((np.sum(matrix[:, buy_list], axis=1) /
                        len(buy_list))**2) / n_rows
        if method == "Empirical":
            return len(
                np.argwhere((np.sum(matrix[:, buy_list], axis=1) /
                             len(buy_list)) == 1)) / n_rows
        raise ValueError("unknown method: " + repr(method))

    # Scores are fractions; an integer test_KPM must not dictate the result
    # dtype, otherwise every score would be truncated on assignment.
    predictions = np.zeros_like(test_KPM, dtype=float)
    n_test_Kunden, n_Produkte = test_KPM.shape

    for kunden_index in range(n_test_Kunden):
        if kunden_index == 0:
            load = loader(full=n_test_Kunden, message="predict")
        load.print_progress(kunden_index)

        kunden_buy_list = np.argwhere(test_KPM[kunden_index] == 1)[:, 0]
        # P(x) depends only on the customer, so it is computed once per row.
        P_x = pattern_probability(self.train_KPM, self.prod_occ,
                                  self.n_Kunden, kunden_buy_list)
        if P_x == 0:
            continue  # the whole row keeps its initial score of 0

        for prod_index in range(n_Produkte):
            item_buy_list = np.argwhere(
                self.train_KPM[:, prod_index] == 1)[:, 0]
            reduced_KPM = self.train_KPM[item_buy_list]
            n_reduced_Kunden = len(reduced_KPM)
            if n_reduced_Kunden == 0:
                # No training customer bought this product; skip before
                # get_prod_occ is asked to divide by zero rows.
                continue
            reduced_prod_occ = self.get_prod_occ(reduced_KPM,
                                                 n_reduced_Kunden)
            P_x_if_y = pattern_probability(reduced_KPM, reduced_prod_occ,
                                           n_reduced_Kunden, kunden_buy_list)
            predictions[kunden_index, prod_index] = (
                P_x_if_y * self.prod_occ[prod_index] / P_x)
    return predictions
def eval(dataset,
         prediction_filename,
         split,
         set,
         threshold=0.5,
         top_n_test=True,
         top_n=20):
    """
    Print evaluation metrics for a stored prediction matrix.

    The ground truth is loaded from ``<dataset>/npy_files/`` and compared
    with 1: for ``split == "clients"`` the rows ``<set>_index.npy`` of
    ``full_KPM.npy`` are used, for ``split == "orders"`` the file
    ``<set>_KPM.npy``. The predictions are loaded from
    ``<dataset>/npy_files/<prediction_filename>``.

    If ``top_n_test`` is true, the share of purchases that appear among each
    client's ``top_n`` highest-scored products is printed. Afterwards MSE,
    log loss, accuracy, precision, recall, F1 and the confusion matrix of
    the flattened matrices are printed.

    :param dataset: dataset directory (used as path prefix)
    :param prediction_filename: file name of the prediction matrix
    :param split: "clients" or "orders"
    :param set: name of the data subset used in the file names
    :param threshold: scores above it count as positive; a negative value
                      selects ``1 / n_products`` instead
    :param top_n_test: whether to run the top-n hit test
    :param top_n: number of highest-scored products considered per client
    :return: None
    :raises ValueError: if ``split`` is neither "clients" nor "orders"
    """
    npy_dir = dataset + "/npy_files/"
    if split == "clients":
        indexes = np.load(npy_dir + set + "_index.npy")
        KPM = np.load(npy_dir + "full_KPM.npy")[indexes] == 1
    elif split == "orders":
        KPM = np.load(npy_dir + set + "_KPM.npy") == 1
    else:
        raise ValueError("unsupported split: " + repr(split))

    n_Kunden, n_Produkte = KPM.shape
    print("Kunden", n_Kunden)
    print("Produkte", n_Produkte)
    print("Interaktionen", np.sum(np.sum(KPM)))

    if threshold < 0:
        threshold = 1 / n_Produkte
    predictions = np.load(npy_dir + prediction_filename)
    classifications = predictions.flatten() > threshold

    if top_n_test:
        n_orders = np.sum(KPM, axis=None)
        n_hits = 0
        for client_index in range(n_Kunden):
            if client_index == 0:
                load = loader(len(predictions), "evaluation")
            load.print_progress(client_index)
            bought_items = np.argwhere(KPM[client_index] == 1)[:, 0]
            if bought_items.size == 0:
                continue
            # Rank once per client instead of once per bought item. A stable
            # ascending sort, reversed, orders by score descending and breaks
            # ties by the higher product index - the same order as sorting
            # (score, index) tuples in reverse.
            ranking = np.argsort(predictions[client_index],
                                 kind="stable")[::-1]
            n_hits += int(
                np.count_nonzero(np.isin(bought_items, ranking[:top_n])))
        score = n_hits / n_orders
        print(
            str(score * 100) + "%\t(", n_hits, "von", n_orders,
            ") \tder getätigten käufte sind in der top", top_n,
            "der Produktempfehlungen")

    KPM = KPM.flatten()
    print("Kpm flattened")
    flat_predictions = predictions.flatten()
    print(prediction_filename + ":")
    print("MSE", metrics.mean_squared_error(KPM, flat_predictions))
    print("neg_log_loss", metrics.log_loss(KPM, flat_predictions))
    print("Accuracy", metrics.accuracy_score(KPM, classifications))
    print("Precision", metrics.precision_score(KPM, classifications))
    print("Recall", metrics.recall_score(KPM, classifications))
    print("F1", metrics.f1_score(KPM, classifications))
    print("Confusion Matrix (tn,fp,fn,tp)")
    print(metrics.confusion_matrix(KPM, classifications))
def get_training_data(self, train_set):
    """
    Generate the training batches for the configured approach and save them.

    * approach "multi": every purchase (entry > 0) yields one sample whose
      features are the customer's row with that product set to 0 (plus the
      customer's info columns when ``use_user_info`` is set) and whose
      target is the product index. The batches are saved under the names
      "data" and "target".
    * approach "binary": for every product the batches
      "data_model_<n>" (matrix without that product's column, plus info
      columns when configured) and "target_model_<n>" (that column) are
      saved.

    ``self.config["n_train_batches"]`` is set to the value returned by the
    last ``save_batches`` call; ``self.config["n_info_cols"]`` is set when
    user info is used.

    :param train_set: name of the data subset passed to ``self.get_value``
    :return: None
    """
    use_user_info = self.config["use_user_info"]
    # Width of the info block that is actually appended to each sample. A
    # stale config["n_info_cols"] must not be used when no info is appended,
    # otherwise the seed rows below are wider than the real samples.
    n_info_cols = 0
    if use_user_info:
        user_info = self.get_value("info", train_set)
        n_info_cols = len(user_info[0])
        self.config["n_info_cols"] = n_info_cols

    KPM = self.get_value("KPM", train_set)
    n_k, _n_p = KPM.shape
    n_products = self.config["n_Produkte"]

    if self.config["approach"] == "multi":
        # One all-zero sample per product class is placed in front of the
        # real samples - presumably so that every class occurs in the
        # training data; confirm.
        target = list(range(n_products))
        data = [[0] * (n_products + n_info_cols) for _ in range(n_products)]

        for kunden_index in range(n_k):
            if self.config["show_progress"]:
                if kunden_index == 0:
                    load = loader(n_k, "save_batches")
                load.print_progress(kunden_index)
            for produkt_index in np.argwhere(KPM[kunden_index] > 0)[:, 0]:
                target.append(produkt_index)
                var_Kunde = np.array(KPM[kunden_index])
                var_Kunde[produkt_index] = 0  # hide the product to predict
                if use_user_info:
                    var_Kunde = np.hstack(
                        (var_Kunde, user_info[kunden_index]))
                data.append(var_Kunde)

        self.config["n_train_batches"] = self.save_batches(
            data=(data, target),
            names=["data", "target"],
            batch_size=self.config["train_batch_size"])

    elif self.config["approach"] == "binary":
        for prod_n in range(n_products):
            if self.config["show_progress"]:
                if prod_n == 0:
                    load = loader(full=n_products, message="save_batches")
                load.print_progress(prod_n)

            target = KPM[:, prod_n]
            data = np.delete(KPM, prod_n, axis=1)
            if use_user_info:
                data = np.hstack((data, user_info))

            # Silence the per-batch progress output of save_batches while the
            # per-product progress is shown; always restore the setting, even
            # if saving fails.
            show_progress = self.config["show_progress"]
            self.config["show_progress"] = False
            try:
                self.config["n_train_batches"] = self.save_batches(
                    data=[data, target],
                    names=[
                        "data_model_" + str(prod_n),
                        "target_model_" + str(prod_n)
                    ],
                    batch_size=self.config["train_batch_size"])
            finally:
                self.config["show_progress"] = show_progress
def fit(self, names=["data", "target"], ds=0):
    """
    Train and store the model(s) for the configured approach.

    * approach "multi": one model from ``self.get_model()`` is trained on
      the batch files
      ``<dataset>/batches/<names[0]|names[1]>_batch_no_<i>.npy`` and stored
      as ``<dataset>_model``. A "DeepLearning" model is fitted per batch on
      one-hot targets; any other model uses ``fit`` when there is exactly
      one batch and ``partial_fit`` (with all product classes) otherwise.
    * approach "binary": for every product a fresh model is trained to
      predict that product's column from the remaining columns of the fit
      set (plus info columns when ``use_user_info`` is set) and stored as
      ``<dataset>_model_no_<index>``.

    :param names: file-name stems of the feature and target batches
                  (approach "multi" only)
    :param ds: object providing ``get_value(name, set)``
               (approach "binary" only)
    :return: None
    """
    approach = self.config["approach"]

    if approach == "multi":
        model = self.get_model()
        n_products = self.config["n_Produkte"]
        classes = np.arange(n_products)
        n_batches = self.config["n_train_batches"]
        if n_batches:
            batch_dir = self.config["dataset"] + "/batches/"
            for batch_index in range(n_batches):
                if self.config["show_progress"]:
                    if batch_index == 0:
                        load = loader(n_batches, "train_batches")
                    load.print_progress(batch_index)
                suffix = "_batch_no_" + str(batch_index) + ".npy"
                x = np.load(batch_dir + names[0] + suffix)
                t = np.load(batch_dir + names[1] + suffix)
                if self.config["model"] == "DeepLearning":
                    # One-hot encode the product indices in one step.
                    T = np.zeros((len(t), n_products))
                    T[np.arange(len(t)), t] = 1
                    model.fit(
                        x,
                        T,
                        epochs=self.config["DeepLearning"]["n_epochs"],
                        verbose=True)
                elif n_batches == 1:
                    model.fit(x, t)
                else:
                    # Incremental training needs the full class list because
                    # a single batch may not contain every product.
                    model.partial_fit(x, t, classes)
        self.save_model(self.config["dataset"] + "_model", model)

    elif approach == "binary":
        KPM = None
        for index in range(self.config["n_Produkte"]):
            model = self.get_model()
            if self.config["show_progress"]:
                if index == 0:
                    load = loader(self.config["n_Produkte"], "train")
                load.print_progress(index)

            if KPM is None:
                # The training matrix is the same for every product, so it
                # is loaded and stacked once instead of once per product.
                KPM = ds.get_value("KPM", self.config["fit_set"])
                if self.config["use_user_info"]:
                    KPM = np.hstack(
                        (KPM, ds.get_value("info", self.config["fit_set"])))

            x = np.delete(KPM, index, axis=1)
            t = KPM[:, index]
            if self.config["model"] == "DeepLearning":
                # Two-column one-hot target: column 0 = not bought, 1 = bought.
                T = np.zeros((len(t), 2))
                T[np.arange(len(t)), t.astype(int)] = 1
                model.fit(x,
                          T,
                          epochs=self.config["DeepLearning"]["n_epochs"],
                          verbose=True)
            elif self.config["model"] == "NaiveBayes":
                model.fit(x, t)
            self.save_model(
                self.config["dataset"] + "_model_no_" + str(index), model)