    }, requests.codes.bad, "error"),
    ({
        "email": "peter@klaven"
    }, requests.codes.bad, "error"),
    ({
        "email": "1",
        "password": "******"
    }, requests.codes.not_found, "error"),
    ({}, requests.codes.bad, "error"),
]

ids_list = [
    "successful", "w/o email", "w/o password", "invalid email/password",
    "w/o parameters"
]

u = MyUtils()


@pytest.fixture(scope='function', params=param_list, ids=ids_list)
def param_test(request):
    return request.param


class TestClass():
    login_request = "api/login"
    email = "peter@klaven"
    password = "******"

    @allure.feature("Login")
    @allure.testcase("Login")
class Wrappers:
    def __init__(self, km, fh):
        self.utils_cl = MyUtils()
        self.km = km
        self.fh = fh
        self.cm = ClusteringMachine()
        logging.info("Wrappers instantiated")

    def launch_preds(
            self,
            data_fmt_clustering_train,
            data_fmt_clustering_valid_or_test,  # Train and test formatted to be fed to the classifier
            days_train,
            days_valid_or_test,
            l_ref,  # Days of train and test in a list
            cluster_and_values_train,  # To compute centroids
            nb_cluster,
            mean_day,
            fit,
            classifier):
        """
        Utility method used to launch predictions.

        This wrapper takes input from clustering or coclustering algorithms
        and generates the associated prediction MSEs. It works both for
        clustering and coclustering group creation.

        ARGS:
        * data_fmt_clustering_train: formatted data, ready to be fed into a classifier
        * data_fmt_clustering_valid_or_test: same, but with only the last 20% remaining
        * days_train: all days present in train
        * days_valid_or_test: all days present in test
        * cluster_and_values_train: dict(cluster_num => associated values)
        * nb_cluster: how many clusters for this pass
        * mean_day: mean day computed on train only
        * fit: classifier used
        * classifier: string label naming the classifier in use
        """
        # Separate target from data
        train_full = data_fmt_clustering_train.loc[
            :, data_fmt_clustering_train.columns != 'y'].sort_index()
        train_target = data_fmt_clustering_train.loc[:, "y"].sort_index()
        test_full = data_fmt_clustering_valid_or_test.loc[
            :, data_fmt_clustering_valid_or_test.columns != 'y'].sort_index()
        test_target = data_fmt_clustering_valid_or_test.loc[:, "y"].sort_index()

        # Fit the classifier on the training data
        fit.fit(train_full, train_target)
        res = {}

        # Retrieve cluster names (clusters are not numbered, they are
        # referenced by their given name)
        clusts_names = cluster_and_values_train.keys()
        (centroids, e) = self.km.compute_centroids(cluster_and_values_train,
                                                   days_train, l_ref)

        # NOTE: the lines below are indeed strange, but we had to follow
        # another API that was developed earlier. That is why the data is
        # formatted this way.
        y_pred = pd.DataFrame(fit.predict_proba(test_full),
                              columns=fit.classes_)
        y_pred["Predictedy"] = y_pred.idxmax(axis=1)
        y_pred.columns = ["Proby" + str(col) for col in y_pred.columns]
        y_pred.rename(columns={'ProbyPredictedy': 'Predictedy'}, inplace=True)

        # 1. With MODL & probabilistic prediction
        res[False] = self.km.process_pred_proba(y_pred, test_target,
                                                clusts_names, centroids,
                                                days_valid_or_test, nb_cluster,
                                                l_ref, mean_day)

        # 2. Oracle
        y_pred_or = y_pred.copy().reset_index(drop=True)
        y_pred_or["Predictedy"] = test_target.reset_index(drop=True)
        res[True] = self.km.process_pred_proba(y_pred_or, test_target,
                                               clusts_names, centroids,
                                               days_valid_or_test, nb_cluster,
                                               l_ref, mean_day, o=True)
        return (e, res)

    def simplify_coclus(self, n_clus_found, n_clus_target, mpi, mcn,
                        file_name_train, path_file_test, file_name_root,
                        wlabel):
        """
        Ad-hoc method to simplify a Khiops coclustering json file.
        """
        # Loop configuration
        to_add = 5
        to_remove = 2
        mcn_init_ok = False

        # First, try to reach the desired number of clusters!
logging.info("Keep %s percent of info", mpi) logging.info("Desired nb of cluster is %s", n_clus_target) identifier = mpi # While we do not reach the good number of cluster all_mpis = [] while n_clus_found != n_clus_target: if mcn == 1 and len(all_mpis) < 20: logging.info("Simplifying using MPI") simplified_file = self.km.simplify_coclustering( file_name_train, mpi=mpi) n_clus_found = self.km.get_cluster_number(file_name_train, ref_id=mpi) logging.info( "Found %s clusters in simplified coclus file (expected %s)", n_clus_found, n_clus_target) all_mpis.append(mpi) if n_clus_found > n_clus_target: self.fh.rm_simplified_outputs(file_name_train, mpi) mpi = mpi - to_remove logging.info( "MPI was apparently too big, decreasing its size of -%s: mpi = %s", to_remove, mpi) if mpi in all_mpis: to_remove = to_remove / 2 to_add = to_add / 2 elif n_clus_found < n_clus_target: self.fh.rm_simplified_outputs(file_name_train, mpi) mpi = mpi + to_add logging.info( "MPI was apparently too small, increasing its size of +%s: mpi = %s", to_add, mpi) identifier = mpi else: logging.info("Now simplifying using MCN") # Init new loop: should use cell number now because it is more # precise than the latest if not mcn_init_ok: self.km.simplify_coclustering(file_name_train, mpi=mpi) mcn = self.km.get_cells_number(file_name_train, mpi) to_add = math.floor(mcn * 0.2437) # empirical coefficient to_remove = math.floor(mcn * 0.097) mcn = mcn + to_add # once initialised, we do not want to re run above mcn_init_ok = True simplified_file = self.km.simplify_coclustering( file_name_train, mcn=mcn) n_clus_found = self.km.get_cluster_number(file_name_train, ref_id=mcn) logging.info( "Found %s clusters in simplified coclus file (expected %s)", n_clus_found, n_clus_target) if n_clus_found > n_clus_target: self.fh.rm_simplified_outputs(file_name_train, mcn) mcn = mcn - to_remove logging.info( "MCN was apparently too big, decreasing its size of -%s: mcn = %s", to_remove, mcn) elif n_clus_found < n_clus_target: self.fh.rm_simplified_outputs(file_name_train, mcn) mcn = mcn + to_add logging.info( "MCN was apparently too small, increasing its size of +%s: mcn = %s", to_add, mcn) identifier = mcn # Then, manage khiops files _, cluster_and_values_train = self.km.get_clusters(simplified_file) # Write labels of test file for future MODL transfert path_test_labels = self.fh.write_labels(path_file_test, identifier) # Write cluster and values on disk to compare clustering results self.fh.write_cav_on_disk(cluster_and_values_train, file_name_root, n_clus_found, wlabel) # Deploy dic and use deployed dic to transfer (transfer = actually use # MODL model created, fm) path_deployed_dic = self.km.deploy_coclustering( file_name_train, identifier) path_transfered = self.km.transfer_database(path_deployed_dic, path_test_labels, path_file_test, identifier) # Retrieve clusters attribution from khiops use of MODL cluster_and_values_test = self.km.get_clusters_from_dep( path_transfered) return ((mpi, mcn, n_clus_found), (cluster_and_values_train, cluster_and_values_test), simplified_file) def modl_wrap(self, df_train, df_valid_or_test, l_ref, mean_day, fit, classifier, cluster_and_values_train, cluster_and_values_valid_or_test, n_clus_found): # Create datasets before training a classifier data_fmt_train = self.utils_cl.format_data_for_classifier( df_train, cluster_and_values_train, "days_train") data_fmt_valid_or_test = self.utils_cl.format_data_for_classifier( df_valid_or_test, cluster_and_values_valid_or_test, "days_valid_or_test") (_, res) = 
            data_fmt_train, data_fmt_valid_or_test, df_train,
            df_valid_or_test, l_ref, cluster_and_values_train, n_clus_found,
            mean_day, fit, classifier)

        # Extract results
        # For the simple algo (no oracle)
        (mses_non_proba, mses_proba, mean_mse_mean_day) = res[False][0]
        (mae_non_proba, mae_proba, mean_mae_mean_day) = res[False][1]
        (mase_non_proba, mase_proba, mean_mase_mean_day) = res[False][2]
        classifier_acc = res[False][3]
        (std_mse_non_proba, std_mse_proba, std_mse_mean_day) = res[False][4]
        (std_mae_non_proba, std_mae_proba, std_mae_mean_day) = res[False][5]
        (std_mase_non_proba, std_mase_proba, std_mase_mean_day) = res[False][6]

        # Then for the oracle algo
        (mse_non_proba_or, _, _) = res[True][0]
        (mae_non_proba_or, _, _) = res[True][1]
        (mase_non_proba_or, _, _) = res[True][2]
        (std_mse_non_proba_or, _, _) = res[True][4]
        (std_mae_non_proba_or, _, _) = res[True][5]
        (std_mase_non_proba_or, _, _) = res[True][6]

        return (((mses_non_proba, mses_proba, mae_non_proba, mae_proba,
                  mase_non_proba, mase_proba),
                 (mse_non_proba_or, mae_non_proba_or, mase_non_proba_or),
                 (mean_mse_mean_day, mean_mae_mean_day, mean_mase_mean_day),
                 classifier_acc,
                 (std_mse_non_proba, std_mse_proba, std_mse_mean_day,
                  std_mae_non_proba, std_mae_proba, std_mae_mean_day,
                  std_mase_non_proba, std_mase_proba, std_mase_mean_day),
                 (std_mse_non_proba_or, std_mae_non_proba_or,
                  std_mase_non_proba_or)))

    def clustering_wrapper(self, func_and_label, days_train,
                           days_valid_or_test, mean_day_train, l_ref,
                           file_name, nb_cluster, fit, classifier):
        func, wlabel = func_and_label
        logging.info(
            "Clustering wrapper: process %s predictions for file %s with %s classifier",
            wlabel, file_name, classifier)

        # TRAIN: train the clustering model
        (clustering, clusters_train) = func(days_train, nb_cluster)
        cc_train = self.cm.create_c_a_v(days_train, clusters_train)

        # TEST: apply the clustering model previously trained
        clusters_test = self.cm.apply_clustering(days_valid_or_test,
                                                 clustering)
        cc_valid_or_test = self.cm.create_c_a_v(days_valid_or_test,
                                                clusters_test)

        cav_clustering_train, _ = self.cm.format_clust_results(
            nb_cluster, cc_train)
        self.fh.write_cav_on_disk(cav_clustering_train, file_name, nb_cluster,
                                  wlabel)
        cav_clustering_valid_or_test, _ = self.cm.format_clust_results(
            nb_cluster, cc_valid_or_test)

        data_fmt_clustering_train = self.utils_cl.format_data_for_classifier(
            days_train, cav_clustering_train, "days_train")
        data_fmt_clustering_valid_or_test = self.utils_cl.format_data_for_classifier(
            days_valid_or_test, cav_clustering_valid_or_test,
            "days_valid_or_test")

        (empty_cluster_found, res) = self.launch_preds(
            data_fmt_clustering_train, data_fmt_clustering_valid_or_test,
            days_train, days_valid_or_test, l_ref, cav_clustering_train,
            nb_cluster, mean_day_train, fit, classifier)

        # For the simple algo (no oracle)
        (mses_non_proba, mses_proba, mean_mse_mean_day) = res[False][0]
        (mae_non_proba, mae_proba, mean_mae_mean_day) = res[False][1]
        (mase_non_proba, mase_proba, mean_mase_mean_day) = res[False][2]
        classifier_acc = res[False][3]
        (std_mse_non_proba, std_mse_proba, std_mse_mean_day) = res[False][4]
        (std_mae_non_proba, std_mae_proba, std_mae_mean_day) = res[False][5]
        (std_mase_non_proba, std_mase_proba, std_mase_mean_day) = res[False][6]

        # Then for the oracle algo
        (mse_non_proba_or, _, _) = res[True][0]
        (mae_non_proba_or, _, _) = res[True][1]
        (mase_non_proba_or, _, _) = res[True][2]
        (std_mse_non_proba_or, _, _) = res[True][4]
        (std_mae_non_proba_or, _, _) = res[True][5]
        (std_mase_non_proba_or, _, _) = res[True][6]

        return (empty_cluster_found,
                ((mses_non_proba, mses_proba, mae_non_proba, mae_proba,
                  mase_non_proba, mase_proba),
                 (mse_non_proba_or, mae_non_proba_or, mase_non_proba_or),
                 (mean_mse_mean_day, mean_mae_mean_day, mean_mase_mean_day),
                 classifier_acc,
                 (std_mse_non_proba, std_mse_proba, std_mse_mean_day,
                  std_mae_non_proba, std_mae_proba, std_mae_mean_day,
                  std_mase_non_proba, std_mase_proba, std_mase_mean_day),
                 (std_mse_non_proba_or, std_mae_non_proba_or,
                  std_mase_non_proba_or)))
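# --- Illustrative sketch (not part of the original module) ---
# The column gymnastics in Wrappers.launch_preds() turns a scikit-learn-style
# predict_proba() matrix into "Proby<cluster>" columns plus a "Predictedy"
# column holding the most probable class. A minimal, self-contained
# reproduction with a toy classifier (LogisticRegression is an assumption
# here; any estimator exposing classes_ and predict_proba would do):
import pandas as pd
from sklearn.linear_model import LogisticRegression

X_train = [[0.0], [0.1], [0.9], [1.0]]
y_train = ["clusterA", "clusterA", "clusterB", "clusterB"]
clf = LogisticRegression().fit(X_train, y_train)

X_test = [[0.05], [0.95]]
y_pred = pd.DataFrame(clf.predict_proba(X_test), columns=clf.classes_)
y_pred["Predictedy"] = y_pred.idxmax(axis=1)  # most probable class per row
y_pred.columns = ["Proby" + str(col) for col in y_pred.columns]
y_pred.rename(columns={"ProbyPredictedy": "Predictedy"}, inplace=True)
# y_pred now has columns: ProbyclusterA, ProbyclusterB, Predictedy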
class ECMLMachine:
    def __init__(self, file_name):
        self.utils_cl = MyUtils()
        self.this_file_dir = os.path.dirname(os.path.realpath(__file__))
        self.fh = FileHelper()
        self.hm = HmmMachine()
        self.cm = ClusteringMachine()
        self.pm = PredictMachine()
        self.my_metric = "euclidean"
        self.file_name = file_name
        self.out_fcasts_f = os.path.join(self.this_file_dir, "res", "fcasts",
                                         file_name, "ecml")
        self.fh.ensure_dirs_exist([self.out_fcasts_f])
        logging.info("Instantiated ECML_operator")

    def get_results_univ(self, df, mean_day, n_pt_one_period,
                         n_serie_concatenated=1):
        # Create datasets in the format we need
        logging.info("Computing ECML results")
        logging.info("Find the best number of clusters")
        (df_train, df_valid,
         df_test) = self.utils_cl.app_valid_test(df, n_pt_one_period)
        (df_train_compar,
         df_test_compar) = self.utils_cl.app_test(df, n_pt_one_period)
        last_day = df_train_compar["val_"][-len(df_test_compar):]

        # Run the algorithms on df_valid to find the best number of k-means
        # clusters to use
        mean_mse = self.do_it(
            [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 60, 70, 80, 100, 150, 200],
            df_train, df_valid, mean_day, n_pt_one_period)
        best_num_of_kmean = min(mean_mse, key=lambda t: t[1])[0]
        logging.info("Found best number of clusters: %s", best_num_of_kmean)

        # Run the algorithms on df_test with the best k found previously
        res = self.do_it([best_num_of_kmean], df_train, df_test, mean_day,
                         n_pt_one_period)

        # Retrieve results
        (mse_colin, mse_fake, mse_mean) = (res[0][1], res[0][2], res[0][3])
        (mae_colin, mae_fake, mae_mean) = (res[0][4], res[0][5], res[0][6])
        (mase_colin, mase_fake, mase_mean) = (res[0][7], res[0][8], res[0][9])
        (std_mse_colin, std_mse_fake,
         std_mse_mean) = (res[0][10], res[0][11], res[0][12])
        (std_mae_colin, std_mae_fake,
         std_mae_mean) = (res[0][13], res[0][14], res[0][15])
        (std_mase_colin, std_mase_fake,
         std_mase_mean) = (res[0][16], res[0][17], res[0][18])
        logging.info("Retrieved results")

        # Compute baselines
        logging.info("Compute baselines")
        (_, mse_ar_error, mae_ar_error,
         mase_ar_error) = self.pm.do_forecast_ar_model(
             last_day, df_train_compar["val_"], df_test_compar["val_"])
        (mse_hw, mae_hw, mase_hw) = -1, -1, -1

        return (mse_colin, mse_fake, mse_mean, mae_colin, mae_fake, mae_mean,
                mase_colin, mase_fake, mase_mean, mse_ar_error, mae_ar_error,
                mase_ar_error, mse_hw, mae_hw, mase_hw, best_num_of_kmean,
                std_mse_colin, std_mse_fake, std_mse_mean, std_mae_colin,
                std_mae_fake, std_mae_mean, std_mase_colin, std_mase_fake,
                std_mase_mean)

    def do_it(self, km_sizes, df_train, df_valid_ou_test, mean_day,
              n_pt_one_period, n_serie_concatenated=1):
        mean_mse = []
        for my_km_size in km_sizes:
            logging.info("ECML with km_size of %s", my_km_size)
            n_day_found_train = len(df_train["n_day_"].unique())
            logging.debug("There are %s days in train", n_day_found_train)

            #################################
            # I. TRAIN
            #################################
            (km, comprehensive_clusters_df_train) = self.cm.do_kmeans_wrapper(
                df_train, km_size=my_km_size)

            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # HMM
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Compute the raw HMM sequences
            raw_hmm_data_df_train = list(
                self.hm.compute_raw_hmm(comprehensive_clusters_df_train,
                                        order=1))
            # Compute the transition matrix
            transition_mat = self.hm.compute_hmm_transition_mat_1d(
                raw_hmm_data_df_train, my_km_size)

            #################################
            # II. VALID
            #################################
            # 1. Apply the k-means model on the df_valid data
            y_pred = self.cm.apply_clustering(df_valid_ou_test, km)

            # Create tuples of known/wanted values:
            # a) for the TS data itself
            s_arr = []
            for c in range(df_valid_ou_test["n_day_"].min() + 1,
                           df_valid_ou_test["n_day_"].max() - 1):
                s_arr.append(
                    (df_valid_ou_test[df_valid_ou_test["n_day_"] == c - 1]["val_"].values,
                     df_valid_ou_test[df_valid_ou_test["n_day_"] == c]["val_"].values,
                     df_valid_ou_test[df_valid_ou_test["n_day_"] <= c]["val_"].values))

            # b) for the TS labels
            s_l_arr = []
            for c in range(1, len(y_pred) - 1):
                s_l_arr.append((y_pred[c - 1], y_pred[c]))

            precision_colin = []
            precision_fake = []
            precision_mean = []
            precision_colin_mae = []
            precision_fake_mae = []
            precision_mean_mae = []
            precision_colin_mase = []
            precision_fake_mase = []
            precision_mean_mase = []

            # Compute predictions and errors
            for count in range(0, len(s_arr)):
                path_count = os.path.join(
                    self.out_fcasts_f,
                    str(count) + "_fcast_" + str(my_km_size) + "_kmsize.csv")
                (known, guess, before_w) = s_arr[count]
                known_w = np.pad(known, (0, len(before_w) - len(known)),
                                 "constant", constant_values=-42.42)
                guess_w = np.pad(guess, (0, len(before_w) - len(guess)),
                                 "constant", constant_values=-42.42)
                (known_l, guess_l) = s_l_arr[count]
                pd.DataFrame({
                    "known": known_w[:],
                    "guess": guess_w[:],
                    "before": before_w[:]
                }).to_csv(path_count, sep=";", index_label=False, index=False)

                pred = self.pm.predict_median_hmm(known_l, transition_mat, km)
                pred_fake = self.pm.predict_median_hmm(
                    known_l, transition_mat, km,
                    real_class_of_following_day=guess_l)

                precision_colin.append(self.utils_cl.compute_mse(guess, pred))
                precision_fake.append(
                    self.utils_cl.compute_mse(guess, pred_fake))
                precision_mean.append(
                    self.utils_cl.compute_mse(guess, mean_day))
                precision_colin_mae.append(
                    self.utils_cl.compute_mae(guess, pred))
                precision_fake_mae.append(
                    self.utils_cl.compute_mae(guess, pred_fake))
                precision_mean_mae.append(
                    self.utils_cl.compute_mae(guess, mean_day))
                precision_colin_mase.append(
                    self.utils_cl.compute_mase(known, guess, pred))
                precision_fake_mase.append(
                    self.utils_cl.compute_mase(known, guess, pred_fake))
                precision_mean_mase.append(
                    self.utils_cl.compute_mase(known, guess, mean_day))

            mean_mse.append(
                (my_km_size, np.mean(precision_colin), np.mean(precision_fake),
                 np.mean(precision_mean), np.mean(precision_colin_mae),
                 np.mean(precision_fake_mae), np.mean(precision_mean_mae),
                 np.mean(precision_colin_mase), np.mean(precision_fake_mase),
                 np.mean(precision_mean_mase), np.std(precision_colin),
                 np.std(precision_fake), np.std(precision_mean),
                 np.std(precision_colin_mae), np.std(precision_fake_mae),
                 np.std(precision_mean_mae), np.std(precision_colin_mase),
                 np.std(precision_fake_mase), np.std(precision_mean_mase)))
        return mean_mse
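# --- Illustrative sketch (not part of the original module) ---
# do_it() builds, for each day c, a tuple (yesterday's values, today's values,
# all values up to today) keyed on the "n_day_" column. A toy reproduction of
# that windowing, assuming a long-format frame with integer "n_day_" and
# "val_" columns:
import pandas as pd

toy = pd.DataFrame({
    "n_day_": [0, 0, 1, 1, 2, 2, 3, 3],
    "val_":   [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
})
s_arr = []
for c in range(toy["n_day_"].min() + 1, toy["n_day_"].max() - 1):
    s_arr.append((
        toy[toy["n_day_"] == c - 1]["val_"].values,  # known: previous day
        toy[toy["n_day_"] == c]["val_"].values,      # guess: day to predict
        toy[toy["n_day_"] <= c]["val_"].values,      # before: history so far
    ))
# s_arr[0] == (array([1., 2.]), array([3., 4.]), array([1., 2., 3., 4.]))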
# A bunch of cleanups for empty files
from file_helper import FileHelper
fh = FileHelper(tstmp)
fh.clean_zips_folder()
fh.clean_res_folder(out_dir_res)
fh.ensure_dirs_exist(
    [out_path_res, in_dir, out_dir_res, out_dir_khiops, out_dir_fcasts])

# After cleaning, zip the code that is being executed now
fh.zip_code()

# Init objects
from khiops import KhiopsManager
km = KhiopsManager()

from my_utils import MyUtils
utils = MyUtils()

from ecml_machine import ECMLMachine
from clustering_machine import ClusteringMachine
cm = ClusteringMachine()

from wrappers import Wrappers
wrapper = Wrappers(km, fh)

# Read the configuration file: which input data to use?
import json
with open(os.path.join(this_file_dir, "conf", "conf_lite.json"), 'r') as f:
    confs = json.load(f)

# Init res files
class KhiopsManager:
    def __init__(self):
        logging.info(pk.getKhiopsInfo())
        self.this_file_dir = os.path.dirname(os.path.realpath(__file__))

        # Path management: use the timestamp of each execution in the paths
        self.fh = FileHelper()
        self.dictionary_file = os.path.join(self.this_file_dir, "dic",
                                            "series.kdic")
        self.classif_res = os.path.join(self.this_file_dir, "res",
                                        "khiops_res", "classif")
        self.coclus_res = os.path.join(self.this_file_dir, "res",
                                       "khiops_res", "coclus")
        self.pred_res = os.path.join(self.this_file_dir, "res", "khiops_res",
                                     "pred_res")
        self.fh.ensure_dirs_exist([
            self.dictionary_file, self.classif_res, self.coclus_res,
            self.pred_res
        ])
        self.ccr = CoclusteringResults()
        self.utils = MyUtils()
        logging.info("Khiops manager instantiated")
        logging.info("dictionary_file used: %s", self.dictionary_file)

    """KHIOPS COCLUSTERING TRAIN AND SIMPLIFICATIONS"""

    def train_coclustering(self, f):
        """
        Train a coclustering model in the simplest way possible.
        """
        file_name = self.fh.get_file_name(f)
        logging.info("Train of coclustering for file %s", file_name)

        # Train a coclustering model for the variables "time_", "n_day_" and "val_"
        pk.trainCoclustering(dictionaryFile=self.dictionary_file,
                             dictionary="train",
                             dataTable=f,
                             coclusteringVariables=["time_", "n_day_", "val_"],
                             resultsDir=self.coclus_res,
                             fieldSeparator=";",
                             samplePercentage=100,
                             resultsPrefix=file_name + "_")

    def simplify_coclustering(self, file_name, mpi=100, mcn=999999):
        """
        Simplify a coclustering model in the simplest way possible.
        """
        base_path = os.path.join(self.coclus_res, file_name)
        self.fh.ensure_dirs_exist([base_path])
        cf = os.path.join(self.coclus_res, file_name + "_Coclustering.khc")
        logging.info("Simplify coclustering for file %s", file_name)
        logging.info("MCN=%s, MPI=%s", mcn, mpi)
        if mcn != 999999:
            scf = file_name + "_Simplified-" + str(mcn) + ".khc"
        else:
            scf = file_name + "_Simplified-" + str(mpi) + ".khc"
        logging.info("scf=%s", scf)
        pk.simplifyCoclustering(
            coclusteringFile=cf,
            simplifiedCoclusteringFile=scf,
            resultsDir=base_path,
            maxCellNumber=mcn,
            maxPreservedInformation=mpi,
        )
        return str(os.path.join(base_path, scf)).replace(".khc", ".json")

    def deploy_coclustering(self, file_name, identifier):
        """
        Deploy a coclustering model.
        """
        base_path = os.path.join(self.coclus_res, file_name)
        self.fh.ensure_dirs_exist([base_path])
        logging.info("Deploy coclustering for file %s", file_name)
        scf = os.path.join(
            self.coclus_res, file_name,
            file_name + "_Simplified-" + str(identifier) + ".khc")
        dep_prefix = file_name + "_Deployed-" + str(identifier) + "-"
        pk.prepareCoclusteringDeployment(dictionaryFile=self.dictionary_file,
                                         dictionary="root",
                                         coclusteringFile=scf,
                                         tableVariable="secondary",
                                         deployedVariable="n_day_",
                                         buildDistanceVariables=True,
                                         resultsPrefix=dep_prefix,
                                         resultsDir=base_path)
        return os.path.join(base_path, dep_prefix + "Coclustering.kdic")

    def transfer_database(self, d, deployed_path, f, identifier):
        """
        Transfer a database through a coclustering model to actually use it.
        """
        file_name = self.fh.get_file_name(f)
        out_path = os.path.join(
            self.coclus_res, file_name,
            file_name + "_Transf-" + str(identifier) + ".csv")
        logging.info("Transfer of coclustering for file %s into %s",
                     file_name, out_path)
        pk.transferDatabase(dictionaryFile=d,
                            dictionary="root",
                            dataTable=deployed_path,
                            additionalDataTables={"root`secondary": f},
                            fieldSeparator=";",
                            outputFieldSeparator=";",
                            outputDataTable=out_path)
        return out_path

    """JSON COCLUSTERING MANIPULATIONS"""

    def get_clusters(self, path):
        """
        From a given Khiops json file, extract clusters and detailed
        information about them.

        Returns:
        * zips: more info about each cluster, zipped together (i.e. all tuples
          that go together, 'aligned') => tuples
          (clus, values, value_frequencies, value_typicalities)
        * cluster_and_values: dict(cluster_num => associated values)
        """
        cluster_and_values = {}
        zips = []
        with open(path) as f:
            data = json.load(f)["coclusteringReport"]["dimensionPartitions"]
        n_days = list(filter(lambda d: d['name'] in "n_day_", data))
        for idx, n_day in enumerate(n_days[0]["valueGroups"], start=1):
            # Ugly workaround to manipulate floats and ints
            # see https://goo.gl/8tYfhn
            values = list(map(int, map(float, n_day["values"])))
            clus = n_day["cluster"]
            value_frequencies = n_day["valueFrequencies"]
            value_typicalities = n_day["valueTypicalities"]
            zips.append(
                list(zip(clus, values, value_frequencies,
                         value_typicalities)))
            cluster_and_values[clus] = values
        return self.utils.flattify(zips), cluster_and_values

    @staticmethod
    def get_clusters_from_dep(path):
        """
        From a given Khiops transferred file, extract clusters and detailed
        information about them.

        Returns:
        * cluster_and_values: dict(cluster_num => associated values)
        """
        with open(path) as f:
            df = pd.read_csv(f, sep=";")
        k = df["n_day_PredictedLabel"].unique()
        cluster_and_values = dict((key, []) for key in k)
        for index, row in df.iterrows():
            n_day_ = row["n_day_"]
            clus = row["n_day_PredictedLabel"]
            cluster_and_values[clus].append(n_day_)
        return cluster_and_values

    def get_cells_number(self, file_name, mpi=None):
        """
        From a given Khiops json file, extract the number of cells across all
        dimensions.

        Returns:
        - cells
        """
        if mpi is not None:
            p = os.path.join(self.coclus_res, file_name,
                             file_name + "_Simplified-" + str(mpi) + ".json")
        else:
            p = os.path.join(self.coclus_res,
                             file_name + "_Coclustering.json")
        with open(p) as f:
            cells = json.load(f)["coclusteringReport"]["summary"]["cells"]
        return int(cells)

    def get_cluster_number(self, file_name, ref_id=None):
        """
        From a given Khiops json file, extract the number of clusters for the
        dimension "n_day_".

        Returns:
        - nb_of_cluster_found
        """
        if ref_id is not None:
            p = os.path.join(
                self.coclus_res, file_name,
                file_name + "_Simplified-" + str(ref_id) + ".json")
        else:
            p = os.path.join(self.coclus_res,
                             file_name + "_Coclustering.json")
        with open(p) as f:
            data = json.load(f)["coclusteringReport"]["dimensionSummaries"]
        n_days = list(filter(lambda d: d['name'] in "n_day_", data))
        if not n_days:
            return False
        return int(n_days[0]["parts"])

    """UTILS"""

    @staticmethod
    def _compute_accuracy(len_y_pred, y_pred, y_pred_target):
        """
        Compute the accuracy of a classifier.
        """
        c = 0
        for i in range(0, len_y_pred):
            pred_group_day_ahead = str(y_pred.iloc[i]["Predictedy"])
            real_group_day_ahead = str(y_pred_target.values[i])
            if pred_group_day_ahead != real_group_day_ahead:
                c = c + 1
        train_acc = 100 - c * 100 / len_y_pred
        logging.debug("%s errors over %s values", c, len_y_pred)
        logging.debug("That is %s percent accuracy", train_acc)
        return train_acc

    @staticmethod
    def __get_typicalitie(n_day, my_zip):
        """
        From a list of tuples my_zip, find the tuple that concerns n_day and
        retrieve its valTyp.

        PARAMETERS
        ------------------------
        - my_zip: tuples (group, n_day, valFreq, valTyp)
        - n_day: int
        """
        items = [i for i in my_zip if n_day == i[1]]
        return items[0][3]

    @staticmethod
    def compute_centroids(c_a_v, df, l_ref):
        """
        Compute mean days and centroids for the given values.

        PARAMETERS
        ------------------------
        * c_a_v: dict(cluster_num => associated values)
        * df: dataset studied
        * l_ref: length of one reference day

        RETURNS
        -------------------------
        * centroids: dict(cluster_id: int => centroid: [])
        """
        centroids = {}
        e = False
        for k, v in c_a_v.items():
            # If there is at least one day in the considered cluster
            if len(v) != 0:
                logging.info("Computing cluster %s centroid", k)
                centroids[k] = df[df['n_day_'].isin(map(str, v))].groupby(
                    'time_')['val_'].agg('mean').values
            # Otherwise, mean day = 0
            else:
                logging.info("Cluster %s is empty", k)
                e = True
                centroids[k] = [0] * l_ref
        logging.info("Keys of centroids are %s", list(centroids.keys()))
        logging.debug("Centroids are %s", centroids)
        return centroids, e

    def process_pred_proba(self,
                           y_pred,
                           y_pred_target,
                           clusts_names,
                           centroids,
                           days_to_predict,
                           nb_cluster_found,
                           n_pt_per_day,
                           mean_day,
                           o=False):
        """
        Process the results of the Khiops classification step to create
        predictions and compute MSEs.

        If Oracle mode is True, a mock is used for the classifier's Y
        (i.e. the classifier knows the Y column exactly).

        WHAT IT DOES
        ------------------------
        * First (A): compute the classifier accuracy against the known
          y_pred_target.
        * Second (B): process the probabilistic classifier results, but ignore
          the probabilities and use only the most probable class for
          day n + 1. NPA
        * Third (C): process the probabilistic classifier results using the
          probabilities assigned to each cluster (e.g. day n + 1 could be 10%
          in cluster 1, 50% in cluster 2, etc.). Use these weights to sum the
          centroids and produce the prediction. PA
        * Last (D): wrap up and average the results.

        PARAMETERS
        ------------------------
        * y_pred: result matrix from the classifier
        * y_pred_target: known targets of the test data
        * clusts_names: names of all clusters
        * centroids: cluster centroids
        * days_to_predict: list of days used for test
        * nb_cluster_found: value extracted from the Khiops coclustering file;
          used to parameterize loops and computations
        * n_pt_per_day: number of points per day (24 if there is one point per
          hour)
        * mean_day: pre-computed mean day over the whole training set
        * o: are we in oracle mode? ('Y' known)

        RETURNS
        -------------------------
        * Tuple(mean_mse_non_proba, mean_mse_proba, classifier_acc,
          mean_mse_mean_day) => averaged MSEs!
        """
        len_y_pred = len(y_pred)

        # (A): Compute classifier accuracy
        classifier_acc = self._compute_accuracy(len_y_pred, y_pred,
                                                y_pred_target)
        if classifier_acc == 0.0:
            logging.info("This classifier is really not good and failed "
                         "every single prediction")

        # Init empty arrays
        mses_non_proba = []
        mses_proba = []
        mses_mean_day = []
        mae_non_proba = []
        mae_proba = []
        mae_mean_day = []
        mase_non_proba = []
        mase_proba = []
        mase_mean_day = []

        # Remove the last day, which has no day after it
        n_days_ = days_to_predict["n_day_"].unique()[:-1]
        # Also remove the first day, which has no day before it.
        # Wrangle the data to get the right types.
        n_days_int = list(map(int, n_days_))
        n_days_int.remove(min(n_days_int))
        n_days_ = list(map(str, n_days_int))

        for ref_in_classif_ensemble, z in enumerate(n_days_):
            # (B): NPA approach
            indice_today = str(int(z) - 1)
            today_vals = days_to_predict[days_to_predict["n_day_"] ==
                                         indice_today]["val_"].values
            tomor_vals = days_to_predict[days_to_predict["n_day_"] ==
                                         z]["val_"].values
            tom_class_pred_cluster_y = y_pred.iloc[ref_in_classif_ensemble][
                "Predictedy"]
            tom_pred_y = centroids[str(tom_class_pred_cluster_y)]

            # Append errors
            mses_non_proba.append(
                self.utils.compute_mse(tomor_vals, tom_pred_y))
            mses_mean_day.append(self.utils.compute_mse(tomor_vals, mean_day))
            mae_non_proba.append(
                self.utils.compute_mae(tomor_vals, tom_pred_y))
            mae_mean_day.append(self.utils.compute_mae(tomor_vals, mean_day))
            mase_non_proba.append(
                self.utils.compute_mase(today_vals, tomor_vals, tom_pred_y))
            mase_mean_day.append(
                self.utils.compute_mase(today_vals, tomor_vals, mean_day))

            mean_days_pond = []
            p_tot = 0

            # (C): PA approach
            for c in clusts_names:
                try:
                    # Get the probability of being in cluster "c"
                    p = y_pred.iloc[ref_in_classif_ensemble]["Proby" + str(c)]
                    # p_tot variable kept for debug purposes
                    p_tot = p_tot + p
                except KeyError:
                    logging.debug("Did not find %s in the prediction matrix",
                                  c)
                    p = 0
                # Retrieve the mean day of cluster c
                day_pond = centroids[c].copy()
                # Scale it according to the probability
                day_pond[:] = [x * p for x in day_pond]
                mean_days_pond.append(day_pond)
            logging.debug("p_tot is %s", p_tot)  # Here it should be 100.

            # Sum everything up to create the prediction
            tomo_pred_with_pond = [0] * n_pt_per_day
            for d_p in mean_days_pond:
                for idx, e in enumerate(d_p):
                    tomo_pred_with_pond[idx] = tomo_pred_with_pond[idx] + e

            mses_proba.append(
                self.utils.compute_mse(tomor_vals, tomo_pred_with_pond))
            mae_proba.append(
                self.utils.compute_mae(tomor_vals, tomo_pred_with_pond))
            mase_proba.append(
                self.utils.compute_mase(today_vals, tomor_vals,
                                        tomo_pred_with_pond))

        # (D): Wrap up: average the errors (so we end up with Meaned Mean
        # Square Errors)
        # MMSES
        mean_mse_non_proba = np.mean(mses_non_proba)
        mean_mse_proba = np.mean(mses_proba)
        mean_mse_mean_day = np.mean(mses_mean_day)
        mean_mae_non_proba = np.mean(mae_non_proba)
        mean_mae_proba = np.mean(mae_proba)
        mean_mae_mean_day = np.mean(mae_mean_day)
        mean_mase_non_proba = np.mean(mase_non_proba)
        mean_mase_proba = np.mean(mase_proba)
        mean_mase_mean_day = np.mean(mase_mean_day)
        # MSTD
        std_mse_non_proba = np.std(mses_non_proba)
        std_mse_proba = np.std(mses_proba)
        std_mse_mean_day = np.std(mses_mean_day)
        std_mae_non_proba = np.std(mae_non_proba)
        std_mae_proba = np.std(mae_proba)
        std_mae_mean_day = np.std(mae_mean_day)
        std_mase_non_proba = np.std(mase_non_proba)
        std_mase_proba = np.std(mase_proba)
        std_mase_mean_day = np.std(mase_mean_day)

        return ((mean_mse_non_proba, mean_mse_proba, mean_mse_mean_day),
                (mean_mae_non_proba, mean_mae_proba, mean_mae_mean_day),
                (mean_mase_non_proba, mean_mase_proba, mean_mase_mean_day),
                classifier_acc,
                (std_mse_non_proba, std_mse_proba, std_mse_mean_day),
                (std_mae_non_proba, std_mae_proba, std_mae_mean_day),
                (std_mase_non_proba, std_mase_proba, std_mase_mean_day))
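# --- Illustrative sketch (not part of the original module) ---
# The PA step (C) in process_pred_proba() builds tomorrow's forecast as a
# probability-weighted sum of the cluster centroids, while the NPA step (B)
# simply takes the centroid of the most probable cluster. A minimal,
# self-contained version with NumPy (toy centroids and probabilities):
import numpy as np

centroids = {
    "clusterA": np.array([1.0, 1.0, 1.0, 1.0]),  # mean day of cluster A
    "clusterB": np.array([3.0, 3.0, 3.0, 3.0]),  # mean day of cluster B
}
probas = {"clusterA": 0.25, "clusterB": 0.75}    # classifier output for day n+1

# PA: weighted sum of the centroids
pa_prediction = sum(p * centroids[c] for c, p in probas.items())
# -> array([2.5, 2.5, 2.5, 2.5])

# NPA: centroid of the most probable cluster only
npa_prediction = centroids[max(probas, key=probas.get)]
# -> array([3., 3., 3., 3.])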
# Columns for one-hot encoding
OHE_COLUMNS = ['sales', 'salary']

# Select the evaluation metric
# metrics: accuracy, precision, recall, f1, auc
metrics = 'accuracy'
# metrics = 'precision'
# metrics = 'recall'
# metrics = 'f1'
# metrics = 'auc'

selector_method = 'RFE'
# selector_method = 'PCA'

myutils = MyUtils('./param_clf.yml', OBJ_TYPE, metrics)
print(myutils.get_str_timestamp())

# mode = 'train'
mode = 'score'

if mode == 'train':
    # Load data
    _, X, y = myutils.read_data(TRAIN_PATH)
    # Preprocessing
    X = myutils.train_data__preprocessing_with_imputer(X, OHE_COLUMNS)
    # Initial search
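# --- Illustrative sketch (not part of the original script) ---
# selector_method = 'RFE' presumably refers to recursive feature elimination;
# a minimal example of what such a selector does, using scikit-learn's RFE
# with a LogisticRegression base estimator on toy data (all names below are
# assumptions, not the script's own helpers):
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X_toy, y_toy = make_classification(n_samples=200, n_features=10,
                                   random_state=0)
selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5)
selector.fit(X_toy, y_toy)
print(selector.support_)               # boolean mask of the 5 features kept
X_reduced = selector.transform(X_toy)  # shape (200, 5)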
class HmmMachine:
    def __init__(self):
        self.utils_cl = MyUtils()
        logging.info("HMM machine instantiated")

    def compute_raw_hmm(self, tuple_array, order=1):
        """
        From an array of tuples ((srv1, 2), (srv2, 5), (srv3, 4), ...):
        * sort it,
        * create a List(2, 5, 4, ...),
        * compute the HMM windows (2, 5), (5, 4), ...
        """
        # flatmap + keep only the second element of each tuple; sorting is
        # needed to exhibit the sequences of days!
        iterable = [x[1] for x in self.utils_cl.my_sort_func(tuple_array, 0)]
        i = iter(iterable)
        win = []
        for e in range(0, order + 1):
            win.append(next(i))
        yield win
        for e in i:
            win = win[1:] + [e]
            yield win

    @staticmethod
    def compute_hmm_transition_mat_1d(raw_hmm_data, n_cluster):
        """
        Knowing the number of clusters in the KNN and the numeric results of
        the KNN, compute the HMM transition matrix, i.e. the percentages of
        going from one state to another.
        """
        order = len(raw_hmm_data[0]) - 1
        pow2 = int(n_cluster * n_cluster)
        pow3 = int(n_cluster * n_cluster * n_cluster)
        if order == 1:
            d = pd.DataFrame(0,
                             index=np.arange(n_cluster),
                             columns=range(0, n_cluster))
            for tup in raw_hmm_data:
                d.at[tup[0], tup[1]] = d.at[tup[0], tup[1]] + 1
            # Sum of each line
            sum_col = d.sum(1)
            for i in range(0, n_cluster):
                for j in range(0, n_cluster):
                    if d.at[i, j] != 0 and sum_col[i] != 0:
                        perc = 100 * (d.at[i, j] / sum_col[i])
                    else:
                        perc = 0
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        elif order == 2:
            d = pd.DataFrame(0,
                             index=np.arange(pow2),
                             columns=range(0, n_cluster))
            # First accumulate the raw counts
            for tup in raw_hmm_data:
                d.at[tup[0] * n_cluster + tup[1],
                     tup[2]] = d.at[tup[0] * n_cluster + tup[1], tup[2]] + 1
            sum_col = d.sum(1)
            # Then turn the counts into percentages
            for i in range(0, pow2):
                for j in range(0, n_cluster):
                    if d.at[i, j] != 0 and sum_col[i] != 0:
                        perc = 100 * (d.at[i, j] / sum_col[i])
                    else:
                        perc = 0
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        elif order == 3:
            d = pd.DataFrame(0,
                             index=np.arange(pow3),
                             columns=range(0, n_cluster))
            for tup in raw_hmm_data:
                d.at[tup[0] * pow2 + tup[1] * n_cluster + tup[2],
                     tup[3]] = d.at[tup[0] * pow2 + tup[1] * n_cluster +
                                    tup[2], tup[3]] + 1
            sum_col = d.sum(1)
            for i in range(0, pow3):
                for j in range(0, n_cluster):
                    if d.at[i, j] != 0 and sum_col[i] != 0:
                        perc = 100 * (d.at[i, j] / sum_col[i])
                    else:
                        perc = 0
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        return d

    @staticmethod
    def compute_hmm_transition_mat_2d(raw_hmm_data1, raw_hmm_data2, n_cluster):
        """
        Knowing the number of clusters in the KNN and the numeric results of
        the KNN, compute the HMM transition matrix, i.e. the percentages of
        going from one state to another. Two-dimensional variant (uses the
        sequences of two TS to bring more information to the predictions).
        """
        order = len(raw_hmm_data1[0]) - 1
        pow2 = int(n_cluster * n_cluster)
        pow3 = int(n_cluster * n_cluster * n_cluster)
        pow4 = int(n_cluster * n_cluster * n_cluster * n_cluster)
        raw_hmm_data = list(zip(raw_hmm_data1, raw_hmm_data2))
        if order == 1:
            d = pd.DataFrame(0, index=np.arange(pow2), columns=range(0, pow2))
            for (tup1, tup2) in raw_hmm_data:
                d.at[tup1[0] * n_cluster + tup2[0],
                     tup1[1] * n_cluster + tup2[1]] = d.at[
                         tup1[0] * n_cluster + tup2[0],
                         tup1[1] * n_cluster + tup2[1]] + 1
            # Sum of each line
            sum_col = d.sum(1)
            for i in range(0, pow2):
                for j in range(0, pow2):
                    perc = 100 * (d.at[i, j] / sum_col[i])
                    if not math.isnan(perc):
                        d.at[i, j] = perc
                    else:
                        d.at[i, j] = 0
        return d
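# --- Illustrative sketch (not part of the original module) ---
# Assumed usage of HmmMachine: feed the per-day cluster labels in as
# (day, label) tuples, turn them into order-1 windows, then build the
# transition matrix in percent. MyUtils.my_sort_func is assumed to sort the
# tuples by their first element (the day index).
hm = HmmMachine()

labels = [(0, 1), (1, 0), (2, 1), (3, 1), (4, 0)]  # (n_day_, cluster label)
windows = list(hm.compute_raw_hmm(labels, order=1))
# windows == [[1, 0], [0, 1], [1, 1], [1, 0]]

transitions = hm.compute_hmm_transition_mat_1d(windows, n_cluster=2)
# transitions.at[1, 0] is ~66: two of the three transitions leaving state 1
# go to state 0.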