rv = expon(loc=loc1, scale=scale1)
rv2 = expon(loc=loc2, scale=scale2)
x = rv.rvs(size=1000)
x2 = rv2.rvs(size=1000)
# Flag a sample as an outlier when its density under the generating
# distribution falls below 0.01.
isOutlier = [expon.pdf(xi, loc=loc1, scale=scale1) < 0.01 for xi in x]
data = [[xi, isOutlieri] for xi, isOutlieri in zip(x, isOutlier)]
isOutlier2 = [expon.pdf(xi, loc=loc2, scale=scale2) < 0.01 for xi in x2]
data = data + [[xi, isOutlieri] for xi, isOutlieri in zip(x2, isOutlier2)]

fig, ax = plt.subplots(1, 1)
ax.hist([i[0] for i in data], density=True, histtype='stepfilled', alpha=0.2)
plt.show()
activitiesTimes.append(data)

# powerlognorm
c, s = 2.14, 0.446
rv = powerlognorm(c, s, scale=3, loc=5)
x = rv.rvs(size=2000)
isOutlier = [powerlognorm.pdf(xi, c=c, s=s, loc=5, scale=3) < 0.01 for xi in x]
data = [[xi, isOutlieri] for xi, isOutlieri in zip(x, isOutlier)]

fig, ax = plt.subplots(1, 1)
ax.hist([i[0] for i in data], density=True, histtype='stepfilled', alpha=0.2)
plt.show()
activitiesTimes.append(data)

# We have 4 types of events; now create traces until all events from the data
# are used.
minTrace, maxTrace = 5, 25
numberOfEvents = sum([len(i) for i in activitiesTimes])
traces = []
dataVectors = [[] for _ in range(len(activitiesTimes))]
while numberOfEvents > 0:
    eventsInTrace = random.randint(minTrace, maxTrace)
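
# A minimal, hypothetical helper (an addition, not part of the original
# script) factoring out the density-threshold outlier rule used above;
# `flag_outliers` is an assumed name, and any frozen scipy.stats
# distribution can be passed in.
def flag_outliers(samples, frozen_dist, threshold=0.01):
    """Return [value, is_outlier] pairs; a point is an outlier when its
    pdf under `frozen_dist` falls below `threshold`."""
    return [[xi, frozen_dist.pdf(xi) < threshold] for xi in samples]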
def all_dists():
    # Distribution parameters were taken from the scipy.stats official
    # documentation examples.
    # Total - 89
    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat": stats.gilbrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
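
# A short usage sketch (an addition, not part of the original module): draw a
# few samples from every frozen distribution returned by all_dists(); assumes
# `from scipy import stats` at the top of the file. `sample_all_dists` is a
# hypothetical name.
def sample_all_dists(n=5):
    samples = {}
    for name, frozen in all_dists().items():
        # Each value is a frozen rv object, so rvs() needs no extra parameters.
        samples[name] = frozen.rvs(size=n)
    return samples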
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import powerlognorm

# Shape parameters from the scipy.stats.powerlognorm documentation example.
c, s = 2.14, 0.446
fig, ax = plt.subplots(1, 1)

x = np.linspace(powerlognorm.ppf(0.01, c, s),
                powerlognorm.ppf(0.99, c, s), 100)
ax.plot(x, powerlognorm.pdf(x, c, s),
        'r-', lw=5, alpha=0.6, label='powerlognorm pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = powerlognorm(c, s)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = powerlognorm.ppf([0.001, 0.5, 0.999], c, s)
np.allclose([0.001, 0.5, 0.999], powerlognorm.cdf(vals, c, s))  # True

# Generate random numbers:
r = powerlognorm.rvs(c, s, size=1000)

# And compare the histogram (``density=True``; matplotlib has removed the
# old ``normed`` argument):
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
plt.show()
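
# A follow-up sketch (an addition, not part of the original example): recover
# the shape parameters from the sample above by maximum-likelihood fitting,
# pinning loc to 0 as in the generating call.
c_fit, s_fit, loc_fit, scale_fit = powerlognorm.fit(r, floc=0)
print(c_fit, s_fit, scale_fit)  # should land near c=2.14, s=0.446, scale=1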
def test_optimisation_RNAClassifier():
    # ----- Plot the distribution followed by the log of a variable drawn from
    # the powerlognorm distribution that we will choose for the alpha parameter -----
    fig, ax = plt.subplots(1, 1)
    c, s = 1, 1
    y = stats.powerlognorm.rvs(c, s, scale=0.01, size=10000)  # scale controls the centering.
    ax.hist(np.log10(y), bins=30)
    plt.show()

    # ----- The randomized search -----
    # Import the data
    excel_table = readMultipleCsv('names')
    # Base estimator, used in particular to select the solver to test
    base_estimator = MLPClassifier(solver="sgd", max_iter=1000, verbose=True)  # or solver="adam"
    hidden_layer_sizes = [tuple(np.random.randint(20, 35, np.random.randint(3, 5, 1)))
                          for i in range(500)]
    supprDoublon(hidden_layer_sizes)
    # Parameters to adjust depending on what we want to test
    param = {
        "alpha": stats.powerlognorm(1, 1, scale=0.01),
        "hidden_layer_sizes": hidden_layer_sizes,
        "activation": ["logistic", "tanh", "relu"],
        "learning_rate": ["constant", "invscaling", "adaptive"],
        "learning_rate_init": stats.powerlognorm(1, 1, scale=0.001),
        # batch_size: number of samples the estimator trains on per batch
        "batch_size": np.arange(200, 500, 10)
    }
    # "learning_rate" only applies to the sgd solver
    RNAClassifier_random_search(excel_table, base_estimator, param, 1)


def RNA_cross_val(data, base_estimator, n_jobs=-3):
    print("*** Data pre-processing ***")
    # Retrieve all usable data
    data_usable = BDDminimal(data)
    # Retrieve the features and labels
    x, y = featuresLabel(data_usable)
    print("[Split into training and test sets]")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y,
                                                        test_size=0.2, shuffle=True)
    # Scaling + encoding
    scaling(x_train, x_test)
    transformer = LabelEncoder()
    y_train_encode = transformer.fit_transform(y_train).ravel()
    print("[Preparing for cross-validation]")
    # Test/validation split
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    scores = cross_val_score(base_estimator, x_train, y_train_encode, cv=cv, n_jobs=n_jobs)
    print(scores)
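
# A self-contained sketch (an assumption about what RNAClassifier_random_search
# does internally, since its definition is not shown here): RandomizedSearchCV
# can sample "alpha" and "learning_rate_init" directly from the frozen
# scipy.stats distributions stored in the parameter dict.
def _demo_randomized_search():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import RandomizedSearchCV
    # Synthetic stand-in for the project's CSV data
    x, y = make_classification(n_samples=300, n_features=6, n_informative=4)
    param = {"alpha": stats.powerlognorm(1, 1, scale=0.01),
             "learning_rate_init": stats.powerlognorm(1, 1, scale=0.001)}
    search = RandomizedSearchCV(MLPClassifier(max_iter=300), param, n_iter=5, cv=3)
    search.fit(x, y)
    print(search.best_params_)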
def alpha_validation_curve(data, base_estimator, n_jobs=-3):
    print("*** Data pre-processing ***")
    # Retrieve all usable data
    data_usable = BDDminimal(data)
    # Retrieve the features and labels
    x, y = featuresLabel(data_usable)
    print("[Split into training and test sets]")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y,
                                                        test_size=0.2, shuffle=True)
    # Scaling + encoding
    scaling(x_train, x_test)
    transformer = LabelEncoder()
    y_train_encode = transformer.fit_transform(y_train).ravel()
    print("[Preparing for cross-validation]")
    # Test/validation split
    cv = StratifiedKFold(n_splits=3, shuffle=True)

    # ----- Validation-curve setup -----
    # Choice of the parameter and its search range
    param_range = np.logspace(-11, 0, 55)
    # Compute the validation curve (param_name/param_range are keyword-only
    # in recent scikit-learn versions)
    train_scores, valid_scores = validation_curve(base_estimator, x_train, y_train_encode,
                                                  param_name="alpha", param_range=param_range,
                                                  cv=cv, n_jobs=n_jobs)
    # Plot the validation curve
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(valid_scores, axis=1)
    validation_scores_std = np.std(valid_scores, axis=1)
    fig, ax = plt.subplots(1, 1)
    ax.set_title("Validation curve on alpha for Adam")
    ax.set_xlabel("Alpha")
    ax.set_ylabel("Score")
    lw = 2
    ax.semilogx(param_range, train_scores_mean, label="Training score",
                color="darkorange", lw=lw)
    ax.fill_between(param_range, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.2,
                    color="darkorange", lw=lw)
    ax.semilogx(param_range, validation_scores_mean, label="Cross-validation score",
                color="navy", lw=lw)
    ax.fill_between(param_range, validation_scores_mean - validation_scores_std,
                    validation_scores_mean + validation_scores_std, alpha=0.2,
                    color="navy", lw=lw)
    plt.legend(loc="best")
    plt.show()
    return (train_scores, valid_scores)


def layers_validation_curve(data, base_estimator, n_jobs=-3):
    print("*** Data pre-processing ***")
    # Retrieve all usable data
    data_usable = BDDminimal(data)
    # Retrieve the features and labels
    x, y = featuresLabel(data_usable)
    print("[Split into training and test sets]")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y,
                                                        test_size=0.2, shuffle=True)
    # Scaling + encoding
    scaling(x_train, x_test)
    transformer = LabelEncoder()
    y_train_encode = transformer.fit_transform(y_train).ravel()
    print("[Preparing for cross-validation]")
    # Test/validation split
    cv = StratifiedKFold(n_splits=3, shuffle=True)

    # ----- Validation-curve setup -----
    # Choice of the parameter and its search range
    param_range = range(1, 10)
    hidden_layers_range = [tuple([60 for _ in range(i)]) for i in param_range]
    # param_range = range(10, 171, 10)
    # hidden_layers_range = [(i, i, i, i, i) for i in param_range]
    # Compute the validation curve
    train_scores, valid_scores = validation_curve(base_estimator, x_train, y_train_encode,
                                                  param_name="hidden_layer_sizes",
                                                  param_range=hidden_layers_range,
                                                  cv=cv, n_jobs=n_jobs)
    # Plot the validation curve
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(valid_scores, axis=1)
    validation_scores_std = np.std(valid_scores, axis=1)
    fig, ax = plt.subplots(1, 1)
    ax.set_title("Validation curve on number of layers for SGD")
    ax.set_xlabel("Number of layers")
    ax.set_ylabel("Score")
    lw = 2
    ax.plot(param_range, train_scores_mean, label="Training score",
            color="darkorange", lw=lw)
    ax.fill_between(param_range, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.2,
                    color="darkorange", lw=lw)
    ax.plot(param_range, validation_scores_mean, label="Cross-validation score",
            color="navy", lw=lw)
    ax.fill_between(param_range, validation_scores_mean - validation_scores_std,
                    validation_scores_mean + validation_scores_std, alpha=0.2,
                    color="navy", lw=lw)
    plt.legend(loc="best")
    plt.show()
    return (train_scores, valid_scores)
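
# A self-contained sketch of the same validation-curve pattern on synthetic
# data (an illustrative addition; make_classification stands in for the
# project's CSV pipeline, which is not reproduced here).
def _demo_validation_curve():
    from sklearn.datasets import make_classification
    x, y = make_classification(n_samples=300, n_features=6, n_informative=4)
    param_range = np.logspace(-6, 0, 7)
    train_scores, valid_scores = validation_curve(
        MLPClassifier(max_iter=300), x, y,
        param_name="alpha", param_range=param_range,
        cv=StratifiedKFold(n_splits=3, shuffle=True))
    # The best alpha is the one maximizing the mean cross-validation score.
    print(param_range[np.argmax(np.mean(valid_scores, axis=1))])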
def trace_VC():
    excel_table = readMultipleCsv('names')
    # === Starting models ===
    # base_estimator = MLPClassifier(solver="sgd", activation='relu', batch_size=250,
    #                                learning_rate='adaptive',
    #                                learning_rate_init=0.003803490162088419,
    #                                max_iter=1000, verbose=True,
    #                                hidden_layer_sizes=(50, 30, 57, 51, 56),
    #                                alpha=0.00579294106283857)
    # base_estimator = MLPClassifier(solver="adam", activation='relu', batch_size=440,
    #                                learning_rate_init=0.0036049822428060574,
    #                                max_iter=1000, verbose=True,
    #                                hidden_layer_sizes=(31, 56, 58, 41),
    #                                alpha=0.006600250942968936)
    # === Standardized models ===
    base_estimator = MLPClassifier(solver="sgd", activation='relu', batch_size=250,
                                   learning_rate='adaptive',
                                   learning_rate_init=0.003803490162088419,
                                   max_iter=1000, verbose=True,
                                   hidden_layer_sizes=(60, 60, 60, 60), alpha=1e-5)
    # base_estimator = MLPClassifier(solver="adam", activation='relu', batch_size=440,
    #                                learning_rate_init=0.0036049822428060574,
    #                                max_iter=1000, verbose=True,
    #                                hidden_layer_sizes=(60, 60, 60, 60), alpha=1e-5)
    # === Plot the learning curves and find the maximal mean scores ===
    # train_scores, valid_scores = alpha_validation_curve(excel_table, base_estimator)
    train_scores, valid_scores = layers_validation_curve(excel_table, base_estimator)
    valid_mean = np.mean(valid_scores, axis=1)
    # arg_max = np.argmax(valid_mean)
    # print(np.logspace(-11, 0, 55)[arg_max])
    # print(valid_mean[arg_max])
    arg_max = np.argmax(valid_mean)
    print(range(1, 10)[arg_max])
    print(valid_mean[arg_max])


# Trains an SVC and prints its accuracy
def training_SVC(x_train, y_train, x_test, y_test):
    model_SVC = SVC(kernel='linear', gamma='scale', shrinking=False)
    # Training
    model_SVC.fit(x_train, y_train)
    # Accuracy
    print(f'SVC accuracy: {model_SVC.score(x_test, y_test) * 100} %')


# Trains a KNeighborsClassifier and prints its accuracy
def training_kneighbors(x_train, y_train, x_test, y_test, k):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    print(f'KNeighborsClassifier accuracy with {k} neighbors: {model.score(x_test, y_test) * 100} %')


# Trains an SGDClassifier and prints its accuracy
def training_SGDClassifier(x_train, y_train, x_test, y_test):
    model = SGDClassifier(random_state=0)
    model.fit(x_train, y_train)
    print(f'SGDClassifier accuracy: {model.score(x_test, y_test) * 100} %')


def score_par_type_de_sol():
    """
    Plots the learning graphs per ground type (specify the model inside
    the function).
    """
    excel_table = readMultipleCsv('names')
    data_usable = BDDminimal(excel_table)
    x, y = featuresLabel(data_usable)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2,
                                                        shuffle=True, random_state=6)
    # Prepare the test set
    scalerX = StandardScaler()
    x_train1 = scalerX.fit_transform(x_train)
    x_test = scalerX.transform(x_test)
    data_test = pd.DataFrame(x_test, columns=["z", "VIA", "Po", "Pi", "Cr", "Pr"])
    y_test = pd.DataFrame.to_numpy(y_test)
    Dico_sol_test = {}
    data_test['sol'] = y_test
    for ground in groundType:
        Dico_sol_test[ground] = []
    i = 0
    for type_sol in data_test['sol']:
        Dico_sol_test[type_sol].append(data_test.iloc[i])
        i += 1
    # Prepare the training set
    y_train1 = y_train
    data = pd.DataFrame(x_train1, columns=["z", "VIA", "Po", "Pi", "Cr", "Pr"])
    y_train1 = pd.DataFrame.to_numpy(y_train1)
    data['sol'] = y_train1
    Dico_sol = {}
    for ground in groundType:
        Dico_sol[ground] = []
    i = 0
    for type_sol in data['sol']:
        Dico_sol[type_sol].append(data.iloc[i])
        i += 1
    clefs = list(Dico_sol.keys())
    # Cut the training set down to proportion p
    proportion = np.linspace(41, 100, 120)
    list_graph = []
    list_prop_graph = []
    for p in proportion:
        proportions_grap = []
        Dico_sol1 = {}
        print(p)
        for clef in clefs:
            p_sol = int(len(Dico_sol[clef]) * (p / 100))
            proportions_grap.append(p_sol)
            Dico_sol1[clef] = Dico_sol[clef][0:p_sol]
        x_train_def = []
        y_train1 = []
        for i in range(len(clefs)):
            for j in range(len(Dico_sol1[clefs[i]])):
                x_train_def.append(pd.DataFrame.to_numpy(Dico_sol1[clefs[i]][j]))
                y_train1.append(Dico_sol1[clefs[i]][j][-1])
        x_train1 = np.delete(x_train_def, 6, 1)
        # Model choice
        '''model = MLPClassifier(solver="sgd", activation='relu', batch_size=250,
                                 learning_rate='adaptive',
                                 learning_rate_init=0.003803490162088419,
                                 max_iter=1000, verbose=True,
                                 hidden_layer_sizes=(60, 60, 60, 60), alpha=1e-6)'''
        model = KNeighborsClassifier(n_neighbors=3)
        model.fit(x_train1, y_train1)
        # Score evaluation
        score_sol = []
        for i in range(len(clefs)):
            x_test1 = Dico_sol_test[clefs[i]]
            if len(x_test1) == 0:
                score_sol.append(0)
            else:
                x_test1 = np.delete(x_test1, 6, 1)
                y_pred = model.predict(x_test1)
                score = 0
                for j in range(0, len(y_pred)):
                    if y_pred[j] == clefs[i]:
                        score += 1
                score_sol.append(100 * score / len(y_pred))
        list_graph.append(score_sol)
        list_prop_graph.append(proportions_grap)
    # Plot the graphs
    list_graph = np.transpose(list_graph)
    list_prop_graph = np.transpose(list_prop_graph)
    for i in range(0, len(clefs)):
        plt.plot(list_prop_graph[i], list_graph[i])
        plt.title(clefs[i])
        plt.show()


# Example tests on the classifiers
if __name__ == '__main__':
    excel_table = readMultipleCsv('names')
    data_usable = BDDminimal(excel_table)
    x, y = featuresLabel(data_usable)
    model = MLPClassifier(solver="sgd", activation='relu', batch_size=250,
                          learning_rate='adaptive',
                          learning_rate_init=0.003803490162088419,
                          max_iter=1000, verbose=True,
                          hidden_layer_sizes=(60, 60, 60, 60), alpha=1e-6)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y,
                                                        test_size=0.2, shuffle=True)
    # transformer = LabelEncoder()
    # y_train_encode = transformer.fit_transform(y_train).ravel()
    # y_test_encode = transformer.transform(y_test).ravel()
    scalerX = StandardScaler()
    x_train = scalerX.fit_transform(x_train)
    x_test = scalerX.transform(x_test)
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))
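
    # Optional richer evaluation (an added sketch, not part of the original
    # test): per-class precision/recall via sklearn's classification_report.
    from sklearn.metrics import classification_report
    y_pred = model.predict(x_test)
    print(classification_report(y_test, y_pred))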