def create_customized_boosting(train_data, features, target, depth, estimators):
    '''
    Create and train a GradientBoostingClassifier using scikit-learn.

    Each hyper-parameter is handled on its own: a value equal to
    glod.get_empty_string() means "not specified" and the scikit-learn
    default is kept for it. Previously the defaults were only used when
    BOTH values were empty; a single empty value was forwarded to
    scikit-learn as an empty string.

    Parameters:
    :param pandas-dataframe train_data: Training data that includes the target feature
    :param list features: List with relevant features to train the model
    :param str target: Objective feature for the model
    :param int depth: Desired depth for the estimators that constitute the model
    :param int estimators: Number of estimators that compose the model
    :return: Trained GradientBoostingClassifier model
    :rtype: sklearn-model
    '''
    unspecified = glod.get_empty_string()
    X = train_data[features]
    y = train_data[target]

    # Only forward the hyper-parameters that the caller actually provided.
    model_kwargs = {}
    if estimators != unspecified:
        model_kwargs['n_estimators'] = estimators
    if depth != unspecified:
        model_kwargs['max_depth'] = depth

    return GradientBoostingClassifier(**model_kwargs).fit(X, y)
def create_customized_forest(train_data, features, target, depth, estimators):
    '''
    Create and train a RandomForestClassifier (entropy criterion) using scikit-learn.

    Each hyper-parameter is handled on its own: a value equal to
    glod.get_empty_string() means "not specified" and the scikit-learn
    default is kept for it. Previously the defaults were only used when
    BOTH values were empty; a single empty value was forwarded to
    scikit-learn as an empty string.

    Parameters:
    :param pandas-dataframe train_data: Training data that includes the target feature
    :param list features: List with relevant features to train the model
    :param str target: Objective feature for the model
    :param int depth: Desired depth for the trees that constitute the model
    :param int estimators: Number of estimators that compose the model
    :return: Trained RandomForestClassifier model
    :rtype: sklearn-model
    '''
    unspecified = glod.get_empty_string()
    X = train_data[features]
    y = train_data[target]

    # The split criterion is always fixed; the rest is optional.
    model_kwargs = {'criterion': "entropy"}
    if estimators != unspecified:
        model_kwargs['n_estimators'] = estimators
    if depth != unspecified:
        model_kwargs['max_depth'] = depth

    return RandomForestClassifier(**model_kwargs).fit(X, y)
def register_target_values_distribution(diccionario_targets_valores, mensaje,
                                        array_rutas_ficheros_log, report_dict, enco,
                                        opcion=glod.get_empty_string()):
    '''
    Log the distribution of target values and store it in report_dict.

    The header message is logged first; then, for every target (in sorted
    order), one line with its number of elements is logged and the same
    value is stored in report_dict at a place selected by ``opcion``.

    Parameters:
    :param dict diccionario_targets_valores: number of elements for each target value
    :param str mensaje: Header message to register in the log
    :param list array_rutas_ficheros_log: List with the paths to the log files
    :param dict report_dict: Dictionary with the current processed information about the event
    :param str enco: encoding forwarded to register_log
    :param opcion: glod.get_empty_string() stores the values under the general
        info / generic target entry, glod.get_train_option_key() delegates to
        update_train_division and glod.get_test_option_key() delegates to
        update_test_division; any other value only logs
    :return: updated report_dict
    :rtype: dict<python_hashable_type:python_type>
    '''
    register_log(array_rutas_ficheros_log, mensaje, glod.get_empty_string(), enco)

    for target_value in sorted(diccionario_targets_valores):
        target_text = str(target_value)
        amount_text = str(diccionario_targets_valores[target_value])

        log_line = "".join(["\ttarget: ", target_text, " Number of elements: ", amount_text, "\n"])
        register_log(array_rutas_ficheros_log, log_line, glod.get_empty_string(), enco)

        if opcion == glod.get_empty_string():
            general_info = report_dict[glod.get_report_general_info_key()]
            general_info[glod.get_report_generic_target_key()][target_text] = amount_text
        elif opcion == glod.get_train_option_key():
            report_dict = update_train_division(report_dict, target_text, amount_text)
        elif opcion == glod.get_test_option_key():
            report_dict = update_test_division(report_dict, target_text, amount_text)

    return report_dict
def create_customized_mlp(train_data, features, target, layer_sizes, act_function):
    '''
    Create and train an MLPClassifier using scikit-learn.

    Each hyper-parameter is handled on its own: a value equal to
    glod.get_empty_string() means "not specified" and the scikit-learn
    default is kept for it. Previously the defaults were only used when
    BOTH values were empty; a single empty value was forwarded to
    scikit-learn as an empty string.

    Parameters:
    :param pandas-dataframe train_data: Training data that includes the target feature
    :param list features: List with relevant features to train the model
    :param str target: Objective feature for the model
    :param tuple layer_sizes: Tuple with the number of perceptrons in each hidden layer
    :param str act_function: activation function for the hidden layers
    :return: Trained MLPClassifier model
    :rtype: sklearn-model
    '''
    solver_used = 'adam'
    unspecified = glod.get_empty_string()
    X = train_data[features]
    y = train_data[target]

    model_kwargs = {}
    if layer_sizes != unspecified:
        model_kwargs['hidden_layer_sizes'] = layer_sizes
    if act_function != unspecified:
        model_kwargs['activation'] = act_function

    if model_kwargs:
        # As before, the solver is only pinned for customized models.
        mlp = MLPClassifier(solver=solver_used, **model_kwargs)
    else:
        mlp = MLPClassifier()

    return mlp.fit(X, y)
def create_report_prediction(report_dict, event_target, ruta_relativa_datos_auxiliares, ruta_directorio_informes, enco):
    '''
    Render the prediction report of one event and write it as a pdf file.

    The template named by glod.get_prediction_template_name() is loaded from
    ruta_relativa_datos_auxiliares, rendered into a temporary
    'temp_html.html' file inside that same directory, converted with
    pisa.CreatePDF and finally the temporary file is removed.

    Parameters:
    :param dict report_dict: indexed by the event name and by glod.get_logo_key()
        (path of the logo image)
    :param event_target: sequence whose item 0 is the event name and whose
        item 1 is the target to predict
    :param str ruta_relativa_datos_auxiliares: directory with the templates; the
        temporary html file is created there
    :param str ruta_directorio_informes: directory where
        "Prediction_report_for_<event>.pdf" is written
    :param str enco: encoding used to write and read back the temporary html
    :return: None
    '''
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm which statements belong to the `if` below.
    env = Environment(loader=FileSystemLoader('.'))
    ruta_plantilla_temporal = os.path.join(ruta_relativa_datos_auxiliares, 'temp_html.html')
    template = env.get_template(ruta_relativa_datos_auxiliares + '/' + glod.get_prediction_template_name())
    event = event_target[0]
    target_to_predict = event_target[1]
    dic_info_event = report_dict[event]
    summary_target_to_predict = glod.get_empty_string()
    # The logo path is stripped of single quotes before being encoded.
    template_vars = {glod.get_title_key(): "Prediction report for " + event,
                     glod.get_logo_key():\
                     encode_image(report_dict[glod.get_logo_key()].replace('\'', glod.get_empty_string())),
                    }
    if target_to_predict in dic_info_event:
        # Keep only the text before the first "(" of the stored model string.
        model = str(dic_info_event[target_to_predict][glod.get_best_model_key()])
        model = model.split("(")
        model = model[0]
        summary_target_to_predict = "<p><strong>Target: <strong>" + ' ' +\
            target_to_predict + "</br></br>"
        summary_target_to_predict += "<p><strong>Model: <strong>" + ' ' + model + "</br>"
        summary_target_to_predict += "<p><strong>Accuracy: <strong>" + ' ' +\
            str(dic_info_event[target_to_predict][glod.get_accuracy_parameter_name()]) + "</br>"
        summary_target_to_predict += "<strong>Correct classifications: <strong>" + ' ' +\
            str(dic_info_event[target_to_predict]['Correct']) + "</br>"
        summary_target_to_predict += "<strong>Total number of observations: <strong>" + ' ' +\
            str(dic_info_event[target_to_predict]['Total']) + "</br>"
        summary_target_to_predict += "<strong>Total number of unknown observations classified: <strong>" + ' ' + str(dic_info_event[target_to_predict]['Predicted']) + "</br>"
        cm_target = encode_image(dic_info_event[target_to_predict]\
            ['target_to_predict_cm'].replace('\'', glod.get_empty_string()))
        template_vars['target_to_predict_cm'] = cm_target
    template_vars['target'] = summary_target_to_predict
    # Render the template into the temporary html file.
    with codecs.open(ruta_plantilla_temporal, glod.get_write_mode(), encoding=enco) as output_file:
        output_file.write(template.render(template_vars))
    # Read the html back and convert it to pdf.
    with codecs.open(ruta_plantilla_temporal, mode=glod.get_read_mode(), encoding=enco) as read_html:
        pdf_resultante = os.path.join(ruta_directorio_informes,\
            "Prediction_report_for_"+ event +".pdf")
        with open(pdf_resultante, mode=glod.get_writebyte_mode()) as pdf_gen:
            pisa.CreatePDF(read_html.read(), pdf_gen)
    # NOTE(review): a new handler is added to the "xhtml2pdf" logger on every call.
    logging.getLogger("xhtml2pdf").addHandler(PisaNullHandler())
    if os.path.exists(ruta_plantilla_temporal):
        os.remove(ruta_plantilla_temporal)
def compare_major_classes(diccionario_clases):
    '''
    Return the key of the class that holds the largest number of values.

    The previous implementation tracked the majority class but returned the
    loop variable, i.e. the last key of the dictionary.

    Parameters:
    diccionario_clases: dictionary with classes and the number of values in each one
    return: key (cluster number) of the majority class; when several classes
        share the largest count the first one in iteration order is returned.
        For an empty dictionary float(glod.get_nan_string()) is returned,
        as before.
    '''
    if not diccionario_clases:
        return float(glod.get_nan_string())
    # max() keeps the first maximal element, matching the original strict ">" test.
    return max(diccionario_clases, key=lambda clase: diccionario_clases[clase])
def create_customized_ada(train_data, features, target, estimators):
    '''
    Build and fit an AdaBoostClassifier with scikit-learn.

    Parameters:
    :param pandas-dataframe train_data: Training data that includes the target feature
    :param list features: List with relevant features to train the model
    :param str target: Objective feature for the model
    :param int estimators: Number of estimators that compose the model; when it
        equals glod.get_empty_string() the n_estimators argument is not passed
    :return: Trained AdaBoostClassifier model
    :rtype: sklearn-model
    '''
    X = train_data[features]
    y = train_data[target]

    # Settings shared by both variants of the constructor call.
    ada_settings = {
        'base_estimator': None,
        'learning_rate': 1.0,
        'algorithm': 'SAMME.R',
        'random_state': None,
    }
    if estimators != glod.get_empty_string():
        ada_settings['n_estimators'] = estimators

    return AdaBoostClassifier(**ada_settings).fit(X, y)
def create_trained_model(model_name, train_data, features_target, params_array, diccionario_modelos_no_supervisado):
    '''
    Create and train the unsupervised model that has the specified name.

    Parameters:
    :param str model_name: name of the model to be trained
    :param pandas-dataframe train_data: Data to train the model. It includes the target column
    :param features_target: sequence whose item 0 is the list of relevant
        features and whose item 1 is the target feature
    :param list params_array: Array with the specific parameters of the model
        to be trained (items 0 and 1 are forwarded to create_customized_kmeans)
    :param dict diccionario_modelos_no_supervisado: model names by number; entry 1 is KMeans
    :return: specified model trained, or glod.get_empty_string() when the
        name is not recognised
    :rtype: sklearn-model
    '''
    features = features_target[0]
    target = features_target[1]

    if model_name != diccionario_modelos_no_supervisado[1]:
        return glod.get_empty_string()

    # KMeans
    return create_customized_kmeans(train_data, features, target,
                                    params_array[0], params_array[1])
def create_customized_kmeans(train_data, features, target, num_clusters, numero_inicializaciones):
    '''
    Create and train a KMeans model (k-means++ initialisation) using sklearn.

    Each hyper-parameter is handled on its own: a value equal to
    glod.get_empty_string() means "not specified" and the scikit-learn
    default is kept for it. Previously the defaults were only used when
    BOTH values were empty; a single empty value was forwarded to
    scikit-learn as an empty string.

    Parameters:
    train_data: training data
    features: list of relevant features to train the model
    target: objective feature (its column is read and passed to fit as y)
    num_clusters: number of clusters
    numero_inicializaciones: number of initializations of the centroids
    return kmeans: KMeans trained model
    '''
    init_method = 'k-means++'
    unspecified = glod.get_empty_string()
    X = train_data[features]
    y = train_data[target]

    # The initialisation method is always fixed; the rest is optional.
    model_kwargs = {'init': init_method}
    if num_clusters != unspecified:
        model_kwargs['n_clusters'] = num_clusters
    if numero_inicializaciones != unspecified:
        model_kwargs['n_init'] = numero_inicializaciones

    return KMeans(**model_kwargs).fit(X, y)
def create_report_data_dict(evento, umbral, target, lista_variables_descartadas, ruta_logo):
    '''
    Build the initial report data dictionary for the current event.

    Parameters:
    :param evento: current event, stored under glod.get_event_key()
    :param umbral: threshold, stored as a string under glod.get_umbral_key()
    :param target: objective target, stored under glod.get_objective_target_key()
    :param list lista_variables_descartadas: variables deleted by the user
    :param ruta_logo: logo path, stored under glod.get_logo_key()
    :return: dictionary with empty placeholders for the generic target
        distribution, the train/test divisions, the variable lists and the warning
    :rtype: dict
    '''
    # Variables section: user deletions are known, the rest starts empty.
    variables_section = {}
    variables_section[glod.get_deleted_by_user_key()] = lista_variables_descartadas
    variables_section[glod.get_empty_or_constant_key()] = []
    variables_section[glod.get_score_relevant_key()] = []

    general_section = {}
    general_section[glod.get_report_generic_target_key()] = {}
    general_section[glod.get_variables_key()] = variables_section
    general_section[glod.get_training_division_key()] = {}
    general_section[glod.get_test_division_key()] = {}

    report_data = {}
    report_data[glod.get_objective_target_key()] = target
    report_data[glod.get_event_key()] = evento
    report_data[glod.get_logo_key()] = ruta_logo
    report_data[glod.get_report_general_info_key()] = general_section
    report_data[glod.get_umbral_key()] = str(umbral)
    report_data[glod.get_warning_key()] = glod.get_empty_string()
    return report_data
def initialize_model(model_name, params_array, diccionario_modelos_supervisado):
    '''
    Build (without training) the supervised model that has the specified name.

    Parameters:
    :param str model_name: name of the model to be created
    :param list params_array: Array with the specific parameters of the model
    :param dict diccionario_modelos_supervisado: model names by number
        (1 Tree, 2 Ada, 3 Boosting, 4 RandomForest, 5 MLP)
    :return: the configured, unfitted model, or glod.get_empty_string() when
        the name is not recognised
    :rtype: sklearn-model
    '''
    # NOTE(review): for Boosting params_array[0] is n_estimators here, while
    # get_trained_model ends up using params_array[0] as max_depth - confirm.
    if model_name == diccionario_modelos_supervisado[1]:  # Tree
        return DecisionTreeClassifier(max_depth=params_array[0], criterion="entropy")

    if model_name == diccionario_modelos_supervisado[2]:  # Ada
        return AdaBoostClassifier(base_estimator=None,
                                  n_estimators=params_array[0],
                                  learning_rate=1.0,
                                  algorithm='SAMME.R',
                                  random_state=None)

    if model_name == diccionario_modelos_supervisado[3]:  # Boosting
        return GradientBoostingClassifier(n_estimators=params_array[0],
                                          max_depth=params_array[1])

    if model_name == diccionario_modelos_supervisado[4]:  # RandomForest
        return RandomForestClassifier(n_estimators=params_array[1],
                                      criterion="entropy",
                                      max_depth=params_array[0])

    if model_name == diccionario_modelos_supervisado[5]:  # MLP
        return MLPClassifier(hidden_layer_sizes=params_array[0],
                             activation=params_array[1])

    return glod.get_empty_string()
def get_trained_model(model_name, train_data, features_target, params_array, diccionario_modelos_supervisado):
    '''
    Create and train the supervised model that has the specified name.

    Parameters:
    :param str model_name: name of the model to be trained
    :param pandas-dataframe train_data: Data to train the model. It includes the target column
    :param features_target: sequence whose item 0 is the list of relevant
        features and whose item 1 is the target feature
    :param list params_array: Array with the specific parameters of the model to be trained
    :param dict diccionario_modelos_supervisado: model names by number
        (1 Tree, 2 Ada, 3 Boosting, 4 RandomForest, 5 MLP)
    :return: specified model trained, or glod.get_empty_string() when the
        name is not recognised
    :rtype: sklearn-model
    '''
    features = features_target[0]
    target = features_target[1]

    if model_name == diccionario_modelos_supervisado[1]:  # Tree
        return create_customized_tree(train_data, features, target, params_array[0])

    if model_name == diccionario_modelos_supervisado[2]:  # Ada
        return create_customized_ada(train_data, features, target, params_array[0])

    if model_name == diccionario_modelos_supervisado[3]:  # Boosting
        # NOTE(review): create_customized_boosting takes (depth, estimators), so
        # params_array[0] is the depth here, whereas initialize_model passes
        # params_array[0] as n_estimators for Boosting - confirm the intended order.
        return create_customized_boosting(train_data, features, target,
                                          params_array[0], params_array[1])

    if model_name == diccionario_modelos_supervisado[4]:  # RandomForest
        return create_customized_forest(train_data, features, target,
                                        params_array[0], params_array[1])

    if model_name == diccionario_modelos_supervisado[5]:  # MLP
        return create_customized_mlp(train_data, features, target,
                                     params_array[0], params_array[1])

    return glod.get_empty_string()
def create_basic_report_data_dict(basic_parameters, lista_variables_descartadas, ruta_logo):
    '''
    Build the report data dictionary with the general execution settings.

    Parameters:
    :param basic_parameters: sequence with, in this order, the threshold, the
        objective target, the main metric, the feature selection method and
        the penalization flag
    :param list lista_variables_descartadas: variables deleted by the user
    :param ruta_logo: logo path, stored under glod.get_logo_key()
    :return: dictionary with the settings (all but the target converted to
        str) and an empty general execution info entry
    :rtype: dict
    '''
    (umbral, target, main_metric,
     feature_selection_method, penalize_falses) = (basic_parameters[position]
                                                   for position in range(5))

    report_data = {}
    report_data[glod.get_title_key()] = "Overview With Execution Information"
    report_data[glod.get_logo_key()] = ruta_logo
    report_data[glod.get_umbral_key()] = str(umbral)
    report_data[glod.get_main_metric_key()] = str(main_metric)
    report_data[glod.get_feature_selection_key()] = str(feature_selection_method)
    report_data[glod.get_penalization_name()] = str(penalize_falses)
    report_data[glod.get_objective_target_key()] = target
    report_data[glod.get_variables_key()] = {
        glod.get_deleted_by_user_key(): lista_variables_descartadas,
    }
    report_data[glod.get_general_info_execution_key()] = glod.get_empty_string()
    return report_data
def initialize_model(model_name, params_array, diccionario_modelos_no_supervisado):
    '''
    Build (without training) the unsupervised model that has the specified name.

    Parameters:
    :param str model_name: name of the model to be created
    :param list params_array: Array with the specific parameters of the model
        (item 0: number of clusters, item 1: number of initializations)
    :param dict diccionario_modelos_no_supervisado: model names by number; entry 1 is KMeans
    :return: the configured, unfitted model, or glod.get_empty_string() when
        the name is not recognised
    :rtype: sklearn-model
    '''
    if model_name != diccionario_modelos_no_supervisado[1]:
        return glod.get_empty_string()

    # KMeans
    return KMeans(n_clusters=params_array[0], n_init=params_array[1])
def register_log(array_rutas_ficheros, mensaje, opcion, enco):
    '''
    Write a message to every log file of the given list.

    Every file is handled inside a ``with`` block, so the handle is closed
    even when writing fails (the previous version leaked it in that case).

    Parameters:
    :param list array_rutas_ficheros: list with the paths to the log files
    :param str mensaje: Message to register in the log files
    :param str/int opcion: Mode for operating; 0 opens the files with
        glod.get_write_mode(), any other value with glod.get_append_mode()
    :param str enco: encoding; when it equals glod.get_none_encoding() the
        files are opened with the built-in open() and no explicit encoding,
        otherwise with codecs.open() and this encoding
    :return: None
    '''
    # The mode and the opener do not depend on the individual file.
    modo = glod.get_write_mode() if opcion == 0 else glod.get_append_mode()
    use_codecs = enco != glod.get_none_encoding()

    for ruta_fichero in array_rutas_ficheros:
        if use_codecs:
            log_file = codecs.open(ruta_fichero, modo, encoding=enco)
        else:
            log_file = open(ruta_fichero, modo)
        with log_file:
            log_file.write(mensaje)
def create_customized_tree(train_data, features, target, depth):
    '''
    Build and fit a DecisionTreeClassifier (entropy criterion) with scikit-learn.

    Parameters:
    :param pandas-dataframe train_data: Training data that includes the target feature
    :param list features: List with relevant features to train the model
    :param str target: Objective feature for the model
    :param int depth: Desired depth for the tree; when it equals
        glod.get_empty_string() max_depth is not passed, otherwise it is
        converted with int()
    :return: Trained DecisionTreeClassifier model
    :rtype: sklearn-model
    '''
    X = train_data[features]
    y = train_data[target]

    tree_settings = {'criterion': "entropy"}
    if depth != glod.get_empty_string():
        tree_settings['max_depth'] = int(depth)

    return DecisionTreeClassifier(**tree_settings).fit(X, y)
def get_dictionary_of_reasignation_of_labels(modelo, dataset, target):
    '''
    Return a dictionary that maps cluster labels of the model to original labels.

    Steps visible in the code:
    1. For every cluster, count how many of its rows belong to each original class.
    2. Build an ordering of the original classes by repeatedly asking each
       cluster for its preferred class among those not chosen yet
       (the comparison is delegated to check_majority).
    3. Walk the classes in that order and give each one the still unassigned
       cluster that contains most of its rows.

    Parameters
    modelo: fitted model exposing labels_ (KMeans)
    dataset: current data; position i of labels_ is assumed to describe
        dataset.iloc[i]
    target: objective target (column of dataset with the original labels)
    return asociacion_cluster_target: dictionary cluster label -> original label
    '''
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm the loop levels against the original file.
    diccionario_plantilla = {}
    clusters = set(list(modelo.labels_))
    relacion_cluster_clase = {}  # filled with empty lists, not read afterwards here
    clases_originales = list(set(dataset[target]))
    # Template with a zero counter for every original class.
    diccionario_clases_originales_contador = {}
    for clase_original in clases_originales:
        diccionario_clases_originales_contador[clase_original] = 0
    # Step 1: per-cluster histogram of original classes.
    for cluster in clusters:
        diccionario_plantilla[cluster] = diccionario_clases_originales_contador.copy()
        indices_cluster = list(np.where(modelo.labels_ == cluster)[0])
        diccionario_contador = {}
        relacion_cluster_clase[cluster] = []
        acum = 0  # running total of rows in the cluster, not read afterwards here
        for indice in indices_cluster:
            clase = dataset.iloc[int(indice)][target]
            if clase not in diccionario_contador:
                diccionario_contador[clase] = 1
                acum += 1
            else:
                valores = diccionario_contador[clase]
                valores += 1
                acum += 1
                diccionario_contador[clase] = valores
        for clase_original in diccionario_contador:
            diccionario_plantilla[cluster][clase_original] = diccionario_contador[clase_original]
    # Step 2: order the original classes.
    targets_originales_ordenados = []
    for indice in range(len(clases_originales)):
        for cluster in diccionario_plantilla:
            mayoritario = glod.get_empty_string()
            numero_mayoritario = 0
            for target_original in diccionario_plantilla[cluster]:
                if target_original not in targets_originales_ordenados:
                    if mayoritario == glod.get_empty_string():
                        mayoritario = target_original
                        numero_mayoritario = diccionario_plantilla[cluster][mayoritario]
                    else:
                        numero_candidato = diccionario_plantilla[cluster][target_original]
                        # presumably keeps whichever of the two has the larger count - verify check_majority
                        numero_mayoritario, mayoritario = check_majority(numero_candidato, numero_mayoritario, mayoritario, target_original)
            if mayoritario != glod.get_empty_string():
                targets_originales_ordenados.append(mayoritario)
    clases_originales = targets_originales_ordenados
    # Step 3: greedy assignment of one cluster per class.
    asociacion_cluster_target = {}
    for clase in clases_originales:
        elementos_cluster_clase = {}
        for cluster_actual in clusters:
            elementos_cluster_clase[cluster_actual] = diccionario_plantilla[cluster_actual][clase]
        mayoritaria = glod.get_empty_string()
        for cluster in elementos_cluster_clase:
            if mayoritaria == glod.get_empty_string():
                mayoritaria = cluster
            else: # compare the candidate with the current best
                actual = elementos_cluster_clase[mayoritaria]
                candidata = elementos_cluster_clase[cluster]
                if candidata > actual:
                    mayoritaria = cluster
        if mayoritaria != glod.get_empty_string():
            # An assigned cluster is no longer available for the next classes.
            clusters.remove(mayoritaria)
            asociacion_cluster_target[mayoritaria] = clase
    # Histograms re-keyed by the assigned original label; built but not returned.
    diccionario_plantilla_recodificado = {}
    for cluster in asociacion_cluster_target:
        reco = asociacion_cluster_target[cluster]
        diccionario_plantilla_recodificado[reco] = diccionario_plantilla[cluster]
    return asociacion_cluster_target
def create_report_current_execution(report_dict, basic_lists, diccionario_aprendizajes, ruta_relativa_datos_auxiliares, ruta_directorio_resultados):
    '''
    Create a pdf with the information about the process that is going to take place.

    The template named by glod.get_general_execution_template_name() is
    loaded from ruta_relativa_datos_auxiliares, rendered into a temporary
    'temp_html.html' file inside that same directory, converted with
    pisa.CreatePDF into "General_execution_report_<target>.pdf" and finally
    the temporary file is removed.

    Parameters:
    :param dict report_dict: general settings of the execution (title, logo,
        target, threshold, main metric, feature selection method,
        penalization and variables deleted by the user); the values are
        concatenated to strings
    :param basic_lists: sequence with, in this order, the events, the
        important variables per event, the discarded variables per event, the
        learning per event and the models per event
    :param diccionario_aprendizajes: entry 1 is used as heading when the
        learning of an event equals glod.get_all_learning_modes_name()
    :param str ruta_relativa_datos_auxiliares: directory with the templates
    :param str ruta_directorio_resultados: directory where the pdf is written
    :return: None
    '''
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm the loop and branch levels against the original file.
    env = Environment(loader=FileSystemLoader('.'))
    ruta_plantilla_temporal = os.path.join(ruta_relativa_datos_auxiliares, 'temp_html.html')
    template = env.get_template(ruta_relativa_datos_auxiliares + '/' +\
        glod.get_general_execution_template_name())
    template_vars = {glod.get_title_key(): report_dict[glod.get_title_key()],
                     glod.get_logo_key():encode_image(report_dict[glod.get_logo_key()].replace('\'', glod.get_empty_string())),
                     glod.get_general_info_execution_key():glod.get_empty_string()
                    }
    lista_eventos = basic_lists[0]
    lista_variables_usuario = basic_lists[1]
    lista_listas_variables_descartadas = basic_lists[2]
    lista_aprendizajes = basic_lists[3]
    lista_modelos = basic_lists[4]
    #General parameters (target,umbral,variables_descartadas)
    target = report_dict[glod.get_objective_target_key()]
    umbral = report_dict[glod.get_umbral_key()]
    main_metric = report_dict[glod.get_main_metric_key()]
    feature_selection_method = report_dict[glod.get_feature_selection_key()]
    penalize_falses = report_dict[glod.get_penalization_name()]
    lista_variables_descartadas = report_dict[glod.get_variables_key()]\
        [glod.get_deleted_by_user_key()]
    # Prefix repeated in front of the html lines to indent them.
    tabulacion = " "
    # Section 1: parameters common to every event.
    informacion = "<h3>Common Parameters </h3></p>"
    informacion += tabulacion+tabulacion + "<i>Objective Target: </i>" + target + "</br></br>"
    informacion += tabulacion+tabulacion + "<i>Percentil for Scoring Function: </i>" + umbral +\
        "</br></br>"
    informacion += tabulacion+tabulacion + "<i>Main metric: </i>" + main_metric + "</br></br>"
    informacion += tabulacion+tabulacion + "<i>Feature selection method: </i>" + \
        feature_selection_method + "</br></br>"
    informacion += tabulacion+tabulacion + "<i>Penalize falses: </i>" + penalize_falses +\
        "</br></br>"
    informacion += tabulacion+tabulacion + "<i>Common Discarded Variables:</i></br>"
    for variable_descartada in lista_variables_descartadas:
        informacion += tabulacion+tabulacion+tabulacion + variable_descartada + "</br>"
    if lista_variables_descartadas == []:
        informacion += tabulacion+"No variables were selected to be discarded</br>"
    informacion += "</p>"
    # Section 2: one entry per event; the lists of basic_lists are indexed in parallel.
    informacion += "<h3>Events to be processed: </h3><p>"
    for indice in range(len(lista_eventos)):
        informacion += tabulacion+"<strong>"+ lista_eventos[indice] + "</strong></br>"
        informacion += tabulacion+tabulacion+"<i>Important features for the user:</i> </br>"
        if lista_variables_usuario[indice]:
            for variable in lista_variables_usuario[indice]:
                informacion += tabulacion+tabulacion+tabulacion+variable + "</br>"
        else:
            informacion += tabulacion+tabulacion+tabulacion + \
                "No important features were specified</br>"
        informacion += "</br>"
        informacion += tabulacion+tabulacion+"<i>Discarded variables by the user:</i> </br>"
        if lista_listas_variables_descartadas[indice]:
            for variable in lista_listas_variables_descartadas[indice]:
                informacion += tabulacion+tabulacion+tabulacion+variable + "</br>"
        else:
            informacion += tabulacion+tabulacion+tabulacion+"No variables were discarded</br>"
        informacion += "</br>"
        informacion += tabulacion+tabulacion+"<i>Learnings to be applied: </i></br>"
        aprendizaje = lista_aprendizajes[indice]
        modelos = lista_modelos[indice]
        if aprendizaje == glod.get_all_learning_modes_name():#looping supervised models
            informacion += tabulacion+tabulacion+tabulacion+"<u>" +\
                str(diccionario_aprendizajes[1]) + "</u>:</br>"
            # Only item 0 of the models entry is listed in this branch.
            modelos_sup = modelos[0]
            for modelo_act in modelos_sup:
                informacion += tabulacion+tabulacion+tabulacion+tabulacion + modelo_act + "</br>"
            informacion += "</br>"
        else:
            informacion += tabulacion+tabulacion+tabulacion+"<u>"+aprendizaje + "</u>:</br>"
            for modelo_act in modelos:
                informacion += tabulacion+tabulacion+tabulacion+tabulacion + modelo_act + "</br>"
    informacion += "</p>"
    template_vars[glod.get_general_info_execution_key()] = informacion
    # Render the template into the temporary html file (no explicit encoding here).
    with codecs.open(ruta_plantilla_temporal, glod.get_write_mode()) as output_file:
        output_file.write(template.render(template_vars))
    # Read the html back and convert it to pdf.
    with codecs.open(ruta_plantilla_temporal, glod.get_read_mode()) as html_leido:
        pdf_resultante = os.path.join(ruta_directorio_resultados, "General_execution_report_"+ target +".pdf")
        with open(pdf_resultante, glod.get_writebyte_mode()) as gen_report:
            pisa.CreatePDF(html_leido.read(), gen_report)
    # NOTE(review): a new handler is added to the "xhtml2pdf" logger on every call.
    logging.getLogger("xhtml2pdf").addHandler(PisaNullHandler())
    if os.path.exists(ruta_plantilla_temporal):
        os.remove(ruta_plantilla_temporal)
def create_report_current_model(report_dict, lista_modelos, ruta_relativa_datos_auxiliares, ruta_directorio_informes, enco):
    '''
    Write the pdf report(s) of the current event.

    When lista_modelos is empty a single "report_<event>_incomplete.pdf" is
    produced from the incomplete-event template, showing the warning stored
    in report_dict. Otherwise one "<model>_report_for_<event>.pdf" is
    produced for every model of lista_modelos that is a key of report_dict,
    with the target distribution, the summary of variables, the model
    parameters, the elapsed times, the train/test distributions, the
    metrics and the available images.

    Every report is rendered into a temporary 'temp_html.html' file inside
    ruta_relativa_datos_auxiliares and converted with pisa.CreatePDF; the
    temporary file is removed at the end.

    Parameters:
    :param dict report_dict: information collected for the current event
    :param list lista_modelos: names of the models to report; empty when the
        process was not completed
    :param str ruta_relativa_datos_auxiliares: directory with the templates
    :param str ruta_directorio_informes: directory where the pdf files are written
    :param str enco: encoding used to write and read back the temporary html
    :return: None
    '''
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm the loop and branch levels against the original file.
    env = Environment(loader=FileSystemLoader('.'))
    ruta_plantilla_temporal = os.path.join(ruta_relativa_datos_auxiliares, 'temp_html.html')
    if lista_modelos == []: #if process not completed
        template = env.get_template(ruta_relativa_datos_auxiliares + '/' +\
            glod.get_incomplete_event_report_template_name())
        template_vars = {glod.get_title_key(): "Incomplete Execution Report",
                         glod.get_logo_key(): \
                         encode_image(report_dict[glod.get_logo_key()].replace('\'', glod.get_empty_string())),
                         glod.get_report_generic_target_key(): report_dict[glod.get_objective_target_key()],
                         glod.get_event_key(): report_dict[glod.get_event_key()],
                         glod.get_info_key(): " " +\
                         report_dict[glod.get_warning_key()]
                        }
        with codecs.open(ruta_plantilla_temporal, glod.get_write_mode(), encoding=enco) as output_file:
            output_file.write(template.render(template_vars))
        with codecs.open(ruta_plantilla_temporal, glod.get_read_mode(), encoding=enco) as html_leido:
            pdf_resultante = os.path.join(ruta_directorio_informes, "report_" +\
                report_dict[glod.get_event_key()]+"_incomplete.pdf")
            with open(pdf_resultante, glod.get_writebyte_mode()) as incomplete_rep:
                pisa.CreatePDF(html_leido.read(), incomplete_rep)
        logging.getLogger("xhtml2pdf").addHandler(PisaNullHandler())
    else:
        # Used below to build the ranking text of every model.
        lista_pares_modelo_indice = auxf.order_models_by_score_and_time(report_dict, lista_modelos)
        template = env.get_template(ruta_relativa_datos_auxiliares + '/' + glod.get_report_template_name())
        for modelo in lista_modelos:
            if modelo in report_dict:
                # --- Distribution of the observations by target ---
                observations_targets = "<p><strong>Target distribution of observations\
                </strong></br>"
                final_targets_list = list(report_dict[glod.get_report_general_info_key()]\
                    [glod.get_report_generic_target_key()].keys())
                for ob_target in auxf.natsorted(final_targets_list):
                    observations_targets += " "+ "With target " +\
                        str(ob_target) + " :"+ str(report_dict[glod.get_report_general_info_key()]\
                        [glod.get_report_generic_target_key()][ob_target]) + "</br>"
                observations_targets += "</p>"
                # --- Summary of variables ---
                variables_summary = "<p><strong>Summary of variables</strong></br>"
                discarded_for_event = report_dict[glod.get_report_general_info_key()]\
                    [glod.get_variables_key()][glod.get_user_discarded_key()]
                variables_summary += "<br><i><u>Deleted by the user at the begining:</i></u></br>"
                for deleted_var in report_dict[glod.get_report_general_info_key()]\
                    [glod.get_variables_key()][glod.get_deleted_by_user_key()]:
                    variable_dis = glod.get_empty_string()
                    # Variables discarded specifically for this event are shown in bold.
                    if deleted_var in discarded_for_event:
                        variable_dis = "<strong>" + deleted_var + "</strong>"
                    else:
                        variable_dis = deleted_var
                    variables_summary += " "+ variable_dis + "</br>"
                variables_summary += " <i>*variables in bold were\
                specified by the user to be discarded specifically for this event<i></br>"
                variables_summary += "</br>"
                variables_summary += "<br><i><u>Deleted in execution time(Empty or Constant)\
                :</i></u></br>"
                for emp_con_var in report_dict[glod.get_report_general_info_key()]\
                    [glod.get_variables_key()][glod.get_empty_or_constant_key()]:
                    variables_summary += " "+ emp_con_var + "</br>"
                variables_summary += "</br>"
                variables_summary += "<br><i><u>Requested for the event by the user:</i></u></br>"
                for req_var in report_dict[glod.get_report_general_info_key()]\
                    [glod.get_variables_key()][glod.get_user_requested_key()]:
                    variables_summary += " "+ req_var + "</br>"
                variables_summary += "</br>"
                variables_summary += "<br><i><u>Used during the process:</i></u></br>"
                # Mapping variable -> score; listed from highest to lowest score, in bold.
                diccionario_relevantes_mif = report_dict[glod.get_report_general_info_key()]\
                    [glod.get_variables_key()][glod.get_score_relevant_key()]
                sorted_relevant_vars = sorted(diccionario_relevantes_mif.items(), key=operator.itemgetter(1), reverse=True)
                for relevant_var in sorted_relevant_vars:
                    rel_variable = relevant_var[0]
                    rel_variable = "<strong>" + rel_variable +' \
                    '+\
                        str(diccionario_relevantes_mif[rel_variable]) +"</strong>"
                    variables_summary += " "+ rel_variable + "</br>"
                # Remaining variables used in the process that have no score.
                for relevant_var in report_dict[glod.get_report_general_info_key()][glod.get_variables_key()][glod.get_used_in_process()]:
                    if relevant_var not in diccionario_relevantes_mif:
                        variables_summary += " "+ relevant_var + "</br>"
                variables_summary += " <i>*variables in bold were used\
                to train the models<i></br>"
                variables_summary += "</p>"
                #Information about the model
                accuracy = "</br></br> \
                <strong>Accuracy: "+\
                    str(float(round(report_dict[modelo][glod.get_accuracy_parameter_name()], 5)))+\
                    "</strong>"
                ranking = get_string_with_ranking_of_models(lista_pares_modelo_indice, modelo)
                model_info = "<p><strong>Parameters used to configure the model</strong></br>"
                for param in report_dict[modelo][glod.get_parameters_key()]:
                    model_info += " <i>"+ param + "</i>: " +\
                        str(report_dict[modelo][glod.get_parameters_key()][param]) + "</br>"
                model_info += "</p>"
                # --- Elapsed times (finish - init) ---
                time_info = "<p><strong>Time elapsed</strong></br>"
                tiempo_seleccion_parametros = report_dict[modelo][glod.get_time_parameters_key()][glod.get_time_sel_finish_key()] - report_dict[modelo][glod.get_time_parameters_key()][glod.get_time_sel_init_key()]
                tiempo_entrenamiento = report_dict[modelo][glod.get_time_parameters_key()][glod.get_time_train_finish_key()] - report_dict[modelo][glod.get_time_parameters_key()][glod.get_time_train_init_key()]
                time_info += " "+ "Parameters selection time: "+\
                    str(tiempo_seleccion_parametros) + "</br>"
                time_info += " "+ "Training time: "+\
                    str(tiempo_entrenamiento) + "</br>"
                time_info += "</p>"
                # --- Training data distribution ---
                total_train = 0.0
                vector_of_targets = []
                vector_of_values_by_target = []
                vector_of_percentages_by_target = []
                train_distribution_info = "<p></br><strong>Training Data Distribution\
                </strong></br>"
                for train_target in auxf.natsorted(list(report_dict[glod.get_report_general_info_key()][glod.get_training_division_key()].keys())):
                    train_distribution_info += " "+ "With target " + str(train_target) + " :"+ str(report_dict[glod.get_report_general_info_key()][glod.get_training_division_key()][train_target]) + "</br>"
                    vector_of_targets.append(train_target)
                    vector_of_values_by_target.append(float(report_dict[glod.get_report_general_info_key()][glod.get_training_division_key()][train_target]))
                    total_train += float(report_dict[glod.get_report_general_info_key()][glod.get_training_division_key()][train_target])
                train_distribution_info += "</p>"
                #getting null train accuracy
                # Null accuracy = share of the most frequent target (rounded to 4 decimals).
                null_train_accuracy = 0.0
                for indice_t in range(len(vector_of_values_by_target)):
                    vector_of_percentages_by_target.append(round(vector_of_values_by_target[indice_t]/total_train, 4))
                # NOTE(review): max() raises ValueError if the division is empty - confirm it cannot be.
                null_train_accuracy = max(vector_of_percentages_by_target)
                # --- Test data distribution (same computation on the test division) ---
                total_test = 0.0
                vector_of_targets = []
                vector_of_values_by_target = []
                vector_of_percentages_by_target = []
                test_distribution_info = "<p><strong>Test Data Distribution</strong></br>"
                for test_target in auxf.natsorted(list(report_dict[glod.get_report_general_info_key()][glod.get_test_division_key()].keys())):
                    test_distribution_info += " "+ "With target " + str(test_target) + " :"+ str(report_dict[glod.get_report_general_info_key()][glod.get_test_division_key()][test_target]) + "</br>"
                    vector_of_targets.append(test_target)
                    vector_of_values_by_target.append(float(report_dict[glod.get_report_general_info_key()][glod.get_test_division_key()][test_target]))
                    total_test += float(report_dict[glod.get_report_general_info_key()][glod.get_test_division_key()][test_target])
                test_distribution_info += "</p>"
                null_test_accuracy = 0.0
                for indice_t in range(len(vector_of_values_by_target)):
                    vector_of_percentages_by_target.append(round(vector_of_values_by_target[indice_t]/total_test, 4))
                null_test_accuracy = max(vector_of_percentages_by_target)
                event = report_dict[glod.get_event_key()]
                # --- Variables handed to the template ---
                template_vars = {glod.get_title_key(): "Execution Report",
                                 glod.get_logo_key():encode_image(report_dict[glod.get_logo_key()].replace('\'', glod.get_empty_string())),
                                 glod.get_model_key(): modelo,
                                 glod.get_report_generic_target_key():\
                                 report_dict[glod.get_objective_target_key()],
                                 glod.get_event_key(): event,
                                 glod.get_accuracy_parameter_name():\
                                 str(accuracy)+"<br> \
                                 <strong>Null train acc: "+\
                                 str(null_train_accuracy)+"</strong>"+"<br> \
                                 \
                                 <strong>Null test acc: "+ str(null_test_accuracy)+\
                                 "</strong></p>",
                                 glod.get_models_ranking_key(): ranking,
                                 glod.get_observations_targets_key(): observations_targets,
                                 glod.get_variables_summary_key(): variables_summary,
                                 glod.get_models_info_key(): model_info,
                                 glod.get_time_info_key(): time_info,
                                 glod.get_train_distribution_info_key(): train_distribution_info,
                                 glod.get_test_distribution_info_key(): test_distribution_info
                                }
                # Micro-average metrics; the stored values are concatenated as strings.
                template_vars[glod.get_metrics_info_key()] = glod.get_empty_string()
                for metric in report_dict[modelo][glod.get_metrics_micro_avg_key()]:
                    template_vars[glod.get_metrics_info_key()] += "<p>"+"<strong>"+metric+"</strong>: " + report_dict[modelo][glod.get_metrics_micro_avg_key()][metric] +"</br>"
                    # NOTE(review): closing tag assumed to be emitted per metric - confirm.
                    template_vars[glod.get_metrics_info_key()] += "</p>"
                # Optional images, added only when their path is stored for the model.
                if glod.get_model_parameters_plot_name() in report_dict[modelo]:
                    template_vars[glod.get_image_parameters_accuracy_key()] = encode_image(report_dict[modelo][glod.get_model_parameters_plot_name()].replace('\'', glod.get_empty_string()))
                if glod.get_confussion_matrix_train_path_key() in report_dict[modelo]:
                    template_vars[glod.get_conf_train_img_key()] = encode_image(report_dict[modelo][glod.get_confussion_matrix_train_path_key()].replace('\'', glod.get_empty_string()))
                if glod.get_confussion_matrix_test_path_key() in report_dict[modelo]:
                    template_vars[glod.get_conf_test_img_key()] = encode_image(report_dict[modelo][glod.get_confussion_matrix_test_path_key()].replace('\'', glod.get_empty_string()))
                if glod.get_learning_curve_key() in report_dict[modelo]:
                    template_vars[glod.get_learning_curve_key()] = encode_image(report_dict[modelo][glod.get_learning_curve_key()].replace('\'', glod.get_empty_string()))
                # --- Table with the metrics of every label ---
                metrics_by_label = "<table width='100%' border='1' cellspacing='0' cellpadding='5'>"
                keys = glod.get_empty_string()
                for elemento in auxf.natsorted(list(report_dict[modelo][glod.get_metrics_key()].keys())):
                    # The header row is built once, from the metric names of the first label.
                    if keys == glod.get_empty_string():
                        keys = report_dict[modelo][glod.get_metrics_key()][elemento].keys()
                        metrics_by_label += "<tr><td align='center' class='black'>"+ glod.get_report_generic_target_key() +"</td>"
                        for cabecera in keys:
                            metrics_by_label += "<td align='center' class='black'>" + cabecera +"</td>"
                        metrics_by_label += "</tr>"
                    metrics_by_label += "<tr><td>" + elemento.replace('target_', glod.get_empty_string()) + "</td>"
                    for key in keys:
                        metrics_by_label += "<td>"+str(report_dict[modelo][glod.get_metrics_key()][elemento][key])+"</td>"
                    metrics_by_label += "</tr>"
                metrics_by_label += "</table>"
                template_vars[glod.get_metrics_by_label_key()] = metrics_by_label
                # generate the html
                with codecs.open(ruta_plantilla_temporal, glod.get_write_mode(), encoding=enco) as output_file:
                    output_file.write(template.render(template_vars))
                # generate the pdf
                with codecs.open(ruta_plantilla_temporal, mode=glod.get_read_mode(), encoding=enco) as read_html:
                    pdf_resultante = os.path.join(ruta_directorio_informes, modelo + "_report_for_"+ event +".pdf")
                    with open(pdf_resultante, mode=glod.get_writebyte_mode()) as pdf_gen:
                        pisa.CreatePDF(read_html.read(), pdf_gen)
                # NOTE(review): a new handler is added to the "xhtml2pdf" logger for every report.
                logging.getLogger("xhtml2pdf").addHandler(PisaNullHandler())
    if os.path.exists(ruta_plantilla_temporal):
        os.remove(ruta_plantilla_temporal)
def create_report_current_dictionary_models(dictionary_of_models, basic_paths, list_of_parameters_models_events_dict, logo_path, enco):
    '''
    This function allows to get the pdf file with the current status of the
    dictionary of models: for every event and target it lists the stored
    parameters, the relevant features and the name of the best model

    The html is rendered with jinja2 into a temporary file which is then
    converted to pdf with pisa; the temporary file is removed afterwards, also
    when the generation fails

    Parameters:
    :param dict dictionary_of_models: Nested dictionary indexed as [event][target][parameter]
    :param list basic_paths: basic_paths[0] is the directory that contains the template
        (the temporary html is written there too), basic_paths[1] is the directory
        where the resulting pdf is stored
    :param list list_of_parameters_models_events_dict: Names of the parameters stored for
        each target; positions 0 to 3 are used as keys of dictionary_of_models[event][target]
    :param str logo_path: Path to the logo; single quotes are stripped before encoding it
    :param str enco: Encoding used to write and read the temporary html

    :return: None
    '''
    ruta_relativa_datos_auxiliares = basic_paths[0]
    ruta_directorio_resultados = basic_paths[1]

    env = Environment(loader=FileSystemLoader('.'))
    ruta_plantilla_temporal = os.path.join(ruta_relativa_datos_auxiliares, 'temp_html.html')
    # jinja2 template names always use '/' as separator, hence no os.path.join here
    template = env.get_template(ruta_relativa_datos_auxiliares + '/' +
                                glod.get_dictionary_models_template_name())

    tabulacion = " "
    sangria = tabulacion + tabulacion
    sangria_elementos = sangria + sangria

    template_vars = {
        glod.get_title_key(): "Report of the information of the Dictionary of models",
        glod.get_logo_key(): encode_image(logo_path.replace('\'', glod.get_empty_string())),
    }

    # Parameter that triggers the "best model" line
    # (presumably the kind of learning -- TODO confirm)
    clave_principal = list_of_parameters_models_events_dict[0]
    # Parameter whose value is split to obtain the model name
    clave_modelo = list_of_parameters_models_events_dict[1]
    # Parameter printed as "dic_reassingment" for unsupervised entries
    clave_reasignacion = list_of_parameters_models_events_dict[2]
    list_elements = [clave_principal,
                     list_of_parameters_models_events_dict[3],
                     clave_modelo]

    fragmentos = []
    for event in dictionary_of_models:
        fragmentos.append("<strong><u>" + event + "</u></strong></br></br>")
        for target in dictionary_of_models[event]:
            info_target = dictionary_of_models[event][target]
            fragmentos.append(sangria + "<strong><i>Target:</i></strong>" + " " + target + "</br>")
            for key in list_elements:
                valor = info_target[key]
                fragmentos.append(sangria + "<strong><i>" + key + ": </i></strong>")
                if isinstance(valor, list):
                    # lists are shown one element per line, alphabetically sorted
                    fragmentos.append("<br>")
                    for element in sorted(valor):
                        fragmentos.append(sangria_elementos + element + "</br>")
                else:
                    fragmentos.append(valor + "</br>")
                # NOTE(review): nesting of the two checks below was reconstructed
                # from a whitespace-collapsed source -- confirm
                if key == clave_principal:
                    # get model name: last '_' separated chunk without extension
                    nombre_modelo = info_target[clave_modelo].split('_')[-1].split('.')[0]
                    fragmentos.append(sangria + "<strong><i>best model: </i></strong> " +
                                      nombre_modelo + "</br>")
                    if valor == glod.get_unsupervised_name():
                        fragmentos.append(sangria + "<strong><i>dic_reassingment: </i></strong> " +
                                          str(info_target[clave_reasignacion]) + "</br>")
            # NOTE(review): separator assumed to be emitted once per target -- confirm
            fragmentos.append("</br>")

    informacion = "".join(fragmentos)
    if informacion == glod.get_empty_string():
        informacion = "No models were created yet"
    template_vars[glod.get_info_key()] = informacion

    pdf_resultante = os.path.join(ruta_directorio_resultados,
                                  "Current_status_dictionary_events_and_models.pdf")
    try:
        #html
        with codecs.open(ruta_plantilla_temporal, glod.get_write_mode(), encoding=enco) as output_file:
            renderizado = template.render(template_vars)
            output_file.write(renderizado)

        #pdf
        with codecs.open(ruta_plantilla_temporal, mode=glod.get_read_mode(), encoding=enco) as read_html:
            # the encode/decode round trip drops the characters that enco cannot represent
            contenido_html = read_html.read().encode(enco, 'ignore').decode(enco)
        with open(pdf_resultante, mode=glod.get_writebyte_mode()) as pdf_gen:
            pisa.CreatePDF(contenido_html, pdf_gen)
    finally:
        # the temporary html must not survive, even if the generation failed
        if os.path.exists(ruta_plantilla_temporal):
            os.remove(ruta_plantilla_temporal)
def generate_model_report(porc_acierto_test, porc_acierto_validacion, ruta_directorio_informes_accuracy, nombre_informe):
    '''
    This function generates a report for the prediction phase with the
    information about the performance of the best model

    Both values are packed, in this order, into a numpy array that is handed
    to save_data_to_file together with an empty string and the 'txt' literal

    Parameters:
    :param porc_acierto_test: First value of the report (presumably the accuracy on test -- TODO confirm)
    :param porc_acierto_validacion: Second value of the report (presumably the accuracy on validation -- TODO confirm)
    :param ruta_directorio_informes_accuracy: Forwarded to save_data_to_file as its second argument
    :param nombre_informe: Forwarded to save_data_to_file as its third argument

    :return: None
    '''
    valores_informe = [porc_acierto_test, porc_acierto_validacion]
    save_data_to_file(
        np.array(valores_informe),
        ruta_directorio_informes_accuracy,
        nombre_informe,
        glod.get_empty_string(),
        'txt',
    )