Пример #1
0
def store_results(y_true, yp, net, model_path, model):
    """Persist evaluation results for a trained model.

    Writes accuracy, edit distance, f1-score and a classification report
    to ``<model_path>/results.txt``, saves a normalized confusion-matrix
    plot to ``<model_path>/confusion_matrix.png``, and stores the model
    itself as ``<model_path>/model.h5`` (all relative to the CWD).
    """
    phone_labels = fixed_phones(net)
    base_dir = os.getcwd()
    results_file = os.path.join(base_dir, model_path + os.sep + 'results.txt')
    with open(results_file, 'w') as out:
        out.write('acc: {}\n'.format(str(get_accuracy(y_true, yp))))
        edit_distance = eval_edit_dist(y_true,
                                       yp,
                                       net.test,
                                       feature_name=net.feature_name)
        out.write('edit: {}\n'.format(str(edit_distance)))
        out.write('f1-score: {}\n'.format(str(get_f1_score(y_true, yp))))
        out.write(str(get_classification_report(y_true, yp, phone_labels)))
    matrix = get_confusion_matrix(y_true, yp)
    plot_file = os.path.join(base_dir,
                             model_path + os.sep + 'confusion_matrix.png')
    net.plot_confusion_matrix(matrix, phone_labels, plot_file, normalize=True)
    model.save(os.path.join(base_dir, model_path + os.sep + 'model.h5'))
Пример #2
0
        test_labels_3 = test_labels[fu_3_idx[0]]

        for n_comp in n_components:
            train_data, test_data =\
                train_HMM.train_HMM(n_comp, train_obs, train_len, test_obs,
                                    test_len)

            for svm_f in SVM_folds:
                print("Iteration: %d, Components: %d, Fold: %d, SVM Fold: %d" %
                      (iteration, n_comp, f, svm_f))
                predicted = svm_cross_validation(train_data, train_labels,
                                                 test_data, svm_f)
                predicted = np.array(predicted)
                predicted_3 = predicted[fu_3_idx[0]]
                """ Get Metrics """
                cm = metrics.get_confusion_matrix(test_labels, predicted)
                CM[svm_f][n_comp] = np.add(cm, CM[svm_f][n_comp])

                sens = metrics.get_sensitivity(test_labels, predicted)
                sensitivity[svm_f][n_comp] += sens

                spec = metrics.get_specificity(test_labels, predicted)
                specificity[svm_f][n_comp] += spec
                """ Get Metrics 3 FU """
                cm_3 = metrics.get_confusion_matrix(test_labels_3, predicted_3)
                CM_3[svm_f][n_comp] = np.add(cm_3, CM_3[svm_f][n_comp])

                sens_3 = metrics.get_sensitivity(test_labels_3, predicted_3)
                sensitivity_3[svm_f][n_comp] += sens_3

                spec_3 = metrics.get_specificity(test_labels_3, predicted_3)
Пример #3
0
 def get_confusion_matrix(self):
     """Return the confusion matrix of stored predictions vs. true labels."""
     true_labels = self.test_labels
     predicted_labels = self.pred_labels
     return metrics.get_confusion_matrix(true_labels, predicted_labels)
Пример #4
0
def prediction_function(BASE_PATH):
    ''' Step 0: Reading configuration parameters and creating log files'''
    '''Creating variable that parses the configuration file. If the file is
    not found, an exception is thrown and finishes the execution'''

    path_to_configuration_file = os.path.join(BASE_PATH,
                                              glod.get_config_parser_name())
    config_parser = conp.ConfigParser()
    config_parser.optionxform = str
    enco = glod.get_encoding()

    if (os.path.exists(path_to_configuration_file)):
        config_parser_file = open(path_to_configuration_file, encoding=enco)
        config_parser.readfp(config_parser_file)
    else:
        raise Exception('Configuration file (conf.ini) was not found')

    logs_section = glod.get_log_section_name()
    auxiliary_data_section = glod.get_auxiliary_section_name()
    input_data_section = glod.get_input_section_name()
    prediction_section = glod.get_prediction_section_name()
    '''Creating log files'''
    log_path = os.path.join(
        BASE_PATH,
        config_parser.get(logs_section, glod.get_log_directory_name()))
    execution_log_path = os.path.join(
        log_path,
        config_parser.get(logs_section,
                          glod.get_prediction_log_execution_name()) + '.' +
        glod.get_log_files_extension())
    time_log_path = os.path.join(
        log_path,
        config_parser.get(logs_section,
                          glod.get_prediction_log_time_execution_name()) +
        '.' + glod.get_log_files_extension())
    ruta_modelos_prediccion = config_parser.get(
        prediction_section, glod.get_path_to_prediction_models_name())
    auxf.create_directory(log_path)

    step_init_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log([execution_log_path],
                      '>>>>>>Prediction Phase <<<<<<<<   \n' +
                      step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", 0,
                      enco)
    repg.register_log([execution_log_path],
                      '>>>> Step 0 - Reading parameters from conf.ini \n' +
                      step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '',
                      enco)
    repg.register_log([time_log_path], '>>>>Step 0 starts:\n', 0, enco)
    '''Reading from conf.ini necessary variables for the prediction phase'''
    extension = glod.get_input_files_extension()
    name = config_parser.get(input_data_section,
                             glod.get_event_name_feature_name())

    observation_number = config_parser.get(input_data_section,
                                           glod.get_obsnumber_parameter_name())
    input_files_delimiter_not_catalogued_data = config_parser.get(
        prediction_section, glod.get_delimiter_non_catalogued_data_name())
    input_files_delimiter_not_catalogued_data = input_files_delimiter_not_catalogued_data.replace(
        '\'', '')
    label_non_catalogued_data = int(
        config_parser.get(input_data_section,
                          glod.get_non_catalogued_label_name()))

    maximum_number_of_files_to_catalogue = int(
        config_parser.get(prediction_section,
                          glod.get_number_of_files_parameter_name()))
    path_to_directory_with_input_files_to_catalogue = os.path.join(
        BASE_PATH,
        config_parser.get(prediction_section,
                          glod.get_name_directory_to_input_files_catalogue()))

    step_finish_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log([execution_log_path], '>>>> Step 0 ends \n' +
                      step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '',
                      enco)
    repg.register_log(
        [time_log_path],
        '>>>>Step 0 - Reading parameters from conf.ini total elapsed time :' +
        str(step_finish_time - step_init_time) + '\n', '', enco)
    ''' Step 1: Reading observations from files and concatenating them '''
    step_init_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log(
        [execution_log_path],
        '>>>>Step 1 Loading observations from files into dataframes \n' +
        step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '', enco)
    repg.register_log([time_log_path], '>>>>Step 1 starts:\n', '', enco)
    vector_fullpaths_to_input_files_with_observations_to_catalogue = auxf.get_all_files_in_dir_with_extension(
        path_to_directory_with_input_files_to_catalogue,
        maximum_number_of_files_to_catalogue, extension)
    ''' Path to file with relevant variables '''
    auxiliary_directory_filename = config_parser.get(
        auxiliary_data_section, glod.get_auxiliary_directory_parameter_name())
    path_to_directory_auxiliary_files = os.path.join(
        BASE_PATH,
        config_parser.get(auxiliary_data_section,
                          glod.get_auxiliary_directory_parameter_name()))

    report_dict = {
        glod.get_logo_key():
        "'" +
        os.path.join(path_to_directory_auxiliary_files, glod.get_logo_name()) +
        "'"
    }
    ''' Substep 1.1 - Reading input files '''
    substep_init_time = datetime.datetime.fromtimestamp(time.time())
    list_registers_to_catalogue = []
    repg.register_log([execution_log_path], '>>>>Step 1.1 \n', '', enco)
    for i in range(
            len(vector_fullpaths_to_input_files_with_observations_to_catalogue)
    ):
        repg.register_log(
            [execution_log_path],
            '>>Reading Csv to predict number ' + str(i) + ': ' +
            vector_fullpaths_to_input_files_with_observations_to_catalogue[i] +
            '\n', '', enco)
        print(
            "To catalogue : ",
            vector_fullpaths_to_input_files_with_observations_to_catalogue[i])
        print("\n")
        original_data = pd.read_csv(
            vector_fullpaths_to_input_files_with_observations_to_catalogue[i],
            sep=input_files_delimiter_not_catalogued_data)
        list_registers_to_catalogue.append(original_data)

    substep_finish_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log([time_log_path],
                      '>>>>Subtep 1.1 - reading csv total elapsed time: ' +
                      str(substep_finish_time - substep_init_time) + '\n', '',
                      enco)
    repg.register_log([execution_log_path],
                      '>>>>Subtep 1.1 - reading csv total elapsed time: ' +
                      str(substep_finish_time - substep_init_time) + '\n', '',
                      enco)

    if (list_registers_to_catalogue == list()):
        repg.register_log(
            [time_log_path],
            '>>>> Prediction process finished: Observations were not found ' +
            str(substep_finish_time - substep_init_time) + '\n', '', enco)
        repg.register_log(
            [execution_log_path],
            '>>>> Prediction process finished: Obervations were not found ' +
            str(substep_finish_time - substep_init_time) + '\n', '', enco)
        print('>>>> Prediction process finished: Obervations were not found ')
    ''' Substep 1.2 - Concatenating read csv'''
    substep_init_time = datetime.datetime.fromtimestamp(time.time())
    df_data_to_catalogue = pd.concat(list_registers_to_catalogue)
    reco_pandas_features = []
    for feature in df_data_to_catalogue.columns:
        reco_pandas_features.append(feature)
    df_data_to_catalogue.columns = reco_pandas_features

    try:
        df_data_to_catalogue[name]
    except Exception as e:
        repg.register_log([
            execution_log_path
        ], '>> An Eception has happened: Incorrect name of feature with events '
                          + str(e) + ' ' + datetime.datetime.fromtimestamp(
                              time.time()).strftime('%Y-%m-%d %H:%M:%S') +
                          '\n', '', enco)
        print(
            '>> An Exception has happened, check configuration file: Incorrect name of feature with events "'
            + str(e) + '"')
        error_trace = "Full trace:\n" + str(traceback.format_exc())
        repg.register_log([execution_log_path], error_trace, '', enco)
        raise Exception(e)
    ''' Erasing indexes introduced by pandas, if any '''
    if 'index' in df_data_to_catalogue.columns:
        df_data_to_catalogue = df_data_to_catalogue.drop('index', axis=1)
    if 'Unnamed: 0' in df_data_to_catalogue.columns:
        df_data_to_catalogue = df_data_to_catalogue.drop('Unnamed: 0', axis=1)

    substep_finish_time = datetime.datetime.fromtimestamp(time.time())
    step_finish_time = datetime.datetime.fromtimestamp(time.time())
    total_time_step_1 = step_finish_time - step_init_time
    repg.register_log([
        time_log_path
    ], '>>>> Substep 1.2 - Loading observations from files into dataframes total elapsed time: '
                      + str(substep_finish_time - substep_init_time) + "\n",
                      '', enco)
    repg.register_log([execution_log_path], '>>>>Substep 1.2 ends ' +
                      step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '',
                      enco)
    repg.register_log([
        time_log_path
    ], '>>>> Step 1 - Reading and concatenating csv into dataframe total elapsed time: '
                      + str(total_time_step_1) + "\n", '', enco)
    repg.register_log([execution_log_path], '>>>>Step 1 ends ' +
                      step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '',
                      enco)
    ''' Step 2: Reading prediction models dictionary and preloading best pkl models'''
    step_init_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log(
        [execution_log_path],
        '>>>>Step 2 Reading models dict and preload best pkl models \n' +
        step_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '', enco)
    repg.register_log([time_log_path], '>>>>Step 2 starts:\n', '', enco)
    '''Getting dictionary features in order to recodify and the events to catalogue'''
    substep_init_time = datetime.datetime.fromtimestamp(time.time())
    dic_event_model, handler = auxf.open_dictionary_pickle_format_for_reading(
        ruta_modelos_prediccion)

    substep_finish_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log([
        time_log_path
    ], '>>>> Substep 2.1 - Reading dictionary with models total elapsed time: '
                      + str(substep_finish_time - substep_init_time) + "\n",
                      '', enco)
    repg.register_log([execution_log_path],
                      '>>>>Substep 2.1 Reading dictionary with models ends ' +
                      substep_finish_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
                      '', enco)
    '''Preloading models in memory'''
    substep_init_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log([execution_log_path],
                      '>>>>Substep 2.2 - Preloading best pkl models \n' +
                      substep_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
                      '', enco)
    list_features_to_catalogue = []
    dictionary_of_events_preloaded_models = {}
    print('''
                                          Events, target and predictions models
             ##############################################################################################
             ##                                                                                          ##'''
          )
    for event in dic_event_model.keys():
        dictionary_of_events_preloaded_models[event] = {}
        try:
            for target_trained in dic_event_model[event]:
                dictionary_of_events_preloaded_models[event][
                    target_trained] = {}
                best_model = joblib.load(dic_event_model[event][target_trained]
                                         [glod.get_model_path_key()])
                print(
                    "\t\t\t\t", dic_event_model[event][target_trained][
                        glod.get_model_path_key()])
                dictionary_of_events_preloaded_models[event][target_trained][
                    glod.get_best_model_key()] = best_model
                list_features_to_catalogue += dic_event_model[event][
                    target_trained][glod.get_current_features_key()]
        except Exception as e:
            print('''
             ##                                                                                          ##
             ##############################################################################################'''
                  )
            print(
                'The pkl neccesary for the prediction of the observations of '
                + event + ' was not found ')
            repg.register_log(
                [execution_log_path],
                'The pkl neccesary for the prediction of the observations of '
                + event + ' was not found ' +
                substep_finish_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '',
                enco)
            raise Exception(e)
    print('''
             ##                                                                                          ##
             ##############################################################################################'''
          )

    substep_finish_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log(
        [time_log_path],
        '>>>>Substep 2.2 - Preloading best pkl models total elapsed time: ' +
        str(substep_finish_time - substep_init_time) + '\n', '', enco)
    repg.register_log([execution_log_path],
                      '>>>>Substep 2.2 - Preloading best pkl models ends \n' +
                      substep_finish_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
                      '', enco)
    step_finish_time = datetime.datetime.fromtimestamp(time.time())
    total_time_step_2 = step_finish_time - step_init_time
    repg.register_log([
        time_log_path
    ], '>>>> Step 2 - Reading models dict and preload best pkl models total elapsed time: '
                      + str(total_time_step_2) + "\n", '', enco)
    repg.register_log([execution_log_path], '>>>>Step 2 ends \n' +
                      step_finish_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
                      '', enco)
    ''' Step 3: Classifying observations usin preloaded models '''

    maximum_number_of_observations_to_catalogue = len(df_data_to_catalogue)

    step_init_time = datetime.datetime.fromtimestamp(time.time())
    substep_init_time = datetime.datetime.fromtimestamp(time.time())

    repg.register_log([time_log_path], '>>>> Step 3 starts \n', '', enco)
    repg.register_log([execution_log_path],
                      '>>>>Step 3 - Predicting targets using best models \n' +
                      substep_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
                      '', enco)
    repg.register_log(
        [execution_log_path],
        '>>>>Substep 3.1 - Preparing global dataframe of results \n' +
        substep_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '', enco)
    observations = df_data_to_catalogue.iloc[
        0:maximum_number_of_observations_to_catalogue]
    events_to_predict = list(set(observations[name].values))

    #target to predict
    target_to_predict = config_parser.get(prediction_section,
                                          glod.get_target_parameter_name())

    #column for predictions
    prediction_column = target_to_predict + '_pred'

    df_global_predictions = pd.DataFrame(
        data=[], columns=[observation_number, prediction_column])

    substep_finish_time = datetime.datetime.fromtimestamp(time.time())
    repg.register_log([
        time_log_path
    ], '>>>>Subtep 3.1 - Preparing global dataframe of results total elapsed time: '
                      + str(substep_finish_time - substep_init_time) + "\n",
                      '', enco)
    repg.register_log([execution_log_path], '>>>>Substep 3.1 ends \n' +
                      substep_finish_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
                      '', enco)
    total_number_predicted_observations = 0
    final_list_events_to_predict = []
    for event in events_to_predict:
        substep_init_time = datetime.datetime.fromtimestamp(time.time())
        repg.register_log([time_log_path],
                          '>>>>Subtep 3.2 - Predicting targets for event ' +
                          event + ' \n', '', enco)
        repg.register_log(
            [execution_log_path],
            '>>>>Substep 3.2 - Predicting targets for event ' + event +
            substep_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n", '', enco)
        df_event = observations[observations[name] == event]
        df_event_obsnumber = pd.DataFrame(
            data=df_event[observation_number].values,
            columns=[observation_number])

        try:
            dic_event = dictionary_of_events_preloaded_models[event]
            total_number_predicted_observations += len(df_event)

            if target_to_predict not in df_event.columns:
                repg.register_log(
                    [execution_log_path],
                    '>> ###Error: The target ' + target_to_predict +
                    ' does not exist in the dataset of the event ' + event +
                    '\n\n', '', enco)
                raise Exception(
                    '>> ###Error: The target ' + target_to_predict +
                    ' does not exist in the dataset of the event ' + event)

            if target_to_predict in dic_event:
                repg.register_log(
                    [execution_log_path],
                    '>> The event ' + event + ' (with ' + str(len(df_event)) +
                    ' observations), has a model for predict target labels \n',
                    '', enco)
                features_event = dic_event_model[event][target_to_predict][
                    glod.get_current_features_key()]

                model_event = dictionary_of_events_preloaded_models[event][
                    target_to_predict][
                        glod.get_best_model_key()]  #se referencia al modelo
                predictions = model_event.predict(df_event[features_event])
                df_event_obsnumber[prediction_column] = predictions

                recatalogued_predictions = []
                if dic_event_model[event][target_trained][
                        glod.get_learning_key()] == glod.get_unsupervised_name(
                        ):
                    for pred in predictions:
                        recatalogued_predictions.append(
                            dic_event_model[event][target_trained][
                                glod.get_reasignment_dict_key()][pred])
                    predictions = recatalogued_predictions
                df_event_obsnumber[prediction_column] = predictions
                df_event_obsnumber[name] = event

            else:
                repg.register_log(
                    [execution_log_path],
                    '>> The event ' + event + ' (with ' + str(len(df_event)) +
                    ' observations), has not models for predicting target (' +
                    target_to_predict + '). Taking original prediction \n', '',
                    enco)
                total_number_predicted_observations += len(df_event)
                df_event_obsnumber[prediction_column] = df_event[
                    target_to_predict].values

            final_list_events_to_predict.append(event)

        except Exception as excep:  #no predictions models
            repg.register_log([execution_log_path],
                              '>> The prediction process has been aborted ' +
                              str(excep) + '\n', '', enco)
            #raise Exception(e)

        df_global_predictions = pd.concat(
            [df_global_predictions, df_event_obsnumber])
        df_global_predictions[observation_number] = df_global_predictions[
            observation_number].apply(int)

        substep_finish_time = datetime.datetime.fromtimestamp(time.time())
        repg.register_log([time_log_path],
                          '>>>>Substep 3.2 - Predicting targets for event ' +
                          event + ' total elapsed time: ' +
                          str(substep_finish_time - substep_init_time) + "\n",
                          '', enco)
        repg.register_log([
            time_log_path
        ], '>>>>Substep 3.2 - Estimated elapsed time predicting one observation for event '
                          + event + ': ' + str(
                              float((substep_finish_time -
                                     substep_init_time).total_seconds()) /
                              float(len(df_event))) + "\n", '', enco)
        repg.register_log(
            [execution_log_path],
            '>>>>Substep 3.2 - Predicting targets for event ' + event +
            ' ends ' + substep_init_time.strftime('%Y-%m-%d %H:%M:%S') + "\n",
            '', enco)

    type_observation_number = df_global_predictions[observation_number].dtypes
    observations[observation_number] = observations[observation_number].astype(
        type_observation_number)
    observations = pd.merge(observations,
                            df_global_predictions,
                            on=[observation_number, name])

    for not_proc_event in set.difference(set(events_to_predict),
                                         set(final_list_events_to_predict)):
        repg.register_log(
            [execution_log_path], '>> WARNING: Event ' + not_proc_event +
            ' has not models, but validation/unkown samples dataset was provided\n',
            '', enco)
        print(
            "**WARNING**: Event " + not_proc_event +
            " has not models, but validation/unkown samples dataset was provided"
        )

    for event in events_to_predict:
        print('\n-> Event: ', event)
        df_event = observations[observations[name] == event]
        path_to_predicted_data_root = 'Prediction_models'
        path_to_predicted_data = config_parser.get(
            prediction_section, glod.get_path_predicted_data_key())

        #Accuracy print
        if (event in dic_event_model
                and target_to_predict in dic_event_model[event]):
            report_dict[event] = {target_to_predict: {}}
            print(
                '\t\tObservations with known target ',
                len(df_event[
                    df_event[target_to_predict] != label_non_catalogued_data]))
            print(
                '\t\tObservations with unknown target ',
                len(df_event[df_event[target_to_predict] ==
                             label_non_catalogued_data]))
            df_observaciones = df_event[
                df_event[target_to_predict] != label_non_catalogued_data]
            total_obs = len(df_observaciones)

            #computing confusion matrix
            pred_labels = list(df_observaciones[prediction_column].values)
            true_labels = list(df_observaciones[target_to_predict].values)

            if (pred_labels != [] and true_labels != []):
                df_observaciones_temp = df_observaciones[
                    df_observaciones[target_to_predict] ==
                    df_observaciones[prediction_column]]
                total_aciertos = len(df_observaciones_temp)

                confusion_matrix = metr.get_confusion_matrix(
                    true_labels, pred_labels,
                    sorted(
                        list(set(df_observaciones[target_to_predict].values))))
                confusion_matrix_name = 'confusion_matrix_' + event + '_' + target_to_predict
                metr.save_confusion_matrix(
                    confusion_matrix,
                    sorted(
                        list(set(df_observaciones[target_to_predict].values))),
                    os.path.join(path_to_predicted_data_root,
                                 confusion_matrix_name), 'png')

                report_dict[event][target_to_predict][
                    glod.get_best_model_key()] = str(
                        dictionary_of_events_preloaded_models[event]
                        [target_to_predict][glod.get_best_model_key()])
                report_dict[event][target_to_predict][
                    'Correct'] = total_aciertos
                report_dict[event][target_to_predict]['Total'] = len(df_event[
                    df_event[target_to_predict] != label_non_catalogued_data])
                report_dict[event][target_to_predict][
                    glod.get_accuracy_parameter_name()] = float(
                        float(total_aciertos) / float(
                            len(df_event[df_event[target_to_predict] !=
                                         label_non_catalogued_data])))
                report_dict[event][target_to_predict][
                    'target_to_predict_cm'] = os.path.join(
                        path_to_predicted_data_root,
                        confusion_matrix_name) + '.png'
                report_dict[event][target_to_predict]['Predicted'] = len(
                    df_event[df_event[target_to_predict] ==
                             label_non_catalogued_data])

        else:
            total_obs = 0

        if (total_obs != 0):
            repg.register_log(
                [time_log_path],
                '>>>>Substep 3.2 Extra - Accuracy of the model for event ' +
                event + ' and target ' + target_to_predict + '(' +
                str(float(total_aciertos)) + '/' + str(float(total_obs)) +
                '): ' + str(float(total_aciertos) / float(total_obs)) + "\n",
                '', enco)
            repg.register_log(
                [execution_log_path],
                '>>>>Substep 3.2 Extra - Accuracy of the model for event ' +
                event + ' and target ' + target_to_predict + '(' +
                str(float(total_aciertos)) + '/' + str(float(total_obs)) +
                '): ' + str(float(total_aciertos) / float(total_obs)) + "\n",
                '', enco)
        else:
            repg.register_log(
                [time_log_path],
                '>>>>Substep 3.2 Extra - Accuracy of the model for event ' +
                event + ' and target ' + target_to_predict +
                ': not calculated (no observations found) \n', '', enco)
            repg.register_log(
                [execution_log_path],
                '>>>>Substep 3.2 Extra - Accuracy of the model for event ' +
                event + ' and target ' + target_to_predict +
                ': not calculated (no observations found) \n', '', enco)

        print('''
                                            Clasification for known targets results
             ##############################################################################################'
             ##                                                                                          ##'''
              )
        if (total_obs != 0):
            print('\t\t\t\tCorrect predictions performed for ' +
                  target_to_predict + ' of event ' + event + ': ' +
                  str(total_aciertos) + '/' + str(total_obs))
        else:
            print('\t\t\t\tNo predictions performed for severity of event ' +
                  event)

        print('\t\t\t\tCheck output data at: ', path_to_predicted_data)
        print(
            '\t\t\t\tCheck predictions log for accuracy summary and more information'
        )
        print('\n\t\t\t\t\t\tThanks for using RADSSo')
        print('''             
             ##                                                                                          ##
             ##############################################################################################'''
              )

        #Determinamos cuantas predicciones sobre datos desconocidos se han realizado
        if (event in dic_event_model
                and target_to_predict in dic_event_model[event]):
            df_observaciones = df_event[df_event[target_to_predict] ==
                                        label_non_catalogued_data]
            total_obs = len(df_observaciones)

        print('''
                                                Prediction of unknown target results
             ##############################################################################################'
             ##                                                                                          ##'''
              )
        print(
            '\t\t\t\tTotal of predictions performed for ' + target_to_predict +
            ': ', str(total_obs))
        print('\t\t\t\tCheck output data at: ', path_to_predicted_data)
        print(
            '\t\t\t\tCheck predictions log for accuracy summary and more information'
        )
        print('\n\t\t\t\t\t\tThanks for using RADSSo')
        print('''             
             ##                                                                                          ##
             ##############################################################################################\n'''
              )
        try:
            if (event in report_dict):
                if (pred_labels != [] and true_labels != []):
                    repg.create_report_prediction(
                        report_dict, [event, target_to_predict],
                        auxiliary_directory_filename, 'Prediction_models',
                        enco)
        except Exception as e:
            print(''''**********************
                   ****Critical Exception****
                   **************************''')
            print(e)

    step_finish_time = datetime.datetime.fromtimestamp(time.time())
    total_time_step_3 = step_finish_time - step_init_time
    print('\n\n--- Total Elapsed time --- ' + str(total_time_step_3))

    repg.register_log(
        [time_log_path],
        '>>>>Step 3 - Predicting using best models total elapsed time: ' +
        str(total_time_step_3) + "\n", '', enco)

    repg.register_log([time_log_path],
                      '>>>> Number of observations processed by second ' + str(
                          float(total_number_predicted_observations) /
                          float(total_time_step_3.total_seconds())) + "\n", '',
                      enco)
    repg.register_log([time_log_path],
                      '>>>> Number of seconds by prediction ' + str(
                          float(total_time_step_3.total_seconds()) /
                          float(total_number_predicted_observations)) + "\n",
                      '', enco)

    repg.register_log(
        [time_log_path],
        '>>>> Recodification and Prediction Phase - total elapsed time: ' +
        str(total_time_step_1 + total_time_step_2 + total_time_step_3) + "\n",
        '', enco)

    observations.to_csv(path_to_predicted_data,
                        sep=input_files_delimiter_not_catalogued_data,
                        encoding=enco)
    return ()
Пример #5
0
def train_and_test_scikit_classifier(features,
                                     labels,
                                     splits,
                                     model_class,
                                     model_args=None,
                                     gridcv_params=None,
                                     gridcv_args=None,
                                     fit_args=None,
                                     feature_norm=True,
                                     return_models=False):
    """Routine for constructing, training and testing a correlation classifier.

       Arguments:
           features: (K, M) feature array where K = number of stimuli and M = number of features
           labels: length-K vector of labels to be predicted
           splits: splits of data (constructed by calling the get_splits function)
           model_class: the actual live python object that is the classifier "class" object
           model_args: dictionary of arguments for instantiating the classifier class object
           gridcv_params: dictionary of params for applying gridSearch cross-validation to
           gridcv_args: additional arguments to the GridSearchCV construction function
           fit_args: additional arguments to send to the model's fit method during fitting
           feature_norm: apply featurewise_norm to train features (and to test
               features using the statistics fit on the training set)
           return_models: return actual trained models for each split

       Returns:
           (results, train_classes) tuple: `results` is a dictionary summary of
           training and testing results (keys 'train', 'test',
           'training_sidedata', and optionally 'models'); `train_classes` is
           the class set derived from the training splits.

    """
    train_confmats = []
    test_confmats = []

    # Avoid mutable-default pitfalls: normalize optional dict arguments.
    if model_args is None:
        model_args = {}
    if fit_args is None:
        fit_args = {}

    training_sidedata = []
    train_classes = validate_splits(splits, labels)

    models = []

    for split in splits:

        # here we instantiate the general classifier, whatever it is,
        # optionally wrapped in a grid-search cross-validation meta-estimator
        model = model_class(**model_args)
        if gridcv_params is not None:
            if gridcv_args is None:
                gridcv_args = {}
            model = GridSearchCV(model, gridcv_params, **gridcv_args)

        train_inds = split['train']
        test_inds = split['test']
        train_features = features[train_inds]
        train_labels = labels[train_inds]
        test_features = features[test_inds]
        test_labels = labels[test_inds]

        # BUGFIX: previously `sidedata` was only created when feature_norm was
        # True, so `sidedata['classes_'] = ...` below raised a NameError (or
        # silently mutated the previous split's dict) when feature_norm=False,
        # and `training_sidedata` stayed empty. Always create one fresh dict
        # per split and record it.
        sidedata = {}
        if feature_norm:
            train_features, fmean, fvar = featurewise_norm(train_features)
            sidedata['fmean'] = fmean
            sidedata['fvar'] = fvar
        training_sidedata.append(sidedata)

        model.fit(train_features, train_labels, **fit_args)
        classes_ = model.classes_
        # Fitted classes must agree with the classes implied by the splits.
        assert set(model.classes_) == set(train_classes)
        sidedata['classes_'] = classes_

        train_predictions = model.predict(train_features)
        train_confmat = metrics.get_confusion_matrix(train_predictions,
                                                     train_labels,
                                                     train_classes)
        train_confmats.append(train_confmat)

        if feature_norm:
            # Normalize test features with the statistics computed on the
            # training set (never re-fit on test data).
            test_features, _, _ = featurewise_norm(test_features,
                                                   fmean=fmean,
                                                   fvar=fvar)

        test_predictions = model.predict(test_features)
        test_confmat = metrics.get_confusion_matrix(test_predictions,
                                                    test_labels, train_classes)
        test_confmats.append(test_confmat)

        if return_models:
            models.append(model)

    # Aggregate per-split confusion matrices into overall result summaries.
    train_confmats = np.array(train_confmats)
    train_results = metrics.evaluate_results(train_confmats, train_classes)
    test_confmats = np.array(test_confmats)
    test_results = metrics.evaluate_results(test_confmats, train_classes)
    results = {
        'train': train_results,
        'test': test_results,
        'training_sidedata': training_sidedata
    }
    if return_models:
        results['models'] = models

    return results, train_classes