def __init__(self, unique_section_names, target_dir, mode):
        """
        Build a training dataset of log-mel feature vectors with a
        per-section integer label for conditioning.

        Args:
            unique_section_names: 1-D numpy array of section names found
                under ``target_dir`` (iterated and indexed via ``.shape[0]``).
            target_dir: base directory of one machine type.
            mode: development/evaluation switch forwarded to
                ``util.file_list_generator``.

        Side effects:
            Sets ``self.data`` to a (n_vectors, 1, n_frames, n_mels) array
            and ``self.labels`` to a (n_vectors,) int array of section
            indices (plain indices, not one-hot).
        """
        super().__init__()

        n_files_ea_section = []  # number of files for each section
        n_vectors_ea_file = []  # number of vectors for each file
        data = numpy.empty(
            (0, CONFIG["feature"]["n_frames"] * CONFIG["feature"]["n_mels"]),
            dtype=float,
        )

        for section_name in unique_section_names:
            # get file list for each section
            # all values of y_true are zero in training
            print("target_dir : %s" % (target_dir + "_" + section_name))
            files, _ = util.file_list_generator(
                target_dir=target_dir,
                section_name=section_name,
                dir_name="train",
                mode=mode,
            )
            print("number of files : %s" % (str(len(files))))

            n_files_ea_section.append(len(files))

            # extract features from audio files and
            # concatenate them into Numpy array.
            features, n_features = concat_features(files)

            data = numpy.append(data, features, axis=0)
            n_vectors_ea_file.append(n_features)

        # flatten the per-section lists of per-file vector counts into one
        # flat list ordered the same way the files were processed above
        n_vectors_ea_file = flatten(n_vectors_ea_file)

        # make target labels for conditioning
        # they are not one-hot vector!
        labels = numpy.zeros((data.shape[0]), dtype=int)
        start_index = 0
        file_idx = 0  # global index into the flattened n_vectors_ea_file
        for section_index in range(unique_section_names.shape[0]):
            for _ in range(n_files_ea_section[section_index]):
                # BUGFIX: use a global file index here. The original reused
                # the per-section loop counter to index the flattened
                # n_vectors_ea_file, so sections after the first read the
                # first section's per-file counts and mislabeled vectors
                # whenever files differ in length.
                n_vectors = n_vectors_ea_file[file_idx]
                labels[start_index : start_index + n_vectors] = section_index
                start_index += n_vectors
                file_idx += 1

        # 1D vector to 2D image (1ch)
        self.data = data.reshape(
            (
                data.shape[0],
                1,  # number of channels
                CONFIG["feature"]["n_frames"],
                CONFIG["feature"]["n_mels"],
            )
        )

        self.labels = labels
def fit_gamma_dist(model, target_dir, mode):
    """
    - Calculate anomaly scores over sections.
    - Fit gamma distribution for anomaly scores.
    - Save the parameters of the distribution.
    """
    # gather the anomaly score of every training file, section by section
    all_scores = []
    section_names = util.get_section_names(target_dir, dir_name="train")
    for section_index, section_name in enumerate(section_names):
        section_files, _ = util.file_list_generator(
            target_dir=target_dir,
            section_name=section_name,
            dir_name="train",
            mode=mode,
        )
        all_scores.extend(
            calc_anomaly_score(model, file_path=file_path, section_index=section_index)
            for file_path in section_files
        )

    dataset_scores = numpy.array(all_scores, dtype=numpy.float64)

    # fit gamma distribution for anomaly scores
    gamma_params = list(scipy.stats.gamma.fit(dataset_scores))

    # save the parameters of the distribution
    score_file_path = "{model}/score_distr_{machine_type}.pkl".format(
        model=CONFIG["model_directory"], machine_type=os.path.split(target_dir)[1]
    )
    joblib.dump(gamma_params, score_file_path)
# Exemplo n.º 3
# 0
def main():
    """
    Perform model evaluation.

    For every machine-type directory: load the trained model and its
    decision threshold, score each section's test files in both the
    source and target test domains, save the anomaly scores/decisions,
    and — in development mode only — compute evaluation metrics and
    write them to a result csv.
    """

    # check mode
    # "development": mode == True
    # "evaluation": mode == False
    mode = util.command_line_chk()  # constant: True or False
    if mode is None:
        sys.exit(-1)

    # make result directory
    os.makedirs(CONFIG["result_directory"], exist_ok=True)

    # load base_directory list (one entry per machine type)
    dir_list = util.select_dirs(config=CONFIG, mode=mode)

    # initialize lines in csv for AUC and pAUC
    csv_lines = []
    # "section": metrics of the current machine type; "all": across all
    performance = {"section": None, "all": None}

    # anomaly scores and decision results, filled in by test_section()
    score_list = {"anomaly": None, "decision": None}

    if mode:
        # metrics are only computed in development mode
        performance["all"] = []

    for idx, target_dir in enumerate(dir_list):
        print("===============================================")
        print("[%d/%d] %s" % (idx + 1, len(dir_list), target_dir))

        print("================ MODEL LOAD =================")
        # the machine type is the last path component of target_dir
        model = load_model(config=CONFIG,
                           machine_type=os.path.split(target_dir)[1])
        decision_threshold = calc_decision_threshold(target_dir)

        if mode:
            # results for each machine type
            csv_lines.append([os.path.split(target_dir)[1]
                              ])  # append machine type
            csv_lines.append([
                "section", "domain", "AUC", "pAUC", "precision", "recall",
                "F1 score"
            ])
            performance["section"] = []

        # evaluate both the source-domain and target-domain test sets
        for dir_name in ["source_test", "target_test"]:
            for section_name in util.get_section_names(target_dir,
                                                       dir_name=dir_name):
                # load test file list and ground-truth labels
                test_files, y_true = util.file_list_generator(
                    target_dir=target_dir,
                    section_name=section_name,
                    dir_name=dir_name,
                    mode=mode,
                )

                print(
                    "============== BEGIN TEST FOR A SECTION %s OF %s =============="
                    % (section_name, dir_name))
                # - perform test for a section
                # - anomaly scores and decision results are saved in score_list
                y_pred = test_section(model, test_files, decision_threshold,
                                      score_list)

                # save anomaly scores and decision results
                save_anomaly_score(score_list, target_dir, section_name,
                                   dir_name)

                if mode:
                    # evaluation_scores (tuple): auc, p_auc, prec, recall, f1_score
                    eval_scores = calc_evaluation_scores(
                        y_true, y_pred, decision_threshold)
                    # e.g. "section_00" -> "00", "source_test" -> "source"
                    csv_lines.append([
                        section_name.split("_", 1)[1],
                        dir_name.split("_", 1)[0],
                        *eval_scores,  # unpack
                    ])
                    performance["section"].append(eval_scores)
                    performance["all"].append(eval_scores)

                print(
                    "============ END OF TEST FOR A SECTION %s OF %s ============\n"
                    % (section_name, dir_name))

        if mode:
            # calculate averages for AUCs and pAUCs of this machine type
            csv_lines = calc_performance_section(performance["section"],
                                                 csv_lines)

        # release the model before loading the next machine type's
        del model

    if mode:
        # calculate averages for AUCs and pAUCs over all sections
        csv_lines = calc_performance_all(performance["all"], csv_lines)

        # output results
        save_result(csv_lines)
# Exemplo n.º 4
# 0
def main():
    """
    Perform model training and validation.

    For each machine-type directory: build a dataset from the training
    files, train and validate the model for the configured number of
    epochs, fit a gamma distribution to the training anomaly scores,
    and save the trained model.
    """
    # determine the run mode from the command line
    # "development": mode == True / "evaluation": mode == False
    mode = util.command_line_chk()  # constant: True or False
    if mode is None:
        sys.exit(-1)

    # ensure the model output directory exists
    os.makedirs(CONFIG["model_directory"], exist_ok=True)

    # one base directory per machine type
    dir_list = util.select_dirs(config=CONFIG, mode=mode)
    for dir_idx, target_dir in enumerate(dir_list):
        print("===============================================")
        print(f"[{dir_idx + 1}/{len(dir_list)}] {target_dir}")

        print("\n============== DATASET_GENERATOR ==============")
        # collect every training file of every section under "target_dir"
        train_files, _ = util.file_list_generator(
            target_dir=target_dir,
            section_name="*",
            dir_name="train",
            mode=mode,
        )
        dcase_dataset = DcaseDataset(train_files)  # generate dataset from file list.
        print("===============================================")

        print("\n=========== DATALOADER_GENERATOR ==============")
        train_loader, val_loader = get_dataloader(dcase_dataset)
        print("===============================================")

        print("\n================ MODEL TRAINING ===============")
        model = get_model().to(DEVICE)
        optimizer, _ = get_optimizer(model)
        # optimizer, scheduler = get_optimizer(model)  # optional

        # display summary of model through torchinfo
        summary(
            model,
            input_size=(
                CONFIG["training"]["batch_size"],
                CONFIG["feature"]["n_mels"] * CONFIG["feature"]["n_frames"],
            ),
        )

        # training loop: one training pass plus one validation pass per epoch
        for epoch in range(1, CONFIG["training"]["epochs"] + 1):
            print(f"Epoch {epoch:2d}: ", end="")
            training(
                model=model,
                data_loader=train_loader,
                optimizer=optimizer,
                # scheduler=scheduler  # optional
            )
            validation(model=model, data_loader=val_loader)

        del train_loader, val_loader  # delete the dataset for training.

        # fit gamma distribution for anomaly scores
        # and save the parameters of the distribution
        fit_gamma_dist(dcase_dataset, model, target_dir)

        print("============== SAVE MODEL ==============")
        save_model(
            model,
            model_dir=CONFIG["model_directory"],
            machine_type=os.path.split(target_dir)[1],
        )

        print("============== END TRAINING ==============")