Example #1
def main(
        selected_pd="JetHT",
        cutoff_eventlumi=False,
        is_dropna=True,
        is_fillna_zero=True,
        data_preprocessing_mode='minmaxscalar',
        DATA_SPLIT_TRAIN=[1.0 for i in range(3)],
    ):
    # setting
    model_name = "OneClassSVM_{}_f{}".format(selected_pd, FEATURE_SET_NUMBER)

    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_GOOD_DATA_DIRECTORY, cutoff_eventlumi=cutoff_eventlumi)
    df_bad = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_BAD_DATA_DIRECTORY, cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0)
        df_bad = df_bad.fillna(0)
    x = df_good[features]
    x_train_full, x_valid, x_test = utility.split_dataset(x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate((np.full(x_test.shape[0], 0), np.full(df_bad[features].shape[0], 1)))
    x_test = np.concatenate([x_test, df_bad[features].to_numpy()])

    model_list = [
        svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
        for i in range(len(DATA_SPLIT_TRAIN))
    ]

    for dataset_fraction, model in zip(DATA_SPLIT_TRAIN, model_list):
        print("Model: {}, Chunk of Training Dataset fraction: {}".format(model_name, dataset_fraction))

        x_train = x_train_full[:int(dataset_fraction*len(x_train_full))]
        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0,1))
        if data_preprocessing_mode == 'normalize':
            x_train_tf = normalize(x_train, norm='l1')
            x_valid_tf = normalize(x_valid, norm='l1')
            x_test_tf = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train_tf = transformer.transform(x_train)
            x_valid_tf = transformer.transform(x_valid)
            x_test_tf = transformer.transform(x_test)
        model.fit(x_train_tf)
        try:
            file_eval = open('report/reco/eval/{} {}.txt'.format(model_name, dataset_fraction), 'w')
        except FileNotFoundError:
            os.makedirs("./report/reco/eval/")
            file_eval = open('report/reco/eval/{} {}.txt'.format(model_name, dataset_fraction), 'w')
        file_eval.write("fpr tpr threshold\n")
        fprs, tprs, thresholds = roc_curve(y_test, -model.decision_function(x_test_tf))
        for fpr, tpr, threshold in zip(fprs, tprs, thresholds):
            file_eval.write("{} {} {}\n".format(fpr, tpr, threshold))
        file_eval.close()

        print("AUC {}".format(auc(fprs, tprs)))
Example #2
def error_features(
    selected_pd="JetHT",
    Autoencoder=VanillaAutoencoder,
    model_name="Vanilla",
    number_model=1,
    include_bad_failure=False,
    cutoff_eventlumi=False,
    is_dropna=True,
    is_fillna_zero=True,
    BS=2**15,
    data_preprocessing_mode='minmaxscalar',
    gpu_memory_growth=True,
    dir_log='report/reco',
):
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    if include_bad_failure:
        df_bad_human = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_BAD_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad_failure = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_FAILURE_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad = pd.concat([df_bad_human, df_bad_failure], ignore_index=True)
    else:
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY,
                                   cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0.0)
        df_bad = df_bad.fillna(0.0)
    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate([
        np.full(x_test_good.shape[0], 0.0),
        np.full(df_bad[features].shape[0], 1.0)
    ])
    x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])

    x_train = x_train_full

    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0, 1))
    if data_preprocessing_mode == 'normalize':
        x_test_good_tf = normalize(x_test_good, norm='l1')
        x_test_bad_tf = normalize(df_bad[features].to_numpy(), norm='l1')
    else:
        transformer.fit(x_train)
        x_test_good_tf = transformer.transform(x_test_good)
        x_test_bad_tf = transformer.transform(df_bad[features].to_numpy())

    autoencoder = Autoencoder(
        input_dim=[len(features)],
        model_name="{}_model_{}_f{}_{}".format(model_name, selected_pd,
                                               FEATURE_SET_NUMBER,
                                               number_model),
        batch_size=BS,
    )
    autoencoder.restore()

    vec_avg_sd_good = np.mean(autoencoder.get_sd(x_test_good_tf), axis=0)
    vec_avg_sd_bad = np.mean(autoencoder.get_sd(x_test_bad_tf), axis=0)
    vec_sum_sd_good = np.sum(autoencoder.get_sd(x_test_good_tf), axis=0)
    vec_sum_sd_bad = np.sum(autoencoder.get_sd(x_test_bad_tf), axis=0)
    # visualize
    x = range(1, len(features) + 1)

    fig, axs = plt.subplots(2, 1, constrained_layout=True)
    axs[0].plot(x, vec_avg_sd_good)
    axs[0].set_title('Good LS')
    axs[0].set_xlabel("Feature Number")
    axs[0].set_ylabel(r"$|x - \tilde{x}|^2$")

    axs[1].plot(x, vec_avg_sd_bad)
    axs[1].set_title('Bad LS')
    axs[1].set_xlabel("Feature Number")
    axs[1].set_ylabel(r"$|x - \tilde{x}|^2$")

    fig.suptitle(
        "Average reconstruction error over testing sample ({}, {})".format(
            selected_pd, model_name))
    plt.savefig('avg_sd_{}_{}_f{}_{}.png'.format(model_name, selected_pd,
                                                 FEATURE_SET_NUMBER,
                                                 number_model))

    fig, axs = plt.subplots(2, 1, constrained_layout=True)
    axs[0].plot(x, vec_sum_sd_good)
    axs[0].set_title('Good LS')
    axs[0].set_xlabel("Feature Number")
    axs[0].set_ylabel(r"$|x - \tilde{x}|^2$")

    axs[1].plot(x, vec_sum_sd_bad)
    axs[1].set_title('Bad LS')
    axs[1].set_xlabel("Feature Number")
    axs[1].set_ylabel(r"$|x - \tilde{x}|^2$")

    fig.suptitle(
        "Sum reconstruction error over testing sample ({}, {})".format(
            selected_pd, model_name))
    plt.savefig('sum_sd_{}_{}_f{}_{}.png'.format(model_name, selected_pd,
                                                 FEATURE_SET_NUMBER,
                                                 number_model))

    # Print the feature names in a few hand-picked index ranges for inspection.
    print(
        features[48:58],
        '\n',
        features[78:85],
        '\n',
        features[85:95],
        '\n',
        features[99:108],
        '\n',
    )
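The two figures plot the per-feature reconstruction error |x - x~|^2, averaged and summed over the test sample. A sketch of those quantities in plain NumPy, assuming get_sd returns the element-wise squared error; `reconstruct` below is a hypothetical stand-in for the autoencoder's forward pass:

import numpy as np

def squared_error_profile(x, reconstruct):
    """Per-feature squared reconstruction error, averaged and summed over samples."""
    x_hat = reconstruct(x)            # shape (n_samples, n_features)
    sd = np.square(x - x_hat)         # |x - x~|^2 per element
    return np.mean(sd, axis=0), np.sum(sd, axis=0)

# Illustrative usage with a noisy identity "reconstruction":
rng = np.random.RandomState(0)
x = rng.rand(256, 10)
vec_avg_sd, vec_sum_sd = squared_error_profile(x, lambda a: a + 0.01 * rng.randn(*a.shape))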
Example #3
def compute_ms_dist(
    selected_pd="JetHT",
    Autoencoder=VanillaAutoencoder,
    model_name="Vanilla",
    number_model=1,
    include_bad_failure=False,
    cutoff_eventlumi=False,
    is_dropna=True,
    is_fillna_zero=True,
    BS=2**15,
    data_preprocessing_mode='minmaxscalar',
    gpu_memory_growth=True,
    dir_log='report/reco',
):
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    if include_bad_failure:
        df_bad_human = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_BAD_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad_failure = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_FAILURE_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad = pd.concat([df_bad_human, df_bad_failure], ignore_index=True)
    else:
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY,
                                   cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0.0)
        df_bad = df_bad.fillna(0.0)
    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate([
        np.full(x_test_good.shape[0], 0.0),
        np.full(df_bad[features].shape[0], 1.0)
    ])
    x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])

    x_train = x_train_full

    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0, 1))
    if data_preprocessing_mode == 'normalize':
        x_test_good_tf = normalize(x_test_good, norm='l1')
        x_test_bad_tf = normalize(df_bad[features].to_numpy(), norm='l1')
    else:
        transformer.fit(x_train)
        x_test_good_tf = transformer.transform(x_test_good)
        x_test_bad_tf = transformer.transform(df_bad[features].to_numpy())
    n_test = int(FRAC_TEST * len(x))
    run_good = df_good['runId'].iloc[-n_test:].to_numpy()
    lumi_good = df_good['lumiId'].iloc[-n_test:].to_numpy()
    run_bad, lumi_bad = df_bad['runId'].to_numpy(), df_bad['lumiId'].to_numpy()

    autoencoder = Autoencoder(
        input_dim=[len(features)],
        summary_dir="model/reco/summary",
        model_name="{}_model_{}_f{}_{}".format(model_name, selected_pd,
                                               FEATURE_SET_NUMBER,
                                               number_model),
        batch_size=BS,
    )
    autoencoder.restore()

    with open(
            os.path.join(
                dir_log,
                'good_totalSE_{}_{}_f{}_{}.txt'.format(model_name, selected_pd,
                                                       FEATURE_SET_NUMBER,
                                                       number_model)),
            'w') as f:
        f.write('total_se run lumi\n')
        for good_totalsd, run, lumi in zip(
                autoencoder.get_sd(x_test_good_tf, scalar=True), run_good,
                lumi_good):
            f.write('{} {} {}\n'.format(good_totalsd, run, lumi))
    with open(
            os.path.join(
                dir_log,
                'bad_totalSE_{}_{}_f{}_{}.txt'.format(model_name, selected_pd,
                                                      FEATURE_SET_NUMBER,
                                                      number_model)),
            'w') as f:
        f.write('total_se run lumi\n')
        for bad_totalsd, run, lumi in zip(
                autoencoder.get_sd(x_test_bad_tf, scalar=True), run_bad,
                lumi_bad):
            f.write('{} {} {}\n'.format(bad_totalsd, run, lumi))
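The run/lumi bookkeeping above works only if split_dataset takes the last FRAC_TEST fraction of rows as the test split, so the same tail slice of df_good recovers the matching runId/lumiId values. A minimal sketch of that assumption; split_dataset and the fractions here are illustrative stand-ins, not the project's utility code:

import numpy as np
import pandas as pd

FRAC_TEST, FRAC_VALID = 0.2, 0.2  # assumed values for illustration

def split_dataset(x, frac_test, frac_valid):
    # Train / valid / test taken as consecutive slices, test at the tail.
    n = len(x)
    n_test, n_valid = int(frac_test * n), int(frac_valid * n)
    x = np.asarray(x)
    return x[:n - n_valid - n_test], x[n - n_valid - n_test:n - n_test], x[n - n_test:]

df = pd.DataFrame({"runId": range(10), "lumiId": range(10), "f0": np.arange(10.0)})
_, _, x_test_good = split_dataset(df[["f0"]], FRAC_TEST, FRAC_VALID)
run_good = df["runId"].iloc[-int(FRAC_TEST * len(df)):].to_numpy()
assert len(run_good) == len(x_test_good)   # tail slice lines up with the test split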
Example #4
def main(
    selected_pd="JetHT",
    include_bad_failure=False,
    cutoff_eventlumi=False,
    is_dropna=True,
    is_fillna_zero=True,
    BS=2**15,
    EPOCHS=1200,
    data_preprocessing_mode='minmaxscalar',
    DATA_SPLIT_TRAIN=[1.0 for i in range(10)],
    gpu_memory_growth=True,
):
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    if include_bad_failure:
        df_bad_human = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_BAD_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad_failure = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_FAILURE_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad = pd.concat([df_bad_human, df_bad_failure], ignore_index=True)
    else:
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY,
                                   cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0.0)
        df_bad = df_bad.fillna(0.0)
    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate([
        np.full(x_test_good.shape[0], 0.0),
        np.full(df_bad[features].shape[0], 1.0)
    ])
    x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])

    file_auc = open('report/reco/eval/roc_auc_{}.txt'.format(selected_pd), 'w')
    file_auc.write("model_name data_fraction roc_auc\n")
    for model_name, Autoencoder in zip(
        [
            "SparseContractive", "SparseVariational", "ContractiveVariational",
            "Standard"
        ],  # [ "Vanilla", "Sparse", "Contractive", "Variational"], 
        [
            SparseContractiveAutoencoder, SparseVariationalAutoencoder,
            ContractiveVariationalAutoencoder, StandardAutoencoder
        ]  # [ VanillaAutoencoder, SparseAutoencoder, ContractiveAutoencoder, VariationalAutoencoder], 
    ):
        model_list = [
            Autoencoder(
                input_dim=[len(features)],
                summary_dir="model/reco/summary",
                model_name="{}_model_{}_f{}_{}".format(model_name, selected_pd,
                                                       FEATURE_SET_NUMBER, i),
                batch_size=BS,
                gpu_memory_growth=gpu_memory_growth,
            ) for i in range(1, len(DATA_SPLIT_TRAIN) + 1)
        ]
        for dataset_fraction, autoencoder in zip(DATA_SPLIT_TRAIN, model_list):
            print("Model: {}, Chunk of Training Dataset fraction: {}".format(
                autoencoder.model_name, dataset_fraction))
            file_log = open(
                'report/reco/logs/{}.txt'.format(autoencoder.model_name), 'w')
            file_log.write("EP loss_train loss_valid\n")

            x_train = x_train_full[:int(dataset_fraction * len(x_train_full))]
            print(
                "Data # training: {}, # validation: {}, # testing good {}, # testing bad {}"
                .format(
                    x_train.shape[0],
                    x_valid.shape[0],
                    x_test_good.shape[0],
                    df_bad[features].shape[0],
                ))
            # Data Preprocessing
            if data_preprocessing_mode == 'standardize':
                transformer = StandardScaler()
            elif data_preprocessing_mode == 'minmaxscalar':
                transformer = MinMaxScaler(feature_range=(0, 1))
            if data_preprocessing_mode == 'normalize':
                x_train_tf = normalize(x_train, norm='l1')
                x_valid_tf = normalize(x_valid, norm='l1')
                x_test_tf = normalize(x_test, norm='l1')
            else:
                transformer.fit(x_train)
                x_train_tf = transformer.transform(x_train)
                x_valid_tf = transformer.transform(x_valid)
                x_test_tf = transformer.transform(x_test)
            autoencoder.init_variables()
            for EP in range(EPOCHS):
                x_train_shuf = shuffle(x_train_tf)
                for iteration_i in range(int(len(x_train_shuf) / BS)):
                    x_batch = x_train_shuf[BS * iteration_i:BS *
                                           (iteration_i + 1)]
                    autoencoder.train(x_batch)
                autoencoder.log_summary(x_train_tf, EP)
                file_log.write("{} {} {}\n".format(
                    EP + 1,
                    autoencoder.get_loss(x_train_tf)["loss_total"],
                    autoencoder.get_loss(x_valid_tf)["loss_total"]))
            file_log.close()

            try:
                file_eval = open(
                    'report/reco/eval/{} {}.txt'.format(
                        autoencoder.model_name, dataset_fraction), 'w')
            except FileNotFoundError:
                os.makedirs("./report/reco/eval/")
                file_eval = open(
                    'report/reco/eval/{} {}.txt'.format(
                        autoencoder.model_name, dataset_fraction), 'w')
            file_eval.write("fpr tpr threshold\n")
            ### Tracking Error
            sd_test = autoencoder.get_sd(x_test_tf, scalar=True)
            print(
                "Error tracking for model: {}, # NaN in SD: {}, # inf in SD: {}"
                .format(model_name,
                        int(np.isnan(sd_test).sum()),
                        int(np.isinf(sd_test).sum())))
            ###
            fprs, tprs, thresholds = roc_curve(y_test, sd_test)
            for fpr, tpr, threshold in zip(fprs, tprs, thresholds):
                file_eval.write("{} {} {}\n".format(fpr, tpr, threshold))
            file_eval.close()

            print("AUC {}".format(auc(fprs, tprs)))
            file_auc.write("{} {} {}\n".format(model_name, dataset_fraction,
                                               auc(fprs, tprs)))

            autoencoder.save()

    file_auc.close()
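One detail of the training loop above: range(int(len(x_train_shuf) / BS)) silently drops the trailing partial batch whenever the training set size is not a multiple of BS. A sketch of a batching helper that keeps the remainder, should that matter:

def iterate_minibatches(x, batch_size):
    """Yield consecutive mini-batches, including the final partial one."""
    for start in range(0, len(x), batch_size):
        yield x[start:start + batch_size]

# for x_batch in iterate_minibatches(shuffle(x_train_tf), BS):
#     autoencoder.train(x_batch)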
Example #5
def plot_bad_good_separate_case(
        selected_pds=["JetHT", "ZeroBias"],
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
    ):
    # styling
    COLORS_SEPARATE = ('green', 'red', 'purple', 'orange')
    HUMAN_LABELS_SEPARATE = ('Good', 'Bad_Human', 'Bad_FailureScenario', 'Bad_DCS')
    MARKERS = ('o', '^', '^', '^')

    for selected_pd in selected_pds:
        print("\n\n Processing {} \n\n".format(selected_pd))
        features = utility.get_full_features(selected_pd)
        df_good = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_GOOD_DATA_DIRECTORY)
        df_bad_human = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_BAD_DATA_DIRECTORY)
        df_bad_failure = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_FAILURE_DATA_DIRECTORY)
        df_bad_dcs = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_DCS_BAD_DATA_DIRECTORY)
        if is_dropna:
            df_good = df_good.dropna()
            df_bad_human = df_bad_human.dropna()
            df_bad_failure = df_bad_failure.dropna()
            df_bad_dcs = df_bad_dcs.dropna()
        elif is_fillna_zero:
            df_good = df_good.fillna(0)
            df_bad_human = df_bad_human.fillna(0)
            df_bad_failure = df_bad_failure.fillna(0)
            df_bad_dcs = df_bad_dcs.fillna(0)
        x = df_good[features]
        x_train_full, x_valid, x_test_good = utility.split_dataset(x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
        y_test = np.concatenate((
            np.full(x_test_good.shape[0], 0),
            np.full(df_bad_human[features].shape[0], 1),
            np.full(df_bad_failure[features].shape[0], 1),
            np.full(df_bad_dcs[features].shape[0], 1)
            ))
        x_test = np.concatenate([
            x_test_good,
            df_bad_human[features].to_numpy(),
            df_bad_failure[features].to_numpy(),
            df_bad_dcs[features].to_numpy(),
            ])

        x_train = x_train_full
        print("Data # training: {}, # validation: {}, # testing good {}, # testing bad_human {}, # testing bad_failure {}, # testing bad DCS {}".format(
            x_train.shape[0],
            x_valid.shape[0],
            x_test_good.shape[0],
            df_bad_human.shape[0],
            df_bad_failure.shape[0],
            df_bad_dcs.shape[0],
        ))
        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0,1))
        if data_preprocessing_mode == 'normalize':
            x_train = normalize(x_train, norm='l1')
            x_valid = normalize(x_valid, norm='l1')
            x_test = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train = transformer.transform(x_train)
            x_valid = transformer.transform(x_valid)
            x_test = transformer.transform(x_test)
        # Visualization section
        pca = PCA(n_components=2)
        # pca.fit(transformer.transform(df_good[features].to_numpy()))
        pca.fit(np.concatenate([
            transformer.transform(df_good[features].to_numpy()),
            transformer.transform(df_bad_human[features].to_numpy()),
            transformer.transform(df_bad_dcs[features].to_numpy()),
        ]))
        # visualize human
        x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
        x_labeled_bad_human = pca.transform(transformer.transform(df_bad_human[features].to_numpy()))
        x_labeled_bad_failure = pca.transform(transformer.transform(df_bad_failure[features].to_numpy()))
        x_labeled_bad_dcs = pca.transform(transformer.transform(df_bad_dcs[features].to_numpy()))
        fig, ax = plt.subplots()
        for color, x, group_label, marker in zip(
                COLORS_SEPARATE,
                [x_labeled_good, x_labeled_bad_human, x_labeled_bad_failure, x_labeled_bad_dcs],
                HUMAN_LABELS_SEPARATE, MARKERS):
            ax.scatter(
                x[:, 0], x[:, 1], alpha=0.2,
                c=color,
                marker=marker,
                label=group_label
            )
        ax.legend()
        plt.title('Labeled 2018 data ({})'.format(selected_pd))
        plt.xlabel("Principal component 1")
        plt.ylabel("Principal component 2")
        plt.savefig('{}_label_separate.png'.format(selected_pd), bbox_inches='tight')
        plt.ylim((-3,3))
        plt.xlim((-3,3))
        plt.savefig('{}_label_separate_short_range.png'.format(selected_pd), bbox_inches='tight')
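The projection convention in this function: the scaler is fit on the training split only, while the PCA basis is fit on the scaled union of the labeled groups, and every group is then projected through the same pair of fits. A compact sketch of that pattern (fit_projection is a hypothetical helper, not project code):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

def fit_projection(x_train, groups, n_components=2):
    """Scaler fit on x_train; PCA fit on the scaled union of `groups`."""
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(x_train)
    pca = PCA(n_components=n_components)
    pca.fit(np.concatenate([scaler.transform(g) for g in groups]))
    return lambda g: pca.transform(scaler.transform(g))

# Illustrative usage on synthetic arrays:
rng = np.random.RandomState(0)
project = fit_projection(rng.rand(50, 6), [rng.rand(20, 6), rng.rand(20, 6)])
print(project(rng.rand(5, 6)).shape)  # (5, 2)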
Example #6
def plot_subsystem3d(
        selected_pd="JetHT",
        interested_statuses={
            'hcal_hcal': 'hcal-hcal',
            'ecal_ecal': 'ecal-ecal',
            'tracker_track': 'tracker-track',
            'muon_muon': 'muon-muon'
        },
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
    ):
    # styling
    COLORS_SEPARATE = ('green', 'orange', 'red', 'purple', 'c')
    HUMAN_LABELS_SEPARATE = ('Good', 'Bad_HCAL', 'Bad_ECAL', 'Bad_TRACKER', 'Bad_MUON')
    MARKERS = ('o', '^', '^', '^', '^')

    print("\n\n Processing {} \n\n".format(selected_pd))
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_LABELED_SUBSYSTEM_GOOD_DATA_DIRECTORY)
    df_bad = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_LABELED_SUBSYSTEM_BAD_DATA_DIRECTORY)
    df_bad_hcal = df_bad.query('hcal_hcal == 0')
    df_bad_ecal = df_bad.query('ecal_ecal == 0')
    df_bad_traker = df_bad.query('tracker_track == 0')
    df_bad_muon = df_bad.query('muon_muon == 0')
    df_bad_human = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_BAD_DATA_DIRECTORY)
    df_bad_dcs = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_DCS_BAD_DATA_DIRECTORY)
    print("Before dropna; # Good:{} , # Bad:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad.shape[0], df_bad_hcal.shape[0], df_bad_ecal.shape[0], df_bad_traker.shape[0], df_bad_muon.shape[0]
    ))
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
        df_bad_hcal = df_bad_hcal.dropna()
        df_bad_ecal = df_bad_ecal.dropna()
        df_bad_traker = df_bad_traker.dropna()
        df_bad_muon = df_bad_muon.dropna()

        df_bad_human = df_bad_human.dropna()
        df_bad_dcs = df_bad_dcs.dropna()
    elif is_fillna_zero:
        df_good = df_good.fillna(0)
        df_bad = df_bad.fillna(0)
        df_bad_hcal = df_bad_hcal.fillna(0)
        df_bad_ecal = df_bad_ecal.fillna(0)
        df_bad_traker = df_bad_traker.fillna(0)
        df_bad_muon = df_bad_muon.fillna(0)

        df_bad_human = df_bad_human.fillna(0)
        df_bad_dcs = df_bad_dcs.fillna(0)
    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
                                            x,
                                            frac_test=FRAC_TEST,
                                            frac_valid=FRAC_VALID
                                        )
    y_test = np.concatenate((
        np.full(x_test_good.shape[0], 0),
        np.full(df_bad_hcal[features].shape[0], 1),
        np.full(df_bad_ecal[features].shape[0], 1),
        np.full(df_bad_traker[features].shape[0], 1),
        np.full(df_bad_muon[features].shape[0], 1),
    ))
    x_test = np.concatenate([
        x_test_good,
        df_bad_hcal[features].to_numpy(),
        df_bad_ecal[features].to_numpy(),
        df_bad_traker[features].to_numpy(),
        df_bad_muon[features].to_numpy(),
    ])

    x_train = x_train_full
    print("Before dropna; # Good:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad_hcal.shape[0], df_bad_ecal.shape[0], df_bad_traker.shape[0], df_bad_muon.shape[0]
    ))
    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0,1))
    if data_preprocessing_mode == 'normalize':
        x_train = normalize(x_train, norm='l1')
        x_valid = normalize(x_valid, norm='l1')
        x_test = normalize(x_test, norm='l1')
    else:
        transformer.fit(x_train)
        x_train = transformer.transform(x_train)
        x_valid = transformer.transform(x_valid)
        x_test = transformer.transform(x_test)
    # Visualization section
    pca = PCA(n_components=3)
    # pca.fit(transformer.transform(df_good[features].to_numpy()))
    pca.fit(np.concatenate([
        transformer.transform(df_good[features].to_numpy()),
        transformer.transform(df_bad_human[features].to_numpy()),
        transformer.transform(df_bad_dcs[features].to_numpy()),
    ]))

    # visualize human
    x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
    x_labeled_bad_hcal = pca.transform(transformer.transform(df_bad_hcal[features].to_numpy()))
    x_labeled_bad_ecal = pca.transform(transformer.transform(df_bad_ecal[features].to_numpy()))
    x_labeled_bad_tracker = pca.transform(transformer.transform(df_bad_traker[features].to_numpy()))
    x_labeled_bad_muon = pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))
    # fig, ax = plt.subplots()
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for color, x, group_label, marker in zip(
            COLORS_SEPARATE,
            [x_labeled_good, x_labeled_bad_hcal, x_labeled_bad_ecal, x_labeled_bad_tracker, x_labeled_bad_muon],
            HUMAN_LABELS_SEPARATE, MARKERS
        ):
        ax.scatter(
            x[:, 0], x[:, 1], x[:, 2], alpha=0.2,
            c=color,
            marker=marker,
            label=group_label
        )
    ax.legend()
    ax.set_title('Labeled 2018 data ({})'.format(selected_pd))
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")
    ax.set_zlabel("Principal component 3")
    plt.savefig('{}_subsystem_label.png'.format(selected_pd), bbox_inches='tight')
    # plt.ylim((-3,3))
    # plt.xlim((-3,3))
    # plt.savefig('{}_subsystem_label_short_range.png'.format(selected_pd), bbox_inches='tight')
    for elev in [0, 45, 90, 135, 180]:
        for azim in [0, 45, 90, 135, 180]:
            ax.view_init(elev, azim)
            plt.savefig('{}_subsystem_label_short_range({}{}).png'.format(selected_pd, elev, azim))
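The four .query calls above hard-code the keys that the interested_statuses argument already carries; a sketch of driving the subsystem splits from the dict instead (tiny synthetic frame for illustration):

import pandas as pd

df_bad = pd.DataFrame({
    'hcal_hcal': [0, 1, 1], 'ecal_ecal': [1, 0, 1],
    'tracker_track': [1, 1, 0], 'muon_muon': [1, 1, 1],
})
interested_statuses = {'hcal_hcal': 'hcal-hcal', 'ecal_ecal': 'ecal-ecal',
                       'tracker_track': 'tracker-track', 'muon_muon': 'muon-muon'}
# One bad subset per subsystem flag that is zero.
bad_by_subsystem = {col: df_bad.query('{} == 0'.format(col))
                    for col in interested_statuses}
print({col: len(df) for col, df in bad_by_subsystem.items()})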
Example #7
def plot_human_label(
        selected_pds=["ZeroBias", "JetHT", "EGamma", "SingleMuon"],
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
    ):
    # styling
    COLORS = ('green', 'blue')
    GROUP_LABELS = ('A', 'B')
    HUMAN_LABELS = ('Good', 'Bad')

    for selected_pd in selected_pds:
        print("\n\n Processing {} \n\n".format(selected_pd))
        features = utility.get_full_features(selected_pd)
        df_good = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_GOOD_DATA_DIRECTORY)
        df_bad = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_BAD_DATA_DIRECTORY)
        if is_dropna:
            df_good = df_good.dropna()
            df_bad = df_bad.dropna()
        elif is_fillna_zero:
            df_good = df_good.fillna(0)
            df_bad = df_bad.fillna(0)
        x = df_good[features]
        x_train_full, x_valid, x_test_good = utility.split_dataset(x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
        y_test = np.concatenate((np.full(x_test_good.shape[0], 0), np.full(df_bad[features].shape[0], 1)))
        x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])

        x_train = x_train_full
        print("Data # training: {}, # validation: {}, # testing good {}, # testing bad {}".format(
            x_train.shape[0],
            x_valid.shape[0],
            x_test_good.shape[0],
            df_bad.shape[0],
        ))
        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0,1))
        if data_preprocessing_mode == 'normalize':
            x_train = normalize(x_train, norm='l1')
            x_valid = normalize(x_valid, norm='l1')
            x_test = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train = transformer.transform(x_train)
            x_valid = transformer.transform(x_valid)
            x_test = transformer.transform(x_test)
        # Visualization section
        pca = PCA(n_components=2)
        pca.fit(np.concatenate([transformer.transform(df_good[features].to_numpy()), transformer.transform(df_bad[features].to_numpy())]))
        # visualize human
        x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
        x_labeled_bad = pca.transform(transformer.transform(df_bad[features].to_numpy()))
        fig, ax = plt.subplots()
        for color, x, group_label in zip(COLORS, [x_labeled_good, x_labeled_bad], HUMAN_LABELS):
            ax.scatter(
                x[:, 0], x[:, 1], alpha=0.8,
                c=color,
                label=group_label
            )
        ax.legend()
        plt.title('Labeled by Human ({})'.format(selected_pd))
        plt.xlabel("Principal component 1")
        plt.ylabel("Principal component 2")
        plt.savefig('{}_label.png'.format(selected_pd), bbox_inches='tight')
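Before reading much into a 2-component scatter like the one above, it is worth checking how much variance the projection actually keeps (Example #8 prints this); a minimal sketch on synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
x = rng.rand(200, 8)
pca = PCA(n_components=2).fit(x)
print("explained variance ratio:", pca.explained_variance_ratio_,
      "cumulative: {:.3f}".format(pca.explained_variance_ratio_.sum()))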
Example #8
def plot_subsystem(
        selected_pd="JetHT",
        interested_statuses={
            'hcal_hcal': 'hcal-hcal',
            'ecal_ecal': 'ecal-ecal',
            'tracker_track': 'tracker-track',
            'muon_muon': 'muon-muon'
        },
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
    ):
    # styling
    COLORS_SEPARATE = ('green', 'orange', 'red', 'purple', 'c')
    HUMAN_LABELS_SEPARATE = ('Good', 'Bad_HCAL', 'Bad_ECAL', 'Bad_TRACKER', 'Bad_MUON')
    MARKERS = ('o', '^', '^', '^', '^')

    print("\n\n Processing {} \n\n".format(selected_pd))
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_LABELED_SUBSYSTEM_GOOD_DATA_DIRECTORY)
    df_bad = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_LABELED_SUBSYSTEM_BAD_DATA_DIRECTORY)
    df_bad_hcal = df_bad.query('hcal_hcal == 0')
    df_bad_ecal = df_bad.query('ecal_ecal == 0')
    df_bad_traker = df_bad.query('tracker_track == 0')
    df_bad_muon = df_bad.query('muon_muon == 0')
    df_bad_human = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_BAD_DATA_DIRECTORY)
    df_bad_dcs = utility.read_data(selected_pd=selected_pd, pd_data_directory=PD_DCS_BAD_DATA_DIRECTORY)
    print("Before dropna; # Good:{} , # Bad:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad.shape[0], df_bad_hcal.shape[0], df_bad_ecal.shape[0], df_bad_traker.shape[0], df_bad_muon.shape[0]
    ))
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
        df_bad_hcal = df_bad_hcal.dropna()
        df_bad_ecal = df_bad_ecal.dropna()
        df_bad_traker = df_bad_traker.dropna()
        df_bad_muon = df_bad_muon.dropna()

        df_bad_human = df_bad_human.dropna()
        df_bad_dcs = df_bad_dcs.dropna()
    elif is_fillna_zero:
        df_good = df_good.fillna(0)
        df_bad = df_bad.fillna(0)
        df_bad_hcal = df_bad_hcal.fillna(0)
        df_bad_ecal = df_bad_ecal.fillna(0)
        df_bad_traker = df_bad_traker.fillna(0)
        df_bad_muon = df_bad_muon.fillna(0)

        df_bad_human = df_bad_human.fillna(0)
        df_bad_dcs = df_bad_dcs.fillna(0)
    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
                                            x,
                                            frac_test=FRAC_TEST,
                                            frac_valid=FRAC_VALID
                                        )
    y_test = np.concatenate((
        np.full(x_test_good.shape[0], 0),
        np.full(df_bad_hcal[features].shape[0], 1),
        np.full(df_bad_ecal[features].shape[0], 1),
        np.full(df_bad_traker[features].shape[0], 1),
        np.full(df_bad_muon[features].shape[0], 1),
    ))
    x_test = np.concatenate([
        x_test_good,
        df_bad_hcal[features].to_numpy(),
        df_bad_ecal[features].to_numpy(),
        df_bad_traker[features].to_numpy(),
        df_bad_muon[features].to_numpy(),
    ])

    x_train = x_train_full
    print("Before dropna; # Good:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad_hcal.shape[0], df_bad_ecal.shape[0], df_bad_traker.shape[0], df_bad_muon.shape[0]
    ))
    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0,1))
    if data_preprocessing_mode == 'normalize':
        x_train = normalize(x_train, norm='l1')
        x_valid = normalize(x_valid, norm='l1')
        x_test = normalize(x_test, norm='l1')
    else:
        transformer.fit(x_train)
        x_train = transformer.transform(x_train)
        x_valid = transformer.transform(x_valid)
        x_test = transformer.transform(x_test)
    # Visualization section
    pca = PCA(n_components=2)
    # pca.fit(transformer.transform(df_good[features].to_numpy()))
    pca.fit(np.concatenate([
        transformer.transform(df_good[features].to_numpy()),
        transformer.transform(df_bad_human[features].to_numpy()),
        transformer.transform(df_bad_dcs[features].to_numpy()),
    ]))

    ###
    print(pca.explained_variance_ratio_)
    ## For check inlier and outlier
    # filter_above_muon_malfunc = list(map(lambda x: True if x > 1.0 else False, pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))[:, 1]))
    # filter_below_muon_malfunc = list(map(lambda x: True if x < 1.0 else False, pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))[:, 1]))
    # print("Shape df_bad_muon before cut", df_bad_muon.shape)
    # print("Shape df_bad_muon outlier", df_bad_muon[filter_above_muon_malfunc].shape)
    # print("Shape df_bad_muon inlier", df_bad_muon[filter_below_muon_malfunc].shape)
    # print("Sample muon outlier \n", df_bad_muon[filter_above_muon_malfunc].sample(n=10)[['runId', 'lumiId']])
    # print("Sample muon inlier \n", df_bad_muon[filter_below_muon_malfunc].sample(n=10)[['runId', 'lumiId']])

    ## Component in eigen vector
    # N_FIRST_COMPONENT = 20
    # abs_st_components = list(map(lambda component, feature: {'feature': feature, 'component': component}, abs(pca.components_[0]), features))
    # sorted_abs_st_components = sorted(abs_st_components, key = lambda i: i['component'], reverse=True)
    # df_pc1 = pd.DataFrame(sorted_abs_st_components)
    # df_pc1['axis'] = 1
    # abs_nd_components = list(map(lambda component, feature: {'feature': feature, 'component': component}, abs(pca.components_[1]), features))
    # sorted_abs_nd_components = sorted(abs_nd_components, key = lambda i: i['component'], reverse=True)
    # df_pc2 = pd.DataFrame(sorted_abs_nd_components)
    # df_pc2['axis'] = 2 

    # df_pc = pd.concat([df_pc1, df_pc2], ignore_index=True)
    # df_pc.to_csv("pc_{}.csv".format(selected_pd))
    ###

    # visualize human
    x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
    x_labeled_bad_hcal = pca.transform(transformer.transform(df_bad_hcal[features].to_numpy()))
    x_labeled_bad_ecal = pca.transform(transformer.transform(df_bad_ecal[features].to_numpy()))
    x_labeled_bad_tracker = pca.transform(transformer.transform(df_bad_traker[features].to_numpy()))
    x_labeled_bad_muon = pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))
    fig, ax = plt.subplots()
    for color, x, group_label, marker in zip(
            COLORS_SEPARATE,
            [x_labeled_good, x_labeled_bad_hcal, x_labeled_bad_ecal, x_labeled_bad_tracker, x_labeled_bad_muon],
            HUMAN_LABELS_SEPARATE, MARKERS
        ):
        ax.scatter(
            x[:, 0], x[:, 1], alpha=0.2,
            c=color,
            marker=marker,
            label=group_label
        )
    ax.legend()
    plt.title('Labeled 2018 data ({})'.format(selected_pd))
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.savefig('{}_subsystem_label.png'.format(selected_pd), bbox_inches='tight')
    plt.ylim((-3,3))
    plt.xlim((-3,3))
    plt.savefig('{}_subsystem_label_short_range.png'.format(selected_pd), bbox_inches='tight')

def main():
    # setting
    model_name = "OneClass_SVM"
    selected_pds = ["ZeroBias", "JetHT", "EGamma", "SingleMuon"]
    data_preprocessing_mode = 'minmaxscalar'
    BS = 2**15
    EPOCHS = 1200
    is_fillna_zero = True

    for selected_pd in selected_pds:
        features = utility.get_full_features(selected_pd)
        df_good = utility.read_data(selected_pd=selected_pd,
                                    pd_data_directory=PD_GOOD_DATA_DIRECTORY)
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY)
        if is_fillna_zero:
            # Report the rows that contain at least one NaN before filling.
            print(df_good[df_good.isnull().any(axis=1)].index.tolist())
            df_good = df_good.fillna(0)
            df_bad = df_bad.fillna(0)
        x = df_good[features]
        x_train_full, x_valid, x_test_good = utility.split_dataset(
            x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
        y_test = np.concatenate(
            (np.full(x_test_good.shape[0],
                     0), np.full(df_bad[features].shape[0], 1)))
        x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])

        x_train = x_train_full
        print(
            "Data # training: {}, # validation: {}, # testing good {}, # testing bad {}"
            .format(
                x_train.shape[0],
                x_valid.shape[0],
                x_test_good.shape[0],
                df_bad[features].shape[0],
            ))
        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0, 1))
        if data_preprocessing_mode == 'normalize':
            x_train = normalize(x_train, norm='l1')
            x_valid = normalize(x_valid, norm='l1')
            x_test = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train = transformer.transform(x_train)
            x_valid = transformer.transform(x_valid)
            x_test = transformer.transform(x_test)

        # Visualization section
        pca = PCA(n_components=2)
        pca.fit(
            np.concatenate([
                transformer.transform(df_good[features].to_numpy()),
                transformer.transform(df_bad[features].to_numpy())
            ]))
        # visualize human
        x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
        x_labeled_bad = pca.transform(transformer.transform(df_bad[features].to_numpy()))
        fig, ax = plt.subplots()
        for color, x, group_label in zip(COLORS,
                                         [x_labeled_good, x_labeled_bad],
                                         GROUP_LABELS):
            ax.scatter(x[:, 0], x[:, 1], alpha=0.8, c=color, label=group_label)
        ax.legend()
        plt.title('Labeled by Human ({})'.format(selected_pd))
        plt.xlabel("Principal component 1")
        plt.ylabel("Principal component 2")
        plt.savefig('{}_label.png'.format(selected_pd), bbox_inches='tight')
        # random visual
        for rand_i in range(2):
            print("rand number {}".format(rand_i))
            rand_two_features = random.sample(features, 2)
            x_labeled_good = df_good[rand_two_features].to_numpy()
            x_labeled_bad = df_bad[rand_two_features].to_numpy()
            fig, ax = plt.subplots()
            for color, x, group_label in zip(COLORS,
                                             [x_labeled_good, x_labeled_bad],
                                             GROUP_LABELS):
                ax.scatter(x[:, 0],
                           x[:, 1],
                           alpha=0.8,
                           c=color,
                           label=group_label)
            ax.legend()
            plt.title('Labeled by Human ({})'.format(selected_pd))
            plt.xlabel("{}".format(rand_two_features[0]))
            plt.ylabel("{}".format(rand_two_features[1]))
            plt.savefig('{}_label_rand_{}.png'.format(selected_pd, rand_i),
                        bbox_inches='tight')
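Note that main() above references COLORS and GROUP_LABELS without defining them locally; they are presumably module-level constants, e.g. as in plot_human_label (Example #7):

COLORS = ('green', 'blue')
GROUP_LABELS = ('A', 'B')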