Example #1
def get_data_by_balanced_folds(ASs, fold_idxs, required_num_samples=None):
    prev_autonomous_systems = global_vars.get('autonomous_systems')
    folds = {i: {'X_train': [], 'X_test': [], 'y_train': [], 'y_test': []} for i in range(global_vars.get('n_folds'))}
    for AS in ASs:
        global_vars.set('autonomous_systems', [AS])
        dataset = get_dataset('all')
        concat_train_val_sets(dataset)
        dataset = unify_dataset(dataset)
        if np.count_nonzero(dataset.X) == 0:
            print(f'dropped AS {AS} - no common handovers')
            continue
        try:
            if required_num_samples is not None:
                assert len(dataset.X) == required_num_samples
            for fold_idx in range(global_vars.get('n_folds')):
                folds[fold_idx]['X_train'].extend(dataset.X[fold_idxs[fold_idx]['train_idxs']])
                folds[fold_idx]['X_test'].extend(dataset.X[fold_idxs[fold_idx]['test_idxs']])
                folds[fold_idx]['y_train'].extend(dataset.y[fold_idxs[fold_idx]['train_idxs']])
                folds[fold_idx]['y_test'].extend(dataset.y[fold_idxs[fold_idx]['test_idxs']])
        except (IndexError, AssertionError):
            print(f'dropped AS {AS}')
    for fold in folds.values():
        for inner_key in fold:
            fold[inner_key] = np.stack(fold[inner_key], axis=0)
    global_vars.set('autonomous_systems', prev_autonomous_systems)
    return folds
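
A minimal self-contained sketch of the accumulate-then-stack pattern used above; the source arrays and single-fold index layout are invented for illustration:

import numpy as np

# Hypothetical stand-ins for the per-AS datasets and shared fold indices.
sources = [np.arange(12).reshape(6, 2), np.arange(100, 112).reshape(6, 2)]
fold_idxs = {0: {'train_idxs': np.array([0, 1, 2, 3]), 'test_idxs': np.array([4, 5])}}

folds = {0: {'X_train': [], 'X_test': []}}
for X in sources:
    folds[0]['X_train'].extend(X[fold_idxs[0]['train_idxs']])
    folds[0]['X_test'].extend(X[fold_idxs[0]['test_idxs']])
# Stacking turns each accumulated list of rows back into one array.
for inner_key in folds[0]:
    folds[0][inner_key] = np.stack(folds[0][inner_key], axis=0)
print(folds[0]['X_train'].shape)  # (8, 2)
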
Example #2
def find_optimal_samples_report(pretrained_model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}.pdf'
    if os.path.isfile(report_file_name):
        return
    eeg_chans = list(range(global_vars.get('eeg_chans')))
    plot_dict = OrderedDict()
    dataset = unify_dataset(dataset)
    for layer_idx, layer in list(enumerate(pretrained_model.children()))[global_vars.get('layer_idx_cutoff'):]:
        max_examples = get_max_examples_per_channel(dataset.X, layer_idx, pretrained_model)
        for chan_idx, example_idx in enumerate(max_examples):
            tf_data = []
            for eeg_chan in eeg_chans:
                tf_data.append(get_tf_data_efficient(dataset.X[example_idx][None, :, :], eeg_chan, 250))
            max_value = np.max(np.array(tf_data))
            class_str = ''
            if layer_idx >= len(list(pretrained_model.children())) - 3:
                class_str = f', class:{label_by_idx(chan_idx)}'
            plot_dict[(layer_idx, chan_idx)] = tf_plot(tf_data,
                                                       f'TF plot of example {example_idx} for layer '
                                                       f'{layer_idx}, channel {chan_idx}{class_str}', max_value)
            print(f'plotted most activating TF for layer {layer_idx}, channel {chan_idx}')

    img_paths = list(plot_dict.values())
    story = []
    story.append(Paragraph('<br />\n'.join([f'{x}:{y}' for x, y in pretrained_model._modules.items()]), style=styles["Normal"]))
    for im in img_paths:
        story.append(get_image(im))
    create_pdf_from_story(report_file_name, story)
    for im in img_paths:
        os.remove(im)
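
The layer loop above hinges on enumerating model.children() before slicing; a toy sketch with a throwaway torch model (the cutoff value here is an assumption):

import torch.nn as nn

model = nn.Sequential(nn.Conv1d(2, 4, kernel_size=3), nn.ReLU(), nn.Linear(4, 2))
layer_idx_cutoff = 1  # illustrative; the real value comes from global_vars
# enumerate before slicing, so layer_idx keeps each layer's true position.
for layer_idx, layer in list(enumerate(model.children()))[layer_idx_cutoff:]:
    print(layer_idx, type(layer).__name__)
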
Example #3
def power_diff_report(model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}.pdf'
    dataset = unify_dataset(dataset)
    class_examples = []
    nyquist = int(global_vars.get('frequency') / 2) - 1
    for class_idx in range(global_vars.get('n_classes')):
        class_examples.append(dataset.X[np.where(dataset.y == class_idx)])
    freqs = np.fft.fftfreq(global_vars.get('input_height'), 1 / global_vars.get('frequency'))
    freq_idx = np.argmax(freqs >= nyquist)
    diff_array = np.zeros((global_vars.get('eeg_chans'), freq_idx))
    for chan in range(global_vars.get('eeg_chans')):
        first_power = np.average(np.fft.fft(class_examples[0][:, chan, :]).squeeze(), axis=0)[:freq_idx]
        second_power = np.average(np.fft.fft(class_examples[1][:, chan, :]).squeeze(), axis=0)[:freq_idx]
        power_diff = abs(first_power - second_power)
        diff_array[chan] = power_diff
    fig, ax = plt.subplots(figsize=(18, 10))
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='5%', pad=0.05)
    im = ax.imshow(diff_array, cmap='hot', interpolation='nearest', aspect='auto', extent=[0, nyquist, 1, global_vars.get('eeg_chans')])
    ax.set_title('frequency diff between classes')
    ax.set_ylabel('channel')
    ax.set_xlabel('frequency')
    fig.colorbar(im, cax=cax, orientation='vertical')
    filename = 'temp/freq_diff.png'
    plt.savefig(filename)
    story = [get_image(filename)]
    create_pdf_from_story(report_file_name, story)
    os.remove(filename)
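
A self-contained sketch of the frequency-axis setup above: np.fft.fftfreq plus np.argmax finds the first bin at or past the reduced Nyquist limit (sampling rate and sample count are illustrative):

import numpy as np

fs = 250                       # sampling frequency (Hz), illustrative
n = 1125                       # number of time samples, illustrative
nyquist = int(fs / 2) - 1
freqs = np.fft.fftfreq(n, 1 / fs)
# First index whose frequency reaches the limit; bins below it form the
# positive half of the spectrum kept in diff_array above.
freq_idx = np.argmax(freqs >= nyquist)
print(freq_idx, freqs[freq_idx])
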
Example #4
def avg_class_tf_report(model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}.pdf'
    if os.path.isfile(report_file_name):
        return
    eeg_chans = list(range(global_vars.get('eeg_chans')))
    dataset = unify_dataset(dataset)
    class_examples = []
    for class_idx in range(global_vars.get('n_classes')):
        class_examples.append(dataset.X[np.where(dataset.y == class_idx)])
        if global_vars.get('to_eeglab'):
            tensor_to_eeglab(class_examples[-1], f'{folder_name}/avg_class_tf/{label_by_idx(class_idx)}.mat')
    chan_data = np.zeros((global_vars.get('n_classes'), len(eeg_chans), global_vars.get('num_frex'), global_vars.get('input_height')))
    for class_idx in range(global_vars.get('n_classes')):
        for eeg_chan in eeg_chans:
            chan_data[class_idx, eeg_chan] = get_tf_data_efficient(class_examples[class_idx], eeg_chan,
                                                                   global_vars.get('frequency'),
                                                                   global_vars.get('num_frex'),
                                                                   dB=global_vars.get('db_normalization'))
    max_value = np.max(chan_data)
    tf_plots = []
    for class_idx in range(global_vars.get('n_classes')):
        tf_plots.append(tf_plot(chan_data[class_idx], f'average TF for {label_by_idx(class_idx)}', max_value))
    story = [get_image(tf) for tf in tf_plots]
    create_pdf_from_story(report_file_name, story)
    for tf in tf_plots:
        os.remove(tf)
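
A tiny sketch of the per-class selection used above, with invented data: np.where(y == class_idx) gathers all trials of one class:

import numpy as np

# Toy dataset: 6 trials, 1 channel, 4 time samples; labels for 2 classes.
X = np.random.randn(6, 1, 4)
y = np.array([0, 1, 0, 1, 0, 1])
class_examples = [X[np.where(y == class_idx)] for class_idx in range(2)]
print([c.shape for c in class_examples])  # [(3, 1, 4), (3, 1, 4)]
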
Example #5
def feature_importance_report(model, dataset, folder_name):
    FEATURE_VALUES = {}
    feature_mean = {}
    vmin = np.inf
    vmax = -np.inf
    report_file_name = f'{folder_name}/{global_vars.get("report")}_{global_vars.get("explainer")}.pdf'
    train_data = np_to_var(dataset['train'].X[:, :, :, None])
    model.cpu()
    if 'Ensemble' in type(model).__name__:
        for mod in model.models:
            if 'Ensemble' in type(mod).__name__:
                for inner_mod in mod.models:
                    inner_mod.cpu()
                    inner_mod.eval()
            mod.cpu()
            mod.eval()
    e = globals()[f'{global_vars.get("explainer")}_explainer'](model, train_data)
    shap_imgs = []
    for segment in ['test']:
        if segment == 'both':
            dataset = unify_dataset(dataset)
            segment_data = np_to_var(dataset.X[:, :, :, None])
        else:
            segment_data = np_to_var(dataset[segment].X[:, :, :, None])
        num_samples = int(segment_data.shape[0] * global_vars.get('explainer_sampling_rate'))
        print(f'calculating {global_vars.get("explainer")} values for {num_samples} samples')
        segment_examples = segment_data[np.random.choice(segment_data.shape[0], num_samples, replace=False)]
        feature_values = e.get_feature_importance(segment_examples)
        feature_val = np.array(feature_values).squeeze()
        if feature_val.ndim == 4:
            feature_mean[segment] = np.mean(feature_val, axis=1)
        else:
            feature_mean[segment] = feature_val
        if global_vars.get('dataset') == 'netflow_asflow':
            save_feature_importances(folder_name, feature_mean[segment])
        else:
            np.save(f'{folder_name}/{global_vars.get("explainer")}_{segment}.npy', feature_mean[segment])
        feature_value = np.concatenate(feature_mean[segment], axis=0)
        feature_value = (feature_value - np.mean(feature_value)) / np.std(feature_value)
        FEATURE_VALUES[segment] = feature_value
        if feature_mean[segment].min() < vmin:
            vmin = feature_mean[segment].min()
        if feature_mean[segment].max() > vmax:
            vmax = feature_mean[segment].max()
    for segment in ['test']:
        img_file = plot_feature_importance_netflow(folder_name, feature_mean[segment], global_vars.get('start_hour'),
                                                   global_vars.get('dataset'), segment, global_vars.get('explainer'))
        if global_vars.get('dataset') != 'netflow_asflow':
            plot_topo_feature_importance(folder_name, feature_mean[segment])
        shap_imgs.append(img_file)
    story = []
    for im in shap_imgs:
        story.append(get_image(im))
    create_pdf_from_story(report_file_name, story)
    global_vars.get('sacred_ex').add_artifact(report_file_name)
    for im in shap_imgs:
        os.remove(im)
    gc.collect()
    return FEATURE_VALUES
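
A self-contained sketch of the subsample-and-normalize step above (array shape and sampling rate are invented):

import numpy as np

values = np.random.randn(200, 10)   # stand-in feature-importance array
sampling_rate = 0.25                # illustrative explainer_sampling_rate
n = int(values.shape[0] * sampling_rate)
# replace=False guarantees each sample is drawn at most once.
sample = values[np.random.choice(values.shape[0], n, replace=False)]
# Z-score normalization, as applied to feature_value above.
normalized = (sample - np.mean(sample)) / np.std(sample)
print(sample.shape, normalized.mean().round(6))
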
Example #6
def get_leave_one_out(data_folder, test_subject_id):
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for subject_id in global_vars.get('subjects_to_check'):
        if subject_id != test_subject_id:
            dataset = get_dataset(subject_id)
            dataset = unify_dataset(dataset)
            X_train.extend(dataset.X)
            y_train.extend(dataset.y)
    test_dataset = get_dataset(test_subject_id)
    test_dataset = unify_dataset(test_dataset)
    X_test.extend(test_dataset.X)
    y_test.extend(test_dataset.y)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=global_vars.get('valid_set_fraction'))
    train_set, valid_set, test_set = makeDummySignalTargets(
        X_train, y_train, X_val, y_val, X_test, y_test)
    return train_set, valid_set, test_set
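
A toy sketch of the leave-one-out pooling plus validation split above; the subject data and split fraction are invented:

import numpy as np
from sklearn.model_selection import train_test_split

# Toy per-subject data; subject 2 is held out, the rest are pooled for training.
subjects = {1: np.random.randn(8, 3), 2: np.random.randn(8, 3), 3: np.random.randn(8, 3)}
test_subject_id = 2
X_train = np.concatenate([X for sid, X in subjects.items() if sid != test_subject_id])
X_test = subjects[test_subject_id]
y_train = np.zeros(len(X_train))
# A validation set is carved out of the pooled training data, as above.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
print(X_train.shape, X_val.shape, X_test.shape)
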
Example #7
def feature_importance_minmax_report(model, dataset, folder_name):
    FEATURE_VALUES = {}
    report_file_name = f'{folder_name}/{global_vars.get("report")}_{global_vars.get("explainer")}.pdf'
    train_data = np_to_var(dataset['train'].X[:, :, :, None])
    model.cpu()
    if 'Ensemble' in type(model).__name__:
        for mod in model.models:
            if 'Ensemble' in type(mod).__name__:
                for inner_mod in mod.models:
                    inner_mod.cpu()
                    inner_mod.eval()
            mod.cpu()
            mod.eval()
    e = globals()[f'{global_vars.get("explainer")}_explainer'](model, train_data)
    shap_imgs = []
    # for segment in ['train', 'test', 'both']:
    for segment in ['test']:
        if segment == 'both':
            dataset = unify_dataset(dataset)
            segment_data = np_to_var(dataset.X[:, :, :, None])
        else:
            segment_data = np_to_var(dataset[segment].X[:, :, :, None])
        min_example_idx = np.where(dataset[segment].y.max(axis=1) == np.amin(dataset[segment].y.max(axis=1)))[0]
        max_example_idx = np.where(dataset[segment].y.max(axis=1) == np.amax(dataset[segment].y.max(axis=1)))[0]
        min_example = segment_data[min_example_idx]
        max_example = segment_data[max_example_idx]
        min_feature_values = e.get_feature_importance(min_example)
        max_feature_values = e.get_feature_importance(max_example)
        min_feature_val = np.array(min_feature_values).squeeze()
        max_feature_val = np.array(max_feature_values).squeeze()
        np.save(f'{folder_name}/{global_vars.get("explainer")}_{segment}_min.npy', min_feature_val)
        np.save(f'{folder_name}/{global_vars.get("explainer")}_{segment}_max.npy', max_feature_val)
    for segment in ['test']:
        min_img_file = plot_feature_importance_netflow(folder_name, min_feature_val, global_vars.get('start_hour'),
                                                       global_vars.get('dataset'), segment, global_vars.get('explainer'),
                                                       title='min')
        max_img_file = plot_feature_importance_netflow(folder_name, max_feature_val, global_vars.get('start_hour'),
                                                       global_vars.get('dataset'), segment, global_vars.get('explainer'),
                                                       title='max')
        shap_imgs.append(min_img_file)
        shap_imgs.append(max_img_file)
    story = []
    for im in shap_imgs:
        story.append(get_image(im))
    create_pdf_from_story(report_file_name, story)
    global_vars.get('sacred_ex').add_artifact(report_file_name)
    for im in shap_imgs:
        os.remove(im)
    gc.collect()
    return FEATURE_VALUES
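
A self-contained sketch of the min/max example selection above, with invented targets: each example is scored by its per-row peak, and np.where picks the examples at the global extremes:

import numpy as np

# Toy multi-step regression targets: 5 examples, 3 output steps.
y = np.array([[3., 1., 2.], [9., 0., 1.], [2., 2., 2.], [0., 0., 1.], [5., 5., 5.]])
peaks = y.max(axis=1)                    # per-example peak value
min_example_idx = np.where(peaks == np.amin(peaks))[0]
max_example_idx = np.where(peaks == np.amax(peaks))[0]
print(min_example_idx, max_example_idx)  # [3] [1]
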
Example #8
def get_fold_idxs(AS):
    if global_vars.get('k_fold_time'):
        kf = TimeSeriesSplit(n_splits=global_vars.get('n_folds'))
    else:
        kf = KFold(n_splits=global_vars.get('n_folds'), shuffle=True)
    prev_autonomous_systems = global_vars.get('autonomous_systems')
    global_vars.set('autonomous_systems', [AS])
    dataset = get_dataset('all')
    concat_train_val_sets(dataset)
    dataset = unify_dataset(dataset)
    fold_idxs = {i: {} for i in range(global_vars.get('n_folds'))}
    for fold_num, (train_index, test_index) in enumerate(kf.split(list(range(len(dataset.X))))):
        fold_idxs[fold_num]['train_idxs'] = train_index
        fold_idxs[fold_num]['test_idxs'] = test_index
    global_vars.set('autonomous_systems', prev_autonomous_systems)
    return fold_idxs
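
A runnable sketch contrasting the two splitters used above; the fold count and sample range are illustrative:

import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit

samples = list(range(10))
# TimeSeriesSplit keeps each test fold strictly after its training data,
# while the shuffled KFold ignores temporal order.
for name, kf in [('time', TimeSeriesSplit(n_splits=3)),
                 ('kfold', KFold(n_splits=3, shuffle=True))]:
    for fold_num, (train_index, test_index) in enumerate(kf.split(samples)):
        print(name, fold_num, train_index, test_index)
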
Example #9
def export_data_to_file(dataset,
                        format,
                        out_folder,
                        classes=None,
                        transpose_time=False,
                        unify=False):
    create_folder(out_folder)
    if unify:
        dataset = unify_dataset(dataset)
        dataset = {'all': dataset}
    for segment in dataset.keys():
        if classes is None:
            X_data = [dataset[segment].X]
            y_data = [dataset[segment].y]
            class_strs = ['']
        else:
            X_data = []
            y_data = []
            class_strs = []
            for class_idx in classes:
                X_data.append(dataset[segment].X[np.where(
                    dataset[segment].y == class_idx)])
                y_data.append(dataset[segment].y[np.where(
                    dataset[segment].y == class_idx)])
                class_strs.append(f'_{label_by_idx(class_idx)}')
        for X, y, class_str in zip(X_data, y_data, class_strs):
            if transpose_time:
                X = np.transpose(X, (0, 2, 1))
            if format == 'numpy':
                np.save(f'{out_folder}/X_{segment}{class_str}', X)
                np.save(f'{out_folder}/y_{segment}{class_str}', y)
            elif format == 'matlab':
                X = np.transpose(X, [1, 2, 0])
                savemat(f'{out_folder}/X_{segment}{class_str}.mat',
                        {'data': X})
                savemat(f'{out_folder}/y_{segment}{class_str}.mat',
                        {'data': y})
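
A short sketch of the MATLAB export step above: the [1, 2, 0] transpose reorders trials x channels x time into channels x time x trials before savemat (the shapes are illustrative):

import numpy as np
from scipy.io import savemat

X = np.zeros((100, 22, 1125))   # trials x channels x time, illustrative shape
# MATLAB/EEGLAB conventionally expects channels x time x trials,
# hence the [1, 2, 0] axis reordering used above.
X_mat = np.transpose(X, [1, 2, 0])
print(X_mat.shape)              # (22, 1125, 100)
savemat('X_all.mat', {'data': X_mat})
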
Example #10
def perturbation_report(model, dataset, folder_name):
    report_file_name = f'{folder_name}/{global_vars.get("report")}_{global_vars.get("band_filter").__name__}.pdf'
    if os.path.isfile(report_file_name):
        return
    eeg_chans = list(range(get_dummy_input().shape[1]))
    tf_plots = []
    dataset = unify_dataset(dataset)
    for frequency in range(global_vars.get("low_freq"), global_vars.get("high_freq") + 1):
        single_subj_dataset = deepcopy(dataset)
        perturbed_data = global_vars.get('band_filter')(single_subj_dataset.X,
                                                        max(1, frequency - 1), frequency + 1, global_vars.get('frequency'))
        if global_vars.get('to_matlab'):
            tensor_to_eeglab(perturbed_data, f'{folder_name}/perturbation_report/frequency_{frequency}_'
                                             f'{global_vars.get("band_filter").__name__}.mat')
        single_subj_dataset.X = perturbed_data
        subj_tfs = []
        for eeg_chan in eeg_chans:
            subj_tfs.append(get_tf_data_efficient(single_subj_dataset.X, eeg_chan, global_vars.get('frequency')))
        tf_plots.append(tf_plot(subj_tfs, f'average TF for subject {global_vars.get("subject_id")},'
                                          f' frequency {frequency}, {global_vars.get("band_filter").__name__}'))
    story = [get_image(tf) for tf in tf_plots]
    create_pdf_from_story(report_file_name, story)
    for tf in tf_plots:
        os.remove(tf)
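
band_filter itself is configured elsewhere; a minimal stand-in with the same (data, low, high, fs) calling convention, sketched as a zero-phase SciPy Butterworth band-stop (the filter type and order are assumptions, not the project's actual design):

import numpy as np
from scipy.signal import butter, sosfiltfilt

def butter_bandstop(data, low, high, fs, order=4):
    # Zero-phase Butterworth band-stop over the last (time) axis; a stand-in
    # for the configured band_filter, whose exact design is not shown above.
    sos = butter(order, [low, high], btype='bandstop', fs=fs, output='sos')
    return sosfiltfilt(sos, data, axis=-1)

X = np.random.randn(10, 22, 1125)   # trials x channels x time
filtered = butter_bandstop(X, 9, 11, 250)
print(filtered.shape)
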