def load_datasets(dataset_folder):
    """Download the pristine dataset and the dataset with 25% vacancies from Dataverse."""
    train_set_name = 'pristine_dataset'

    # the IDs to retrieve the data can be found at this page:
    # https://dataverse.harvard.edu/api/datasets/export?exporter=dataverse_json&persistentId=doi%3A10.7910/DVN/ZDKBRF
    url_dataset_info_pristine = "https://dataverse.harvard.edu/api/access/datafile/3238706?format=original"
    url_x_pristine = "https://dataverse.harvard.edu/api/access/datafile/3238702?format=original"
    url_y_pristine = "https://dataverse.harvard.edu/api/access/datafile/3238704?format=original"

    path_to_x_pristine = os.path.join(dataset_folder, train_set_name + '_x.pkl')
    path_to_y_pristine = os.path.join(dataset_folder, train_set_name + '_y.pkl')
    path_to_summary_pristine = os.path.join(dataset_folder, train_set_name + '_summary.json')

    logger.info("Downloading dataset of pristine structures from the Harvard Dataverse to {}.".format(dataset_folder))
    logger.info("Size: ~500MB. This may take a few minutes.")

    # urllib.urlretrieve is Python 2 only; in Python 3 it lives in urllib.request
    urllib.request.urlretrieve(url_x_pristine, path_to_x_pristine)
    urllib.request.urlretrieve(url_y_pristine, path_to_y_pristine)
    urllib.request.urlretrieve(url_dataset_info_pristine, path_to_summary_pristine)

    test_set_name = 'vac25_dataset'

    # the IDs to retrieve the data can be found at this page:
    # https://dataverse.harvard.edu/api/datasets/export?exporter=dataverse_json&persistentId=doi%3A10.7910/DVN/ZDKBRF
    # NOTE: the x and summary datafile IDs below are identical to the pristine ones
    # above; verify the vacancy-dataset IDs on the Dataverse export page linked above.
    url_dataset_info_vac25 = "https://dataverse.harvard.edu/api/access/datafile/3238706?format=original"
    url_x_vac25 = "https://dataverse.harvard.edu/api/access/datafile/3238702?format=original"
    # same as the pristine labels because we assume that the defects we create do not change the class
    url_y_vac25 = "https://dataverse.harvard.edu/api/access/datafile/3238704?format=original"

    path_to_x_vac25 = os.path.join(dataset_folder, test_set_name + '_x.pkl')
    path_to_y_vac25 = os.path.join(dataset_folder, test_set_name + '_y.pkl')
    path_to_summary_vac25 = os.path.join(dataset_folder, test_set_name + '_summary.json')

    logger.info("Downloading dataset of structures with 25% vacancies from the Harvard Dataverse to {}.".format(dataset_folder))
    logger.info("Size: ~500MB. This may take a few minutes.")

    urllib.request.urlretrieve(url_dataset_info_vac25, path_to_summary_vac25)
    urllib.request.urlretrieve(url_x_vac25, path_to_x_vac25)
    urllib.request.urlretrieve(url_y_vac25, path_to_y_vac25)

    logger.info("Download completed.")

    # load datasets
    x_pristine, y_pristine, dataset_info_pristine = load_dataset_from_file(
        path_to_x_pristine, path_to_y_pristine, path_to_summary_pristine)
    x_vac25, y_vac25, dataset_info_vac25 = load_dataset_from_file(
        path_to_x_vac25, path_to_y_vac25, path_to_summary_vac25)

    return x_pristine, y_pristine, dataset_info_pristine, x_vac25, y_vac25, dataset_info_vac25
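# Usage sketch for load_datasets (hedged: the folder path below is illustrative,
# and the function assumes the target folder already exists and is writable;
# running this triggers a ~1GB download):
import os

example_dataset_folder = os.path.join('/tmp', 'my_datasets')  # assumption: any writable location
if not os.path.exists(example_dataset_folder):
    os.makedirs(example_dataset_folder)

(x_pristine, y_pristine, dataset_info_pristine,
 x_vac25, y_vac25, dataset_info_vac25) = load_datasets(example_dataset_folder)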
# add the spacegroup as target (using the spacegroup of the "parent" structure for the defective structure)
# targets = ['fcc111', 'fcc111', 'fcc110', 'fcc110', 'fcc100', 'fcc100']
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(targets_list)
labels_list = label_encoder.transform(targets_list)

print(images_list.shape)
print(labels_list.shape)

path_to_x, path_to_y, path_to_summary = prepare_dataset_STEM(
    images_list=images_list,
    labels_list=labels_list,
    desc_metadata='diffraction_2d_intensity',
    dataset_name='STEM_monocrystalline',
    target_name='target',
    target_categorical=True,
    input_dims=(64, 64),
    configs=configs,
    dataset_folder=dataset_folder,
    main_folder=configs['io']['main_folder'],
    desc_folder=configs['io']['desc_folder'],
    tmp_folder=configs['io']['tmp_folder'],
    notes="STEM Dataset with bcc, fcc, and hcp structures, pristine.")

x, y, dataset_info = load_dataset_from_file(path_to_x=path_to_x,
                                            path_to_y=path_to_y,
                                            path_to_summary=path_to_summary)
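# Minimal standalone sketch of what LabelEncoder does above (the class names are
# illustrative; the real targets_list comes from the generated STEM images):
from sklearn import preprocessing

example_targets = ['bcc', 'fcc', 'fcc', 'hcp', 'bcc']
encoder = preprocessing.LabelEncoder()
example_labels = encoder.fit_transform(example_targets)

print(encoder.classes_)                   # ['bcc' 'fcc' 'hcp'] - sorted unique targets
print(example_labels)                     # [0 1 1 2 0] - integer class indices
print(encoder.inverse_transform([2, 0]))  # ['hcp' 'bcc'] - map indices back to names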
path_to_summary_train = os.path.abspath(
    os.path.normpath(os.path.join(dataset_folder, train_set_name + '_summary.json')))

test_set_name = 'disp0.1_dataset'
path_to_x_test = os.path.abspath(
    os.path.normpath(os.path.join(dataset_folder, test_set_name + '_x.pkl')))
path_to_y_test = os.path.abspath(
    os.path.normpath(os.path.join(dataset_folder, test_set_name + '_y.pkl')))
path_to_summary_test = os.path.abspath(
    os.path.normpath(os.path.join(dataset_folder, test_set_name + '_summary.json')))

x_train, y_train, dataset_info_train = load_dataset_from_file(
    path_to_x=path_to_x_train,
    path_to_y=path_to_y_train,
    path_to_summary=path_to_summary_train)
x_test, y_test, dataset_info_test = load_dataset_from_file(
    path_to_x=path_to_x_test,
    path_to_y=path_to_y_test,
    path_to_summary=path_to_summary_test)

params_cnn = {
    "nb_classes": dataset_info_train["data"][0]["nb_classes"],
    "classes": dataset_info_train["data"][0]["classes"],
    # "checkpoint_filename": 'try_' + str(now.isoformat()),
    "checkpoint_filename": 'ziletti_et_2018_rgb',
    "batch_size": 32,
    "img_channels": 3
}
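# Hedged sketch of the summary-file structure this script relies on: only the
# keys actually dereferenced in this code are shown, with illustrative values;
# the real summary written by prepare_dataset may contain additional metadata:
example_summary = {
    "data": [{
        "nb_classes": 3,
        "classes": ["bcc", "fcc", "hcp"],       # class names used for plotting
        "text_labels": ["bcc", "fcc", "fcc"],   # one entry per sample
        "numerical_labels": [0, 1, 1]
    }]
}
print(example_summary["data"][0]["nb_classes"])  # -> 3, as read into params_cnn above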
# =============================================================================
# Download the dataset from the online repository and load it
# =============================================================================
# x_pristine, y_pristine, dataset_info_pristine, x_vac25, y_vac25, dataset_info_vac25 = load_datasets(dataset_folder)

train_set_name = 'STEM_monocrystalline_train'
path_to_x_pristine = os.path.join(dataset_folder, train_set_name + '_x.pkl')
path_to_y_pristine = os.path.join(dataset_folder, train_set_name + '_y.pkl')
path_to_summary_pristine = os.path.join(dataset_folder, train_set_name + '_summary.json')

test_set_name = 'STEM_monocrystalline_test'
path_to_x_vac25 = os.path.join(dataset_folder, test_set_name + '_x.pkl')
path_to_y_vac25 = os.path.join(dataset_folder, test_set_name + '_y.pkl')
path_to_summary_vac25 = os.path.join(dataset_folder, test_set_name + '_summary.json')

x_pristine, y_pristine, dataset_info_pristine = load_dataset_from_file(
    path_to_x_pristine, path_to_y_pristine, path_to_summary_pristine)
x_vac25, y_vac25, dataset_info_vac25 = load_dataset_from_file(
    path_to_x_vac25, path_to_y_vac25, path_to_summary_vac25)

# =============================================================================
# Train the convolutional neural network
# =============================================================================
# load the convolutional neural network architecture from Ziletti et al., Nature Communications 9, 2775 (2018)
# alternative architecture kept for reference:
# partial_model_architecture = partial(cnn_architecture_ai4STEM, conv2d_filters=[32, 32, 16, 16, 8, 8],
partial_model_architecture = partial(cnn_nature_comm_ziletti2018,
                                     conv2d_filters=[32, 32, 16, 16, 8, 8],
                                     kernel_sizes=[3, 3, 3, 3, 3, 3],
                                     max_pool_strides=[2, 2],
                                     hidden_layer_size=128)
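# Why functools.partial is used above: the training code expects a model-builder
# callable, so the architecture hyperparameters are bound in advance and only the
# remaining arguments are supplied at build time. Standalone sketch (build_cnn and
# its signature are illustrative, not the ai4materials API):
from functools import partial

def build_cnn(nb_classes, conv2d_filters, hidden_layer_size):
    # stand-in for a real model constructor
    return 'CNN(nb_classes={}, filters={}, hidden={})'.format(
        nb_classes, conv2d_filters, hidden_layer_size)

example_builder = partial(build_cnn,
                          conv2d_filters=[32, 32, 16, 16, 8, 8],
                          hidden_layer_size=128)
print(example_builder(nb_classes=5))  # only nb_classes is supplied at call time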
def get_classification_map(configs,
                           path_to_x_test,
                           path_to_y_test,
                           path_to_summary_test,
                           path_to_strided_pattern_pos,
                           checkpoint_dir,
                           checkpoint_filename,
                           mc_samples=100,
                           interpolation='none',
                           results_file=None,
                           calc_uncertainty=True,
                           conf_matrix_file=None,
                           train_set_name='hcp-bcc-sc-diam-fcc-pristine',
                           cmap_uncertainty='hot',
                           interpolation_uncertainty='none',
                           plot_results=False,
                           path_to_summary_train=None):
    """Classify a strided set of local patterns and plot probability and uncertainty maps."""

    # load the training-set summary, either from the default training-data paths or
    # from a user-supplied summary file (without this else-branch, dataset_info_train
    # would be unbound whenever path_to_summary_train is passed)
    if path_to_summary_train is None:
        path_to_x, path_to_y, path_to_summary = get_training_data_paths()
    else:
        path_to_summary = path_to_summary_train
    with open(path_to_summary, 'rb') as f:
        dataset_info_train = json.load(f)

    # legacy code path, kept for reference: load the full training set from file
    # path_to_x_train = os.path.join(configs['io']['dataset_folder'], train_set_name + '_x.pkl')
    # path_to_y_train = os.path.join(configs['io']['dataset_folder'], train_set_name + '_y.pkl')
    # path_to_summary_train = os.path.join(configs['io']['dataset_folder'], train_set_name + '_summary.json')
    # x_train, y_train, dataset_info_train = load_dataset_from_file(path_to_x=path_to_x_train,
    #                                                               path_to_y=path_to_y_train,
    #                                                               path_to_summary=path_to_summary_train)

    x_test, y_test, dataset_info_test = load_dataset_from_file(
        path_to_x=path_to_x_test,
        path_to_y=path_to_y_test,
        path_to_summary=path_to_summary_test)

    with open(path_to_strided_pattern_pos, 'rb') as input_spm_pos:
        strided_pattern_pos = pickle.load(input_spm_pos)

    logger.debug('strided_pattern_pos shape: {0}'.format(strided_pattern_pos.shape))

    params_cnn = {
        "nb_classes": dataset_info_train["data"][0]["nb_classes"],
        "classes": dataset_info_train["data"][0]["classes"],
        "batch_size": 32,
        "img_channels": 1
    }

    text_labels = np.asarray(dataset_info_test["data"][0]["text_labels"])
    numerical_labels = np.asarray(dataset_info_test["data"][0]["numerical_labels"])

    # get the classes and the numerical-to-text label conversion for plotting below
    classes_text_labels = dataset_info_train["data"][0]['classes']
    numerical_to_text_label = dict(zip(range(len(classes_text_labels)), classes_text_labels))

    filename_no_ext = os.path.abspath(
        os.path.normpath(os.path.join(checkpoint_dir, checkpoint_filename)))

    model = load_model(filename_no_ext)

    results = predict(x_test,
                      y_test,
                      model=model,
                      configs=configs,
                      nb_classes=dataset_info_train["data"][0]["nb_classes"],
                      batch_size=params_cnn["batch_size"],
                      mc_samples=mc_samples,
                      conf_matrix_file=conf_matrix_file,
                      numerical_labels=numerical_labels,
                      text_labels=text_labels,
                      results_file=results_file)

    predictive_mean = results['prob_predictions']
    uncertainty = results['uncertainty']

    class_plot_pos = np.asarray(strided_pattern_pos)
    (z_max, y_max, x_max) = np.amax(class_plot_pos, axis=0) + 1

    # make a dataframe to order the prob_predictions; this is needed when reading
    # from file, because the structures are ordered differently after they are saved
    # (it matters only if more than 10 values per direction are used)
    df_positions = pd.DataFrame(data=class_plot_pos,
                                columns=['strided_pattern_positions_z',
                                         'strided_pattern_positions_y',
                                         'strided_pattern_positions_x'])

    # sort the predictive mean by strided-pattern position
    df_predictive_mean = pd.DataFrame(data=predictive_mean)
    # pd.concat(..., join_axes=...) was removed from pandas; align with reindex instead
    df = pd.concat([df_positions, df_predictive_mean], axis=1)
    df = df.reindex(df_positions.index)

    df_predictive_mean_sorted = df.sort_values(['strided_pattern_positions_z',
                                                'strided_pattern_positions_y',
                                                'strided_pattern_positions_x'],
                                               ascending=True)

    predictive_mean_sorted = df_predictive_mean_sorted.drop(
        columns=['strided_pattern_positions_z',
                 'strided_pattern_positions_y',
                 'strided_pattern_positions_x']).values
    predictive_mean_all_classes = []

    for idx_class in range(predictive_mean_sorted.shape[1]):
        if z_max == 1:
            prob_prediction_class = predictive_mean_sorted[:, idx_class].reshape(y_max, x_max)
        else:
            prob_prediction_class = predictive_mean_sorted[:, idx_class].reshape(z_max, y_max, x_max)
        predictive_mean_all_classes.append(prob_prediction_class)

        title = 'Proto ' + numerical_to_text_label[idx_class] + ' Probability'
        if not plot_results:
            continue
        plot_prediction_heatmaps(prob_prediction_class,
                                 title=title,
                                 class_name=str(idx_class),
                                 prefix='prob',
                                 main_folder=configs['io']['main_folder'],
                                 cmap='viridis',
                                 color_nan='lightgrey',
                                 interpolation=interpolation,
                                 vmin=0.0,
                                 vmax=1.0)  # fix the color scale to [0, 1] across classes

    np.save(os.path.join(configs['io']['results_folder'],
                         configs['io']['polycrystal_file'] + '_probabilities.npy'),
            np.array(predictive_mean_all_classes))

    if calc_uncertainty:
        df_uncertainty = pd.DataFrame()
        for key in uncertainty.keys():
            df_uncertainty[key] = uncertainty[key]

        # pd.concat(..., join_axes=...) is deprecated since pandas 0.25 and was later
        # removed; aligning with reindex works across pandas versions, including the
        # 0.22.0 currently pinned in the environment
        df = pd.concat([df_positions, df_uncertainty], axis=1)
        df = df.reindex(df_positions.index)

        df_uncertainty_sorted = df.sort_values(['strided_pattern_positions_z',
                                                'strided_pattern_positions_y',
                                                'strided_pattern_positions_x'],
                                               ascending=True)

        uncertainty_sorted = df_uncertainty_sorted.drop(
            columns=['strided_pattern_positions_z',
                     'strided_pattern_positions_y',
                     'strided_pattern_positions_x'])

        for key in uncertainty.keys():
            if z_max == 1:
                # two-dimensional map
                uncertainty_prediction = uncertainty_sorted[key].values.reshape(y_max, x_max)
            else:
                uncertainty_prediction = uncertainty_sorted[key].values.reshape(z_max, y_max, x_max)

            np.save(os.path.join(configs['io']['results_folder'],
                                 configs['io']['polycrystal_file'] + '_' + key + '.npy'),
                    uncertainty_prediction)

            if not plot_results:
                continue
            plot_prediction_heatmaps(uncertainty_prediction,
                                     title='Uncertainty ({})'.format(str(key)),
                                     main_folder=configs['io']['main_folder'],
                                     cmap=cmap_uncertainty,
                                     color_nan='lightgrey',
                                     prefix='uncertainty',
                                     suffix=str(key),
                                     interpolation=interpolation_uncertainty)
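# How predictive_mean and the uncertainty measures used above are typically derived
# from mc_samples stochastic forward passes (Monte Carlo dropout). Standalone numpy
# sketch, NOT the ai4materials `predict` implementation - the uncertainty keys it
# actually returns may differ:
import numpy as np

rng = np.random.default_rng(0)
# fake softmax outputs: (mc_samples, n_points, n_classes)
mc_probs = rng.dirichlet([2.0, 1.0, 1.0], size=(100, 4))

predictive_mean = mc_probs.mean(axis=0)  # (n_points, n_classes)

# predictive entropy: total uncertainty of the averaged prediction
predictive_entropy = -np.sum(predictive_mean * np.log(predictive_mean + 1e-12), axis=1)

# mutual information: predictive entropy minus the expected per-pass entropy
expected_entropy = -np.mean(np.sum(mc_probs * np.log(mc_probs + 1e-12), axis=2), axis=0)
mutual_information = predictive_entropy - expected_entropy

print(predictive_mean.shape, predictive_entropy.shape, mutual_information.shape)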
# dataset_folder = configs['io']['main_folder']
dataset_folder = os.path.join(configs['io']['main_folder'], 'my_datasets')

# =============================================================================
# Download the dataset from the online repository and load it
# =============================================================================
# x_pristine, y_pristine, dataset_info_pristine, x_vac25, y_vac25, dataset_info_vac25 = load_datasets(dataset_folder)

test_set_name = 'STEM_monocrystalline'
path_to_x_box = os.path.join(dataset_folder, test_set_name + '_x.pkl')
path_to_y_box = os.path.join(dataset_folder, test_set_name + '_y.pkl')
path_to_summary_box = os.path.join(dataset_folder, test_set_name + '_summary.json')

x_box, y_box, dataset_info_box = load_dataset_from_file(
    path_to_x_box, path_to_y_box, path_to_summary_box)

# =============================================================================
# Train the convolutional neural network
# =============================================================================
# load the convolutional neural network architecture from Ziletti et al., Nature Communications 9, 2775 (2018)
# partial_model_architecture = partial(cnn_nature_comm_ziletti2018, conv2d_filters=[32, 32, 16, 16, 8, 8],
#                                      kernel_sizes=[3, 3, 3, 3, 3, 3], max_pool_strides=[2, 2],
#                                      hidden_layer_size=128)

# use x_train also for validation - this is only to run the test
# results = train_neural_network(x_train=x_pristine, y_train=y_pristine, x_val=x_pristine, y_val=y_pristine,
#                                configs=configs, partial_model_architecture=partial_model_architecture,
def get_classification_map(polycrystal_file,
                           descriptor,
                           desc_metadata,
                           configs,
                           checkpoint_dir,
                           checkpoint_filename,
                           operations_on_structure=None,
                           stride_size=(4., 4., 4.),
                           box_size=12.0,
                           init_sliding_volume=(14., 14., 14.),
                           desc_file=None,
                           desc_only=False,
                           show_plot_lengths=True,
                           calc_uncertainty=True,
                           mc_samples=10,
                           desc_file_suffix_name='_pristine',
                           nb_jobs=-1,
                           interpolation='none',
                           results_file=None,
                           conf_matrix_file=None,
                           train_set_name='hcp-bcc-sc-diam-fcc-pristine',
                           padding_ratio=None,
                           cmap_uncertainty='hot',
                           interpolation_uncertainty='none'):
    """Compute a classification map for a polycrystal: descriptor calculation, CNN prediction, and (optional) uncertainty."""

    if desc_file is None:
        logger.info("Calculating the system's representation.")
        desc_file = calc_polycrystal_desc(polycrystal_file,
                                          stride_size,
                                          box_size,
                                          descriptor,
                                          configs,
                                          desc_file_suffix_name,
                                          operations_on_structure,
                                          nb_jobs,
                                          show_plot_lengths,
                                          padding_ratio=padding_ratio,
                                          init_sliding_volume=init_sliding_volume)
    else:
        logger.info("Using the precomputed user-specified descriptor file.")

    if not desc_only:
        target_list, structure_list = load_descriptor(desc_files=desc_file, configs=configs)

        # create the dataset
        dataset_name = '{0}_stride_{1}_{2}_{3}_box_size_{4}_{5}.tar.gz'.format(
            polycrystal_file, stride_size[0], stride_size[1], stride_size[2],
            box_size, desc_file_suffix_name)

        path_to_x_test, path_to_y_test, path_to_summary_test = prepare_dataset(
            structure_list=structure_list,
            target_list=target_list,
            desc_metadata=desc_metadata,
            dataset_name=dataset_name,
            target_name='target',
            target_categorical=True,
            input_dims=(52, 32),
            configs=configs,
            dataset_folder=configs['io']['dataset_folder'],
            main_folder=configs['io']['main_folder'],
            desc_folder=configs['io']['desc_folder'],
            tmp_folder=configs['io']['tmp_folder'])

        path_to_x_train = os.path.join(configs['io']['dataset_folder'], train_set_name + '_x.pkl')
        path_to_y_train = os.path.join(configs['io']['dataset_folder'], train_set_name + '_y.pkl')
        path_to_summary_train = os.path.join(configs['io']['dataset_folder'], train_set_name + '_summary.json')

        x_train, y_train, dataset_info_train = load_dataset_from_file(
            path_to_x=path_to_x_train,
            path_to_y=path_to_y_train,
            path_to_summary=path_to_summary_train)
        x_test, y_test, dataset_info_test = load_dataset_from_file(
            path_to_x=path_to_x_test,
            path_to_y=path_to_y_test,
            path_to_summary=path_to_summary_test)

        params_cnn = {
            "nb_classes": dataset_info_train["data"][0]["nb_classes"],
            "classes": dataset_info_train["data"][0]["classes"],
            "batch_size": 32,
            "img_channels": 1
        }

        text_labels = np.asarray(dataset_info_test["data"][0]["text_labels"])
        numerical_labels = np.asarray(dataset_info_test["data"][0]["numerical_labels"])

        data_set_predict = make_data_sets(x_train_val=x_test,
                                          y_train_val=y_test,
                                          split_train_val=False,
                                          test_size=0.1,
                                          x_test=x_test,
                                          y_test=y_test)

        target_pred_class, target_pred_probs, prob_predictions, conf_matrix, uncertainty = predict(
            data_set_predict,
            params_cnn["nb_classes"],
            configs=configs,
            batch_size=params_cnn["batch_size"],
            checkpoint_dir=checkpoint_dir,
            checkpoint_filename=checkpoint_filename,
            show_model_acc=False,
            mc_samples=mc_samples,
            predict_probabilities=True,
            plot_conf_matrix=True,
            conf_matrix_file=conf_matrix_file,
            numerical_labels=numerical_labels,
            text_labels=text_labels,
            results_file=results_file,
            normalize=True)

        predictive_mean = prob_predictions

        # get the number of strides in each direction in order to reshape properly
        strided_pattern_positions = []
        for structure in structure_list:
            strided_pattern_positions.append(structure.info['strided_pattern_positions'])

        class_plot_pos = np.asarray(strided_pattern_positions)
        (z_max, y_max, x_max) = np.amax(class_plot_pos, axis=0) + 1

        # make a dataframe to order the prob_predictions; this is needed when reading
        # from file, because the structures are ordered differently after they are saved
        # (it matters only if more than 10 values per direction are used)
        df_positions = pd.DataFrame(data=class_plot_pos,
                                    columns=['strided_pattern_positions_z',
                                             'strided_pattern_positions_y',
                                             'strided_pattern_positions_x'])

        # sort the predictive mean by strided-pattern position
        df_predictive_mean = pd.DataFrame(data=predictive_mean)
        # pd.concat(..., join_axes=...) was removed from pandas; align with reindex instead
        df = pd.concat([df_positions, df_predictive_mean], axis=1)
        df = df.reindex(df_positions.index)

        df_predictive_mean_sorted = df.sort_values(['strided_pattern_positions_z',
                                                    'strided_pattern_positions_y',
                                                    'strided_pattern_positions_x'],
                                                   ascending=True)

        predictive_mean_sorted = df_predictive_mean_sorted.drop(
            columns=['strided_pattern_positions_z',
                     'strided_pattern_positions_y',
                     'strided_pattern_positions_x']).values

        for idx_class in range(predictive_mean_sorted.shape[1]):
            prob_prediction_class = predictive_mean_sorted[:, idx_class].reshape(z_max, y_max, x_max)

            plot_prediction_heatmaps(prob_prediction_class,
                                     title='Probability',
                                     class_name=str(idx_class),
                                     prefix='prob',
                                     main_folder=configs['io']['main_folder'],
                                     cmap='viridis',
                                     interpolation=interpolation)

        if calc_uncertainty:
            df_uncertainty = pd.DataFrame()
            for key in uncertainty.keys():
                df_uncertainty[key] = uncertainty[key]

            df = pd.concat([df_positions, df_uncertainty], axis=1)
            df = df.reindex(df_positions.index)

            df_uncertainty_sorted = df.sort_values(['strided_pattern_positions_z',
                                                    'strided_pattern_positions_y',
                                                    'strided_pattern_positions_x'],
                                                   ascending=True)

            uncertainty_sorted = df_uncertainty_sorted.drop(
                columns=['strided_pattern_positions_z',
                         'strided_pattern_positions_y',
                         'strided_pattern_positions_x'])

            for key in uncertainty.keys():
                uncertainty_prediction = uncertainty_sorted[key].values.reshape(z_max, y_max, x_max)

                plot_prediction_heatmaps(uncertainty_prediction,
                                         title='Uncertainty ({})'.format(str(key)),
                                         main_folder=configs['io']['main_folder'],
                                         cmap=cmap_uncertainty,
                                         prefix='uncertainty',
                                         suffix=str(key),
                                         interpolation=interpolation_uncertainty)
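# Standalone sketch of the ordering trick used in both functions above (synthetic
# data): predictions arrive with their (z, y, x) stride indices in arbitrary order;
# sorting by (z, y, x) lets the flat array be reshaped into the classification-map grid.
import numpy as np
import pandas as pd

positions = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0], [0, 1, 1]])  # (z, y, x)
probs = np.array([[0.9], [0.2], [0.1], [0.7]])                      # single class

df = pd.concat([pd.DataFrame(positions, columns=['z', 'y', 'x']),
                pd.DataFrame(probs)], axis=1)
sorted_probs = df.sort_values(['z', 'y', 'x']).drop(columns=['z', 'y', 'x']).values

grid = sorted_probs[:, 0].reshape(2, 2)  # (y_max, x_max), since z_max == 1
print(grid)  # [[0.1 0.2]
             #  [0.9 0.7]]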