# NOTE: the imports below are inferred from the names used throughout this
# script (np, md, fe, relprop, visualization, configuration, filtering, tp,
# DataGenerator, logger). The standard-library and third-party imports are
# exact; the package-local module paths are assumptions and may need to be
# adjusted to the actual package layout.
import glob
import logging
import os
import sys

import mdtraj as md
import numpy as np

from demystifying import feature_extraction as fe
from demystifying import relevance_propagation as relprop
from demystifying import visualization, filtering, configuration
from demystifying import traj_preprocessing as tp
from demystifying.data_generation import DataGenerator

logger = logging.getLogger("run_scripts")


def run_CaM(parser):
    # Known important residues
    common_peaks = [109, 144, 124, 145, 128, 105, 112, 136, 108, 141, 92]
    shuffle_data = True

    args = parser.parse_args()
    working_dir = args.out_directory
    n_runs = args.number_of_runs
    samples = np.load(args.feature_list)

    cluster_indices = np.loadtxt(args.cluster_indices)
    # Shift cluster indices to start at 0
    cluster_indices -= cluster_indices.min()

    if shuffle_data:
        # Permute blocks of 100 frames
        n_samples = samples.shape[0]
        n_samples = int(n_samples / 100) * 100
        inds = np.arange(n_samples)
        inds = inds.reshape((int(n_samples / 100), 100))
        perm_inds = np.random.permutation(inds)
        perm_inds = np.ravel(perm_inds)
        samples = samples[perm_inds]
        cluster_indices = cluster_indices[perm_inds]

    pdb_file = args.pdb_file
    labels = cluster_indices

    lower_distance_cutoff = 1.0
    upper_distance_cutoff = 1.0
    n_components = 20

    # Check that the samples have the expected format
    if len(samples.shape) != 2:
        sys.exit("Matrix with features should have 2 dimensions")

    kwargs = {
        'samples': samples,
        'labels': labels,
        'filter_by_distance_cutoff': True,
        'lower_bound_distance_cutoff': lower_distance_cutoff,
        'upper_bound_distance_cutoff': upper_distance_cutoff,
        'use_inverse_distances': True,
        'n_splits': args.number_of_k_splits,
        'n_iterations': args.number_of_iterations,
        'scaling': True
    }

    feature_extractors = [
        fe.PCAFeatureExtractor(variance_cutoff=0.75, **kwargs),
        fe.RbmFeatureExtractor(relevance_method="from_components", **kwargs),
        fe.MlpAeFeatureExtractor(activation=relprop.relu,
                                 classifier_kwargs={
                                     'solver': 'adam',
                                     'hidden_layer_sizes': (100, )
                                 },
                                 **kwargs),
        fe.RandomForestFeatureExtractor(one_vs_rest=True,
                                        classifier_kwargs={'n_estimators': 500},
                                        **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.MlpFeatureExtractor(classifier_kwargs={
                                   'hidden_layer_sizes': (120, ),
                                   'solver': 'adam',
                                   'max_iter': 1000000
                               },
                               activation=relprop.relu,
                               **kwargs),
    ]

    postprocessors = []
    for extractor in feature_extractors:
        tmp_pp = []
        for i_run in range(n_runs):
            extractor.extract_features()
            # Post-process data (rescale and filter feature importances)
            p = extractor.postprocessing(working_dir=working_dir,
                                         rescale_results=True,
                                         filter_results=False,
                                         feature_to_resids=None,
                                         pdb_file=pdb_file)
            p.average().evaluate_performance()
            p.persist()
            tmp_pp.append(p)
        postprocessors.append(tmp_pp)

    # Highlight the known common peaks in the final figure
    visualization.visualize(postprocessors,
                            show_importance=True,
                            show_projected_data=False,
                            show_performance=False,
                            highlighted_residues=common_peaks,
                            outfile="{}/importance-per-residue.png".format(working_dir))
    logger.info("Done")
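# --- Usage sketch (not part of the original script) ---
# A minimal example of how run_CaM could be driven from the command line.
# Only the attribute names read inside run_CaM (out_directory, number_of_runs,
# feature_list, cluster_indices, pdb_file, number_of_k_splits,
# number_of_iterations) come from the function above; the flag spellings and
# defaults below are assumptions.
def _example_cam_parser():
    import argparse
    parser = argparse.ArgumentParser(description="Per-residue feature importance for calmodulin (CaM)")
    parser.add_argument("--out_directory", required=True, help="Directory for results and figures")
    parser.add_argument("--feature_list", required=True, help=".npy file with the feature matrix (frames x features)")
    parser.add_argument("--cluster_indices", required=True, help="Text file with one cluster index per frame")
    parser.add_argument("--pdb_file", default=None, help="Optional PDB file used to map features to residues")
    parser.add_argument("--number_of_runs", type=int, default=1)
    parser.add_argument("--number_of_k_splits", type=int, default=3)
    parser.add_argument("--number_of_iterations", type=int, default=5)
    return parser
# run_CaM(_example_cam_parser()) would then parse sys.argv and start the analysis.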
def compute(extractor_type,
            n_splits=1,
            n_iterations=10,
            feature_type='cartesian_rot',
            iterations_per_model=10,
            test_model='linear',
            overwrite=False,
            accuracy_method='mse',
            displacement=1e-1,
            visualize=True,
            noise_level=1e-2,  # [1e-2, 1e-2, 2e-1, 2e-1],
            output_dir="output/benchmarking/"):
    """
    :param extractor_type:
    :param n_splits:
    :param n_iterations:
    :param feature_type:
    :param iterations_per_model:
    :param test_model:
    :param overwrite:
    :param accuracy_method:
    :param displacement: displacement of the important atoms in the toy model
    :param visualize:
    :param noise_level: noise level used when generating toy model frames
    :param output_dir:
    :return: postprocessors (np.array of shape (iterations_per_model, n_feature_extractors))
    """
    all_postprocessors = []
    extractor_names = configuration.get_feature_extractors_names(
        extractor_type, n_splits=n_splits, n_iterations=n_iterations)
    n_extractors = len(extractor_names)
    for iter in range(iterations_per_model):
        modeldir = "{output_dir}/{extractor_type}/{feature_type}/{test_model}/noise-{noise_level}/iter-{iter}/".format(
            output_dir=output_dir,
            extractor_type=extractor_type,
            feature_type=feature_type,
            test_model=test_model,
            noise_level=noise_level,
            iter=iter)
        finished_extractors = []
        for name in extractor_names:
            if not overwrite and os.path.exists(modeldir):
                filepath = "{}/{}/importance_per_residue.npy".format(modeldir, name)
                existing_files = glob.glob(filepath)
                if len(existing_files) > 0:
                    logger.debug("File %s already exists. Skipping computations.", existing_files[0])
                    finished_extractors.append(name)
                else:
                    logger.debug("File %s does not exist. Performing computations.", filepath)
            else:
                # exist_ok avoids a crash when the directory was created in a previous loop iteration
                os.makedirs(modeldir, exist_ok=True)
        needs_computations = len(finished_extractors) < n_extractors
        dg = DataGenerator(
            natoms=100,
            nclusters=3,
            natoms_per_cluster=[10, 10, 10],
            # Faster generation for postprocessing purposes when we don't need the frames
            nframes_per_cluster=1200 if needs_computations else 2,
            test_model=test_model,
            noise_natoms=None,
            noise_level=noise_level,
            displacement=displacement,
            feature_type=feature_type)
        samples, labels = dg.generate_data()
        cluster_indices = labels.argmax(axis=1)

        feature_extractors = configuration.create_feature_extractors(
            extractor_type,
            samples=samples,
            labels=cluster_indices,
            n_splits=n_splits,
            n_iterations=n_iterations)
        # First we run the computations if necessary
        for i_extractor, extractor in enumerate(feature_extractors):
            if extractor.name in finished_extractors:
                continue
            extractor.extract_features()
            pp = extractor.postprocessing(
                predefined_relevant_residues=dg.moved_atoms,
                rescale_results=True,
                filter_results=False,
                working_dir=modeldir,
                accuracy_method=accuracy_method,
                feature_to_resids=dg.feature_to_resids())
            pp.average()
            pp.evaluate_performance()
            logger.debug("Saving feature importance")
            pp.persist()
            logger.info("Accuracy for %s: %s (%s)", extractor.name, pp.accuracy, pp.accuracy_method)
            if visualize:
                visualization.visualize(
                    [[pp]],
                    show_importance=False,
                    show_performance=False,
                    show_projected_data=True,
                    outfile="{}/{}/projected_data.svg".format(modeldir, extractor.name),
                    highlighted_residues=np.array(pp.predefined_relevant_residues).flatten(),
                    show_average=False)
                visualization.visualize(
                    [[pp]],
                    show_importance=False,
                    show_performance=True,
                    show_projected_data=False,
                    outfile="{}/{}/performance.svg".format(modeldir, extractor.name),
                    highlighted_residues=np.array(pp.predefined_relevant_residues).flatten(),
                    show_average=False)
            # Delete the extractor to free memory
            feature_extractors[i_extractor] = None
            del extractor

        # Then we run through them a second time, generating figures and loading data.
        # This gives a quick check that the data has been persisted correctly and
        # saves memory since we don't risk keeping any references to the data and classifier.
        feature_extractors = configuration.create_feature_extractors(
            extractor_type,
            samples=samples,
            labels=cluster_indices,
            n_splits=n_splits,
            n_iterations=n_iterations)
        all_postprocessors.append([])
        for i_extractor, extractor in enumerate(feature_extractors):
            pp = extractor.postprocessing(
                predefined_relevant_residues=dg.moved_atoms,
                rescale_results=True,
                filter_results=False,
                working_dir=modeldir,
                accuracy_method=accuracy_method,
                feature_to_resids=dg.feature_to_resids())
            pp.load()
            # Recompute performance to handle changes in the accuracy measure
            pp.compute_accuracy()
            logger.info("Accuracy for %s: %s (%s)", extractor.name, pp.accuracy, pp.accuracy_method)
            if visualize:
                visualization.visualize(
                    [[pp]],
                    show_importance=True,
                    show_performance=False,
                    show_projected_data=False,
                    outfile="{}/{}/importance_per_residue.svg".format(modeldir, extractor.name),
                    highlighted_residues=np.array(pp.predefined_relevant_residues).flatten(),
                    show_average=False)
            all_postprocessors[-1].append(pp)

    return np.array(all_postprocessors)
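# --- Usage sketch (not part of the original script) ---
# Hedged example of collecting benchmark results with compute(). The
# extractor_type value "KL" and the reduced iterations_per_model are
# assumptions chosen for a quick run; the returned array has one row per
# model iteration and one column per feature extractor, as documented above.
def _example_benchmark():
    postprocessors = compute(extractor_type="KL",
                             n_splits=1,
                             n_iterations=10,
                             iterations_per_model=3,
                             test_model="linear",
                             feature_type="cartesian_rot",
                             visualize=False)
    logger.info("Collected postprocessors of shape %s", postprocessors.shape)
    return postprocessors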
def run_toy_model(dg,
                  data,
                  labels,
                  supervised=True,
                  filetype="svg",
                  n_iterations=10,
                  variance_cutoff="1_components"):
    cluster_indices = labels.argmax(axis=1)
    feature_to_resids = dg.feature_to_resids()
    suffix = dg.test_model + "_" + dg.feature_type \
             + ("_supervised" if supervised else "_unsupervised") \
             + ("_var-cutoff=" + str(variance_cutoff) if not supervised else "")
    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': False,
        'use_inverse_distances': True,
        'n_splits': 1,
        'n_iterations': n_iterations,
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }

    supervised_feature_extractors = [
        fe.MlpFeatureExtractor(
            activation="relu",
            classifier_kwargs={
                # 'hidden_layer_sizes': (dg.natoms, dg.nclusters * 2),
                'hidden_layer_sizes': (int(dg.natoms / 2), ),
                # 'hidden_layer_sizes': [int(min(dg.nfeatures, 100) / (i + 1)) for i in range(10)],
                'max_iter': 10000,
                'alpha': 0.001,
            },
            per_frame_importance_outfile="output/toy_model_perframe.txt",
            one_vs_rest=True,
            **kwargs),
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (dg.nfeatures,),
        #         'alpha': 50,
        #     },
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 100},
            **kwargs),
    ]
    unsupervised_feature_extractors = [
        fe.MlpAeFeatureExtractor(
            classifier_kwargs={
                # 'hidden_layer_sizes': (int(data.shape[1] / 2),),
                'hidden_layer_sizes': (dg.nclusters, ),
                # 'hidden_layer_sizes': (10, 5, 1, 5, 10,),
                # 'hidden_layer_sizes': (100, 1, 100,),
                # 'hidden_layer_sizes': (200, 50, 10, 1, 10, 50, 200,),
                'max_iter': 100000,
                # 'hidden_layer_sizes': (300, 200, 50, 10, 1, 10, 50, 200, 300,),
                # 'max_iter': 10000,
                # 'alpha': 0.0001,
                'alpha': 1,
                'solver': "adam",
            },
            use_reconstruction_for_lrp=True,
            activation="logistic",
            **kwargs),
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff=variance_cutoff,
                               name='PCA',
                               **kwargs),
        # fe.RbmFeatureExtractor(classifier_kwargs={'n_components': dg.nclusters},
        #                        relevance_method='from_lrp',
        #                        name='RBM',
        #                        **kwargs),
    ]
    feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. Using %s feature extractors", len(feature_extractors))

    postprocessors = []
    filter_results = False
    for extractor in feature_extractors:
        extractor.error_limit = 50
        logger.info("Computing relevance for extractor %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir="./{}".format(extractor.name),
                                     pdb_file=None,
                                     feature_to_resids=feature_to_resids,
                                     filter_results=filter_results)
        p.average()
        p.evaluate_performance()
        p.persist()
        postprocessors.append([p])
    logger.info("Done")
    logger.info(
        "Actual atoms moved: %s.\n(Cluster generation method %s. Noise level=%s, displacement=%s. frames/cluster=%s)",
        sorted(dg.moved_atoms), dg.test_model, dg.noise_level, dg.displacement,
        dg.nframes_per_cluster)
    visualization.visualize(
        postprocessors,
        show_importance=True,
        show_performance=False,
        show_projected_data=False,
        highlighted_residues=dg.moved_atoms,
        outfile="output/test_importance_per_residue_{suffix}.{filetype}".format(
            suffix=suffix, filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=True,
    #                         show_projected_data=False,
    #                         outfile="output/test_performance_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                      filetype=filetype))
    # visualization.visualize(postprocessors,
    #                         show_importance=False,
    #                         show_performance=False,
    #                         show_projected_data=True,
    #                         outfile="output/test_projection_{suffix}.{filetype}".format(suffix=suffix,
    #                                                                                     filetype=filetype))
    logger.info(
        "Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
        "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".format(**kwargs))
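# --- Usage sketch (not part of the original script) ---
# Shows how run_toy_model could be fed with synthetic data. The DataGenerator
# arguments mirror the ones used in compute() above; the exact constructor
# signature is otherwise an assumption.
def _example_toy_model(supervised=True):
    dg = DataGenerator(natoms=100,
                       nclusters=3,
                       natoms_per_cluster=[10, 10, 10],
                       nframes_per_cluster=1200,
                       test_model="linear",
                       noise_natoms=None,
                       noise_level=1e-2,
                       displacement=1e-1,
                       feature_type="cartesian_rot")
    data, labels = dg.generate_data()
    run_toy_model(dg, data, labels, supervised=supervised)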
def run_beta2(working_dir="bio_input/beta2/",
              n_iterations=1,
              n_splits=1,
              shuffle_datasets=True,
              overwrite=False,
              dt=1,
              feature_type="ca_inv",  # "closest-heavy_inv", "CA_inv", "cartesian_ca", "cartesian_noh" or "compact_ca_inv"
              filetype="svg",
              classtype="multiclass",
              supervised=True,
              load_trajectory_for_predictions=False,
              filter_by_distance_cutoff=False,
              ligand_type='holo'):
    results_dir = "{}/results/{}/{}/{}/".format(working_dir, classtype, feature_type,
                                                "cutoff" if filter_by_distance_cutoff else "nocutoff")
    samples_dir = "{}/samples/{}/{}".format(working_dir, classtype, feature_type)
    data = np.load("{}/samples_dt{}.npz".format(samples_dir, dt))['array']
    feature_to_resids = np.load("{}/feature_to_resids.npy".format(samples_dir))
    labels = np.loadtxt("{wd}/cluster_indices/{ct}/cluster_indices_dt{dt}.txt".format(
        wd=working_dir, ct=classtype, dt=dt))
    if classtype == "multiclass":
        label_names = ["agonist-bound", "protonated-asp79"]
        mixed_classes = True
    else:
        label_names = ["apo", "holo"]
        mixed_classes = False
    suffix = str(-1) + "clusters_" + str(n_iterations) + "iterations_" \
             + ("distance-cutoff_" if filter_by_distance_cutoff else "") + feature_type
    labels -= labels.min()

    if len(data) != len(labels) or data.shape[1] != len(feature_to_resids):
        raise Exception("Inconsistent input data. The number of features or the number of frames do not match")
    logger.info("Loaded data of shape %s for feature type %s", data.shape, feature_type)

    # ## Define the different methods to use
    # Every method is encapsulated in a so-called FeatureExtractor class; they all follow the same interface
    cutoff_offset = 0.2 if "closest-heavy" in feature_type else 0
    kwargs = {
        'samples': data,
        'labels': labels,
        'label_names': label_names,
        'filter_by_distance_cutoff': filter_by_distance_cutoff,
        'lower_bound_distance_cutoff': filtering.lower_bound_distance_cutoff_default - cutoff_offset,
        'upper_bound_distance_cutoff': filtering.upper_bound_distance_cutoff_default - cutoff_offset,
        'use_inverse_distances': True,
        'n_splits': n_splits,
        'n_iterations': n_iterations,
        'shuffle_datasets': shuffle_datasets
        # 'upper_bound_distance_cutoff': 1.,
        # 'lower_bound_distance_cutoff': 1.
    }
    unsupervised_feature_extractors = [
        fe.PCAFeatureExtractor(classifier_kwargs={'n_components': None},
                               variance_cutoff='auto',
                               # variance_cutoff='1_components',
                               name='PCA',
                               **kwargs),
        fe.RbmFeatureExtractor(classifier_kwargs={'n_components': 1},
                               relevance_method='from_lrp',
                               name='RBM',
                               **kwargs),
        # fe.MlpAeFeatureExtractor(
        #     classifier_kwargs={
        #         'hidden_layer_sizes': (100, 30, 2, 30, 100,),  # int(data.shape[1]/2),),
        #         # max_iter=10000,
        #         'alpha': 0.01,
        #         'activation': "logistic"
        #     },
        #     use_reconstruction_for_lrp=True,
        #     **kwargs),
    ]

    if load_trajectory_for_predictions:
        other_samples, other_labels = _load_trajectory_for_predictions(ligand_type)
    else:
        other_samples, other_labels = None, None
    supervised_feature_extractors = [
        # fe.ElmFeatureExtractor(
        #     activation="relu",
        #     n_nodes=data.shape[1] * 2,
        #     alpha=0.1,
        #     **kwargs),
        fe.KLFeatureExtractor(**kwargs),
        fe.RandomForestFeatureExtractor(
            one_vs_rest=True,
            classifier_kwargs={'n_estimators': 500},
            **kwargs),
        fe.MlpFeatureExtractor(
            name="MLP" if other_samples is None else "MLP_predictor_{}".format(ligand_type),
            classifier_kwargs={
                # 'hidden_layer_sizes': [int(min(100, data.shape[1]) / (i + 1)) + 1 for i in range(3)],
                'hidden_layer_sizes': (30,),
                # 'max_iter': 10000,
                'alpha': 0.1,
                'activation': "relu"
            },
            # per_frame_importance_samples=other_samples,
            # per_frame_importance_labels=other_labels,
            # per_frame_importance_outfile="/home/oliverfl/projects/gpcr/mega/Result_Data/beta2-dror/apo-holo/trajectories"
            #                              "/mlp_perframe_importance_{}/"
            #                              "{}_mlp_perframeimportance_{}clusters_{}cutoff.txt"
            #     .format(ligand_type, feature_type, nclusters, "" if filter_by_distance_cutoff else "no"),
            **kwargs),
    ]

    if supervised is None:
        feature_extractors = unsupervised_feature_extractors + supervised_feature_extractors
    else:
        feature_extractors = supervised_feature_extractors if supervised else unsupervised_feature_extractors
    logger.info("Done. Using %s feature extractors", len(feature_extractors))
    highlighted_residues = _get_important_residues(supervised, feature_type)

    # # Run the relevance analysis
    postprocessors = []
    for extractor in feature_extractors:
        do_computations = True
        if os.path.exists(results_dir):
            existing_files = glob.glob("{}/{}/importance_per_residue.npy".format(results_dir, extractor.name))
            if len(existing_files) > 0 and not overwrite:
                logger.debug("File %s already exists. Skipping computations.", existing_files[0])
                do_computations = False
        if do_computations:
            logger.info("Computing relevance for extractor %s", extractor.name)
            extractor.extract_features()
        p = extractor.postprocessing(working_dir=results_dir,
                                     pdb_file=working_dir + "/trajectories/all.pdb",
                                     # pdb_file=working_dir + "/trajectories/protein_noh.pdb",
                                     feature_to_resids=feature_to_resids,
                                     filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()
        postprocessors.append([p])

        # # Visualize results
        visualization.visualize([[p]],
                                show_importance=True,
                                show_performance=False,
                                show_projected_data=False,
                                mixed_classes=mixed_classes,
                                highlighted_residues=highlighted_residues,
                                outfile=results_dir +
                                "/{extractor}/importance_per_residue_{suffix}_{extractor}.{filetype}".format(
                                    suffix=suffix, extractor=extractor.name, filetype=filetype))
        if do_computations:
            visualization.visualize([[p]],
                                    show_importance=False,
                                    show_performance=True,
                                    show_projected_data=False,
                                    mixed_classes=mixed_classes,
                                    outfile=results_dir +
                                    "/{extractor}/performance_{suffix}_{extractor}.{filetype}".format(
                                        suffix=suffix, extractor=extractor.name, filetype=filetype))
            visualization.visualize([[p]],
                                    show_importance=False,
                                    show_performance=False,
                                    show_projected_data=True,
                                    mixed_classes=mixed_classes,
                                    outfile=results_dir +
                                    "/{extractor}/projected_data_{suffix}_{extractor}.{filetype}".format(
                                        suffix=suffix, extractor=extractor.name, filetype=filetype))

    logger.info("Done. The settings were n_iterations = {n_iterations}, n_splits = {n_splits}."
                "\nFiltering (filter_by_distance_cutoff={filter_by_distance_cutoff})".format(**kwargs))
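# --- Usage sketch (not part of the original script) ---
# Hedged example of running the beta2 analysis with both the supervised and
# unsupervised extractor sets. The directory layout under bio_input/beta2/
# (samples, cluster_indices, trajectories) is the one implied by the paths in
# run_beta2 above; everything else uses the function defaults.
def _example_beta2():
    for supervised in (True, False):
        run_beta2(working_dir="bio_input/beta2/",
                  feature_type="ca_inv",
                  classtype="multiclass",
                  supervised=supervised,
                  filter_by_distance_cutoff=False)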
def run_VSD(working_dir="bio_input/VSD/",
            cluster_for_prediction=None,
            dt_for_prediction=10,
            multiclass=False):
    data = np.load(working_dir + 'frame_i_j_contacts_dt1.npy')
    cluster_indices = np.loadtxt(working_dir + 'clusters_indices.dat')
    kwargs = {
        'samples': data,
        'labels': cluster_indices,
        'filter_by_distance_cutoff': True,
        'use_inverse_distances': True,
        'n_splits': 3,
        'n_iterations': 5,
        'scaling': True,
        'shuffle_datasets': True
    }
    if cluster_for_prediction is not None:
        cluster_traj = md.load("{}/{}_dt{}.xtc".format(working_dir, cluster_for_prediction, dt_for_prediction),
                               top=working_dir + "alpha.pdb")
        other_samples, _, _ = tp.to_distances(traj=cluster_traj,
                                              scheme="closest-heavy",
                                              pairs="all-residues",
                                              use_inverse_distances=True,
                                              ignore_nonprotein=True,
                                              periodic=True)
        logger.debug("Loaded cluster samples for prediction of shape %s for state %s",
                     other_samples.shape, cluster_for_prediction)
        cluster_traj = None  # free memory
    else:
        other_samples = False
    feature_extractors = [
        fe.RandomForestFeatureExtractor(
            classifier_kwargs={'n_estimators': 100},
            one_vs_rest=not multiclass,
            **kwargs),
        fe.KLFeatureExtractor(bin_width=0.1, **kwargs),
        fe.MlpFeatureExtractor(
            classifier_kwargs={
                'hidden_layer_sizes': [100, ],
                'max_iter': 100000,
                'alpha': 0.0001
            },
            activation="relu",
            one_vs_rest=not multiclass,
            per_frame_importance_samples=other_samples,
            # If None, the method will use predicted labels for LRP
            per_frame_importance_labels=None,
            per_frame_importance_outfile="{}/mlp_perframe_importance_{}/"
                                         "VSD_mlp_perframeimportance_{}_dt{}.txt".format(
                                             working_dir,
                                             "multiclass" if multiclass else "binaryclass",
                                             cluster_for_prediction,
                                             dt_for_prediction),
            **kwargs)
    ]
    common_peaks = {
        "R1-R4": [294, 297, 300, 303],
        "K5": [306],
        "R6": [309],
    }

    do_computations = True
    filetype = "svg"
    for extractor in feature_extractors:
        logger.info("Computing relevance for extractor %s", extractor.name)
        extractor.extract_features()
        p = extractor.postprocessing(working_dir=working_dir,
                                     pdb_file=working_dir + "alpha.pdb",
                                     filter_results=False)
        if do_computations:
            p.average()
            p.evaluate_performance()
            p.persist()
        else:
            p.load()
        visualization.visualize(
            [[p]],
            show_importance=True,
            show_performance=False,
            show_projected_data=False,
            highlighted_residues=common_peaks,
            outfile=working_dir + "{extractor}/importance_per_residue_{suffix}.{filetype}".format(
                suffix="", extractor=extractor.name, filetype=filetype))
        if do_computations:
            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=True,
                show_projected_data=False,
                outfile=working_dir + "{extractor}/performance_{suffix}.{filetype}".format(
                    extractor=extractor.name, suffix="", filetype=filetype))
            visualization.visualize(
                [[p]],
                show_importance=False,
                show_performance=False,
                show_projected_data=True,
                outfile=working_dir + "{extractor}/projected_data_{suffix}.{filetype}".format(
                    extractor=extractor.name, suffix="", filetype=filetype))
    logger.info("Done")
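# --- Usage sketch (not part of the original script) ---
# Hedged example for the voltage-sensing domain (VSD) analysis. Passing a state
# name as cluster_for_prediction additionally writes per-frame MLP importances
# for that trajectory; the state name "up" below is an assumption and must match
# an existing <state>_dt<dt>.xtc file in the working directory.
def _example_vsd(with_per_frame_importance=False):
    if with_per_frame_importance:
        run_VSD(working_dir="bio_input/VSD/", cluster_for_prediction="up", dt_for_prediction=10)
    else:
        run_VSD(working_dir="bio_input/VSD/", cluster_for_prediction=None)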