import os import re import numpy as np from sklearn.metrics import roc_curve from protfun.models import get_hidden_activations, get_best_params from protfun.utils import save_pickle, load_pickle from protfun.visualizer.molview import MoleculeView from protfun.visualizer.progressview import ProgressView from protfun.visualizer.roc_view import ROCView, micro_macro_roc from protfun.utils.log import get_logger log = get_logger("experiment_visualizer") def create_history_plots(config, model_name, checkpoint=None, until=None): """ Creates training history diagrams for a desired model that has already been trained. :param config: a config dictionary, containing the contents of the config.yaml for the trained model. You can load it from file with protfun.config.get_config(file_path) :param model_name: name (model id) of the model to create diagrams for. Corresponds to the name of the model directory under <data_dir>/models :param checkpoint: (optional) specify a mini-batch at which you want a vertical line visualize to represent when the model was check-pointed :param until: (optional) restrict the number of mini-batches shown in the progress diagram """ model_dir = os.path.join(config["data"]["dir"], "models", model_name) hisotry_files = [ f for f in os.listdir(model_dir) if f.startswith("train_history_ep")
import os from protfun.utils.log import get_logger log = get_logger("protein_fetcher") class EnzymeFetcher(object): """ EnzymeFetcher queries PDB ids for the enzymes EC2PDB data set, extracting them from the EC2PDB website based on desired EC categories. """ def __init__(self, categories, excluded_categories=list(), enzyme_dir=None): """ :param categories: which enzyme categories to download :param excluded_categories: which enzyme categories to exclude :param enzyme_dir: where to download the enzymes """ self.enzyme_dir = enzyme_dir self.excluded_categories = excluded_categories self.leaf_categories = list() log.info("Evaluating the total categorical hierarchy...") for cat in set(categories) - set(excluded_categories): self._find_leaf_categories(cat) self.fetched_prot_codes = dict() def _find_leaf_categories(self, cat):
import os
import ntpath
import re
from glob import glob
import itertools

from protfun.utils.log import get_logger

log = get_logger("validations")


class EnzymeValidator(object):
    """
    EnzymeValidator has the task to validate the correctness and completeness
    of the essential data management steps, e.g. downloading and splitting.
    This should help finding bugs.
    """

    def __init__(self, enz_classes=None, dirs=None):
        """
        :param enz_classes: stored unchanged as ``self.enzyme_classes``
            (presumably the EC classes to validate - confirm against callers)
        :param dirs: stored unchanged as ``self.dirs``
            (presumably the data directories to inspect - confirm against
            callers)
        """
        self.enzyme_classes = enz_classes
        self.dirs = dirs

    def check_naming(self, classes):
        """
        Checks if the EC classes listed comply with the naming convention,
        e.g. 1.1.1.1

        Only the character set is verified: a class name passes as long as it
        contains nothing but digits and dots. The number of levels or their
        ordering is not checked.

        :param classes: an iterable of the EC classes (strings)
        :return: True if every class name consists solely of digits and dots
            (also True for an empty iterable), False otherwise
        """
        # Compile the pattern once instead of once per class name.
        invalid_char = re.compile(r'[^0-9.]')
        # all() short-circuits on the first offending name and, unlike the
        # previous sum(...) == len(...) form, also accepts generators.
        return all(invalid_char.search(cls) is None for cls in classes)
import numpy as np import pickle import os import matplotlib matplotlib.use('Agg') import seaborn as sns from protfun.utils.log import get_logger log = get_logger("progress_view") sns.set_style("whitegrid") colors = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a'] sns.set_palette(colors) text = { 'titles': { 'loss': 'Loss progression during training', 'accuracy': 'Accuracy progression during training', 'per_class_accs': 'Accuracy progression per class during training' }, 'y_labels': { 'loss': 'Loss', 'accuracy': 'Accuracy', 'per_class_accs': 'Accuracy' } } class ProgressView(object):
import shutil import abc import numpy as np import os import protfun.data_management.preprocess as prep from protfun.data_management.label_factory import LabelFactory from protfun.data_management.validation import EnzymeValidator from protfun.utils import save_pickle, load_pickle, construct_hierarchical_tree from protfun.utils.log import get_logger log = get_logger("data_manager") class DataManager(object): """ DataManager is a parent class for EnzymeDataManager which stores all data directories and implements a *naive* split strategy described below. """ __metaclass__ = abc.ABCMeta def __init__(self, data_dir, force_download=False, force_process=False, force_split=False, percentage_test=10, percentage_val=20): """ :param data_dir: the path to the root data directory :param force_download: forces the downloading of the enzymes :param force_process: forces the pre-processing steps :param force_split: forces the splitting of the data into training ,validation and test sets :param percentage_test: the portion in % of the test data :param percentage_val: the portion in % of the validation data
import numpy as np
import os

from protfun.utils.log import get_logger

log = get_logger("molview")


class MoleculeView(object):
    """
    Visualizer for the generated 3D input maps of a molecule, i.e. its
    electron density and its electrostatic potential.
    """

    def __init__(self, data_dir, data=None, info=None):
        """
        Prepares the ``<data_dir>/figures`` output directory (creating it if
        it is missing) and, when ``info`` is given, remembers the molecule
        name from ``info["name"]``.

        :param data_dir: a directory where the figures should be stored
        :param data: a dictionary with keys "density" and "potential"
            containing 3d numpy arrays with the molecule's electron density
            and electrostatic potential distribution.
        :param info: a dictionary with keys "id", "name" (and more) holding
            additional info to be printed in the title/legend such as the
            molecule PDB code
        """
        figures_path = os.path.join(data_dir, "figures")

        self.data_dir = data_dir
        self.figures_dir = figures_path

        # make sure the figures directory is there before anything is saved
        if not os.path.exists(figures_path):
            os.makedirs(figures_path)

        if info is not None:
            self.molecule_name = info["name"]
import lasagne
import numpy as np
import theano
import theano.tensor.nlinalg
import theano.tensor as T

from protfun.visualizer.molview import MoleculeView
from protfun.utils.log import get_logger

log = get_logger("molmap_layer")

# float dtype taken from the Theano configuration
floatX = theano.config.floatX
# integer dtype alias used by this module
intX = np.int32


class MoleculeMapLayer(lasagne.layers.MergeLayer):
    """
    This is a Lasagne layer to calculate 3D grid maps (electron density
    estimated from VdW radii) of molecules. The computation is expressed in
    Theano, i.e. it can run on the GPU.

    Usage::
        >>> from lasagne.layers import InputLayer
        >>> minibatch_size = 8
        >>> dummy_coords_input = InputLayer(shape=(minibatch_size, None, None))
        >>> dummy_vdwradii_input = InputLayer(shape=(minibatch_size, None))
        >>> dummy_natoms_input = InputLayer(shape=(minibatch_size,))
        >>> molmap_layer = MoleculeMapLayer(
        ...     incomings=[dummy_coords_input, dummy_vdwradii_input,
        ...                dummy_natoms_input],
        ...     minibatch_size=minibatch_size, rotate=True)
    """
import cPickle import os from protfun.utils.log import get_logger log = get_logger("data_utils") def save_pickle(file_path, data): """ Saves a pickle with the provided data. Usage:: >>> # single save >>> train_prot_codes = dict() >>> save_pickle("data/train_prot_codes.pickle", train_prot_codes) >>> # multi save >>> train_prot_codes, test_prot_codes = dict(), dict() >>> save_pickle(["data/train_prot_codes.pickle", "data/test_prot_codes.pickle"], >>> [train_prot_codes, test_prot_codes]) :param file_path: path (or paths) of the file(s) to be saved :param data: data object (or list of data objects) that will be saved :raises: ValueError if the number of paths and data objects do not match """ if isinstance(data, list) and isinstance(file_path, list): if len(data) == len(file_path): for path, dat in zip(file_path, data): with open(path, 'wb') as f: cPickle.dump(dat, f) else:
import numpy as np
import theano
import theano.tensor as T
import lasagne

from protfun.layers.grid_rotate_layer import GridRotationLayer
from protfun.utils.log import get_logger

log = get_logger("joint_class_model")

floatX = theano.config.floatX
intX = np.int32


class JointClassModel(object):
    """
    Abstract base class - do not instantiate it directly.

    JointClassModel is the standard generic multi-class classifier model with
    a single softmax in its output layer, so every sample can belong to
    exactly one class.
    """

    def __init__(self, name, n_classes, learning_rate):
        """
        Stores the basic model settings on the instance.

        :param name: name of the model, used by external mechanisms for
            saving training history etc.
        :param n_classes: total number of different classes for the
            classification.
        :param learning_rate: initial learning rate
        """
        (self.name,
         self.n_classes,
         self.learning_rate) = name, n_classes, learning_rate
import abc
import numpy as np
import theano
from os import path

from protfun.utils import construct_hierarchical_tree
from protfun.utils.log import get_logger

log = get_logger("data_feed")

# float dtype taken from the Theano configuration
floatX = theano.config.floatX
# integer dtype alias used by this module
intX = np.int32


class DataFeeder(object):
    """
    DataFeeder is an abstract class (not meant to be instantiated).

    All data feeders implement iterate_{train, test, val}_data() and
    get_{train, test, val}_data() methods. The iterate methods are mini-batch
    generators (you can use for loops on them to get mini-batches), whereas
    the get_ methods return the whole data sets.

    Thus, the data feeders are meant to be used during training / testing of
    models to provide the data that must be fed into them.

    Usage:
        >>> dummy_feeder = DataFeeder(...)
        >>> for train_minibatch in dummy_feeder.iterate_train_data():
        ...     # do something to the minibatch, e.g. feed forward into
        ...     # your model
    """
import numpy as np
import os
import lasagne
import cPickle

from protfun.utils.log import get_logger

log = get_logger("model_monitor")


class ModelMonitor(object):
    """
    Monitors the model during training and testing.

    Logs the error and accuracy values and can create checkpoints of the
    model parameters (triggered in the ModelTrainer whenever the mean
    validation error is being improved). Optionally dumps the model status
    on KeyboardInterrupt.
    """

    def __init__(self, outputs, data_dir, name):
        """
        :param outputs: lasagne output layers of the neural network of the
            monitored model. Used to checkpoint the model parameters during
            training.
        :param data_dir: data directory under which the monitor will create
            a folder for the currently monitored model (or use an existing
            one, if already present).
            The path is: data_dir/models/<model_name>
        :param name: name of the currently monitored model
        """
        # every model gets its own folder: <data_dir>/models/<name>
        model_dir = os.path.join(data_dir, "models", name)

        self.network_outputs = outputs
        self.name = name
        self.path_to_model_dir = model_dir
import numpy as np import theano import lasagne from theano import tensor as T from protfun.utils.log import get_logger log = get_logger("grid_rotate_layer") floatX = theano.config.floatX class GridRotationLayer(lasagne.layers.Layer): """ GridRotationLayer is a dynamic 3D augmentation layer that can be used in the beginning of any neural network. It performs random rotations and (small) translations in 3D space on the fly. Usage:: >>> from lasagne.layers import InputLayer >>> minibatch_size = 8 >>> n_channels = 2 >>> side = 32 >>> input_layer = InputLayer(shape=(minibatch_size, n_channels, side, side, side), >>> input_var=input_grid) >>> # apply the rotation layer >>> rotation_layer = GridRotationLayer(incoming=input_layer, grid_side=side, >>> n_channels=n_channels, interpolation='linear') """ min_dist_from_border = 5 def __init__(self,
import re import numpy as np from protfun.utils import save_pickle from protfun.config import save_config from protfun.data_management.data_feed import EnzymesGridFeeder from protfun.data_management.data_manager import EnzymeDataManager from protfun.models import GridsDisjointClassifier from protfun.models.model_monitor import ModelMonitor from protfun.networks import get_network from protfun.utils.np_utils import pp_array from protfun.visualizer.netview import NetworkView from protfun.visualizer.progressview import ProgressView from protfun.utils.log import get_logger log = get_logger("model_trainer") class ModelTrainer(object): """ ModelTrainer is responsible for the training & testing of a model. It supervises the training procedure, saves information about the training into files and can also validate & test a trained model in the end. It takes a data feeder as an argument in the constructor, and then fetches mini-batches from the data feeder during each training iteration and forwards them to the model under training. The model to be trained is provided as a parameter to the constructor of the model trainer. Usage:: >>> model = GridsDisjointClassifier(...) >>> feeder = EnzymesGridFeeder(...)
import StringIO import theano import lasagne import cPickle import itertools import prody as pd import rdkit.Chem as Chem import rdkit.Chem.rdPartialCharges as rdPC import rdkit.Chem.rdMolTransforms as rdMT import rdkit.Chem.rdmolops as rdMO from protfun.layers import MoleculeMapLayer from protfun.utils.log import get_logger log = get_logger("preprocessor") floatX = theano.config.floatX intX = np.int32 # number of sidechain channels (20 amino, all, nonhydro, hydro, backbone) CNS = 24 class DataProcessor(object): __metaclass__ = abc.ABCMeta def __init__(self, from_dir, target_dir): self.from_dir = from_dir self.target_dir = target_dir @abc.abstractmethod def process(self):