def make_linreg(expt_dir, name, gene_seq_fname, gene_len_fname,
                tr_codons_fname, te_codons_fname, outputs_fname,
                rel_cod_idxs=False, rel_nt_idxs=False, rel_struc_idxs=False,
                struc_fname=False):
    """Train a linear regression model and pickle its weights/predictions.

    Initializes the model directory via linreg.linreg_init, assembles the
    train/test design matrices, fits the model, and pickles the weights plus
    the true and predicted outputs under <expt_dir>/linreg/<name>.

    Args:
        expt_dir (str): experiment directory
        name (str): name of this linear regression model
        gene_seq_fname (str): transcriptome fasta file
        gene_len_fname (str): gene lengths file
        tr_codons_fname (str): training set codons file
        te_codons_fname (str): test set codons file
        outputs_fname (str): outputs file
        rel_cod_idxs: codon feature indices (False to disable)
        rel_nt_idxs: nucleotide feature indices (False to disable)
        rel_struc_idxs: structure feature indices (False to disable)
        struc_fname: structure file name (False to disable structure features)

    Returns:
        tuple: (wts, y_tr_hat, y_te_hat, y_tr, y_te)
    """
    # Set up the model directory / bookkeeping for this run.
    linreg.linreg_init(
        expt_dir, name, gene_seq_fname, gene_len_fname, tr_codons_fname,
        te_codons_fname, outputs_fname, rel_cod_idxs=rel_cod_idxs,
        rel_nt_idxs=rel_nt_idxs)

    # Load sequence data; structure features are optional.
    gene_lens = proc.get_len_dict(gene_len_fname)
    cds_by_gene = proc.get_cds_dict(gene_seq_fname, gene_lens)
    struc_by_gene = proc.get_struc_dict(struc_fname) if struc_fname else False

    X_tr, X_te, y_tr, y_te = proc.get_data_matrices(
        cds_by_gene, tr_codons_fname, te_codons_fname, outputs_fname,
        rel_cod_idxs=rel_cod_idxs, rel_nt_idxs=rel_nt_idxs,
        rel_struc_idxs=rel_struc_idxs, struc_dict=struc_by_gene)

    wts, y_tr_hat, y_te_hat = linreg.train_and_predict(X_tr, X_te, y_tr, y_te)

    # Persist weights and (true, predicted) outputs for both splits.
    model_dir = "{0}/linreg/{1}".format(expt_dir, name)
    artifacts = (
        ("wts", wts),
        ("y_tr", y_tr),
        ("y_tr_hat", y_tr_hat),
        ("y_te", y_te),
        ("y_te_hat", y_te_hat),
    )
    for label, obj in artifacts:
        proc.pickle_obj(obj, "{0}/{1}.pkl".format(model_dir, label))

    return wts, y_tr_hat, y_te_hat, y_tr, y_te
def make_lasagne_split_nn(name, expt_dir, gene_seq_fname, gene_len_fname,
                          tr_codons_fname, te_codons_fname, outputs_fname,
                          rel_cod_idxs, rel_nt_idxs, nonlinearity="tanh",
                          widths=[200], input_drop_rate=0, hidden_drop_rate=0,
                          num_outputs=1, update_method="sgd", filter_pct=False,
                          rel_struc_idxs=False, struc_fname=False,
                          max_struc_start_idx=None, max_struc_width=None,
                          learning_rate=0.01, momentum=0.9, batch_size=500,
                          filter_max=False):
    """Set up and return a split-architecture Lasagne neural network.

    Prepares the model directory and data matrices, pickles the call
    parameters, and constructs a lasagnenn.SplitMLP.

    Args:
        name (str): name of neural network model
        expt_dir (str): name of experiment directory
        gene_seq_fname (str): transcriptome fasta file
        gene_len_fname (str): gene lengths file
        tr_codons_fname (str): training set codons file
        te_codons_fname (str): test set codons file
        outputs_fname (str): outputs file
        rel_cod_idxs (list of ints): indices of codon features
        rel_nt_idxs (list of ints): indices of nucleotide features
        nonlinearity (str): nonlinearity fn [tanh|rectify|linear]
        widths (list of ints): # of units per hidden layer
        input_drop_rate (float): dropout rate for inputs
        hidden_drop_rate (float): dropout rate for hidden units
        num_outputs (int): number of output units
        update_method (str): update method [sgd|momentum|nesterov]
        filter_pct, filter_max: data filtering options forwarded to setup
        rel_struc_idxs, struc_fname, max_struc_start_idx, max_struc_width:
            structure feature options forwarded to setup
        learning_rate, momentum, batch_size: optimizer hyperparameters

    Returns:
        my_nn (lasagnenn.SplitMLP): neural network object

    Bug fix: `filter_max` was referenced below but was never a parameter,
    so every call raised NameError. It is now a parameter (appended with a
    default of False to stay backward-compatible with positional callers).
    """
    # NOTE(review): setup_lasagne_nn is called here with the full argument
    # list and expected to return data matrices, while the feedforward
    # variant calls it with only (name, expt_dir) — confirm which signature
    # the current setup_lasagne_nn actually has.
    X_tr, y_tr, X_te, y_te = setup_lasagne_nn(
        name, expt_dir, gene_seq_fname, gene_len_fname, tr_codons_fname,
        te_codons_fname, outputs_fname, rel_cod_idxs=rel_cod_idxs,
        rel_nt_idxs=rel_nt_idxs, nonlinearity=nonlinearity, widths=widths,
        input_drop_rate=input_drop_rate, hidden_drop_rate=hidden_drop_rate,
        num_outputs=num_outputs, update_method=update_method,
        filter_max=filter_max, filter_pct=filter_pct,
        rel_struc_idxs=rel_struc_idxs, struc_fname=struc_fname,
        max_struc_start_idx=max_struc_start_idx,
        max_struc_width=max_struc_width, learning_rate=learning_rate,
        momentum=momentum, batch_size=batch_size)

    out_dir = expt_dir + "/lasagne_nn"

    # Snapshot the call's local variables as init params; drop the large
    # data matrices before pickling (consistent with the feedforward setup).
    _, _, _, params = inspect.getargvalues(inspect.currentframe())
    del params["X_tr"]
    del params["X_te"]
    proc.pickle_obj(
        params, out_dir + "/{0}/init_data/init_data.pkl".format(name))

    my_nn = lasagnenn.SplitMLP(
        X_tr, y_tr, X_te, y_te, rel_cod_idxs, rel_nt_idxs, name=name,
        out_dir=out_dir, learning_rate=learning_rate,
        update_method=update_method, widths=widths,
        nonlinearity=nonlinearity, input_drop_rate=input_drop_rate,
        hidden_drop_rate=hidden_drop_rate, num_outputs=num_outputs,
        momentum=momentum, batch_size=batch_size)
    return my_nn
def make_lasagne_feedforward_nn(name, expt_dir, gene_seq_fname,
                                gene_len_fname, tr_codons_fname,
                                te_codons_fname, outputs_fname,
                                rel_cod_idxs=[], rel_nt_idxs=[],
                                nonlinearity="tanh", widths=[200],
                                input_drop_rate=0, hidden_drop_rate=0,
                                num_outputs=1, update_method="sgd",
                                filter_max=False, filter_test=False,
                                filter_pct=False, rel_struc_idxs=False,
                                struc_fname=False, max_struc_start_idx=None,
                                max_struc_width=None, aa_feats=False,
                                learning_rate=0.01, lr_decay=16, momentum=0.9,
                                batch_size=500, log_y=False, scaled_psct=0,
                                raw_psct=False, loss_fn="L2",
                                drop_zeros=False, nonnegative=True):
    """
    Sets up neural network model directory, initializes neural network
    model, saves initial parameters, and returns neural network model

    Args:
        name (str) - name of neural network model
        expt_dir (str) - name of experiment directory
        gene_seq_fname (str) - name of transcriptome fasta file
        gene_len_fname (str) - name of gene lengths file
        tr_codons_fname (str) - name of training set codons file
        te_codons_fname (str) - name of test set codons file
        outputs_fname (str) - name of outputs file
        rel_cod_idxs (list of ints) - indices of codon features in model
        rel_nt_idxs (list of ints) - indices of nucleotide features in model
        nonlinearity (str) - name of nonlinearity fn [tanh|rectify|linear]
        widths (list of ints) - # of units in each hidden layer, in order
        input_drop_rate (float) - dropout rate for inputs
        hidden_drop_rate (float) - dropout rate for hidden unit
        num_outputs (int) - number of units in output layer
        update_method (str) - name of update method [sgd|momentum|nesterov]
        filter_max, filter_test, filter_pct - data filtering options
            forwarded to proc.load_lasagne_data
        rel_struc_idxs, struc_fname, max_struc_start_idx, max_struc_width -
            structure feature options forwarded to proc.load_lasagne_data
        aa_feats (bool) - include amino acid features
        learning_rate (float) - initial learning rate
        lr_decay (int) - learning rate decay parameter
        momentum (float) - momentum coefficient
        batch_size (int) - minibatch size
        log_y (bool) - train on log-transformed outputs; requires either
            scaled_psct > 0, raw_psct, or drop_zeros
        scaled_psct (float) - pseudocount added to y before taking the log
        raw_psct (bool) - pseudocount was already applied when the outputs
            file was made, so log directly
        loss_fn (str) - loss function name, e.g. "L2"
        drop_zeros (bool) - drop zero-output training points before log
        nonnegative (bool) - constrain predictions to be nonnegative

    Returns:
        my_nn (lasagnenn.FeedforwardMLP) - neural network object
    """
    # Initialize neural network directories
    setup_lasagne_nn(name, expt_dir)

    # Load neural network data matrices
    X_tr, y_tr, X_te, y_te = proc.load_lasagne_data(
        gene_len_fname, gene_seq_fname, tr_codons_fname, te_codons_fname,
        outputs_fname, rel_cod_idxs=rel_cod_idxs, rel_nt_idxs=rel_nt_idxs,
        rel_struc_idxs=rel_struc_idxs, struc_fname=struc_fname,
        max_struc_start_idx=max_struc_start_idx,
        max_struc_width=max_struc_width, aa_feats=aa_feats,
        filter_max=filter_max, filter_pct=filter_pct,
        filter_test=filter_test)

    # Handle log transformation of y values
    if log_y:
        # Must have either a scaled psct to add, a raw psct that was already
        # put in the counts_by_codon when making the outputs file, or
        # drop_zeros. Maybe change this scheme for raw pscts in the future?
        if scaled_psct <= 0 and not raw_psct and not drop_zeros:
            # Fixed message: the guard requires a strictly positive
            # pseudocount (or raw_psct/drop_zeros), not >= 0.
            raise ValueError("Pseudocount must be > 0 for log y")
        # The three branches below are mutually exclusive by construction.
        if scaled_psct > 0:
            y_tr = np.log(y_tr + scaled_psct)
            y_te = np.log(y_te + scaled_psct)
        elif raw_psct:
            y_tr = np.log(y_tr)
            y_te = np.log(y_te)
        elif drop_zeros:
            # Remove zero-count training points so log is defined; the test
            # set is left untouched.
            positive = (y_tr > 0).ravel()
            y_tr = y_tr[positive]
            X_tr = X_tr[positive]

    # Save initial parameters, dropping the large data matrices from the
    # pickled snapshot of this call's locals.
    out_dir = expt_dir + "/lasagne_nn"
    _, _, _, params = inspect.getargvalues(inspect.currentframe())
    del params["X_tr"]
    del params["X_te"]
    proc.pickle_obj(
        params, out_dir + "/{0}/init_data/init_data.pkl".format(name))

    # Make neural network object
    my_nn = lasagnenn.FeedforwardMLP(
        X_tr, y_tr, X_te, y_te, name=name, out_dir=out_dir,
        learning_rate=learning_rate, lr_decay=lr_decay,
        update_method=update_method, widths=widths,
        nonlinearity=nonlinearity, input_drop_rate=input_drop_rate,
        hidden_drop_rate=hidden_drop_rate, num_outputs=num_outputs,
        momentum=momentum, batch_size=batch_size, loss_fn=loss_fn,
        nonnegative=nonnegative)

    # Return neural network object
    return my_nn