示例#1
0
def make_linreg(expt_dir,
                name,
                gene_seq_fname,
                gene_len_fname,
                tr_codons_fname,
                te_codons_fname,
                outputs_fname,
                rel_cod_idxs=False,
                rel_nt_idxs=False,
                rel_struc_idxs=False,
                struc_fname=False):
    """
    Builds train/test data matrices, runs linreg.train_and_predict on them,
        pickles the results under <expt_dir>/linreg/<name>/,
        and returns them

    Args:
        expt_dir (str) - name of experiment directory
        name (str) - name of the model (subdirectory of <expt_dir>/linreg)
        gene_seq_fname (str) - passed to proc.get_cds_dict
        gene_len_fname (str) - passed to proc.get_len_dict
        tr_codons_fname (str) - name of training set codons file
        te_codons_fname (str) - name of test set codons file
        outputs_fname (str) - name of outputs file
        rel_cod_idxs - forwarded to linreg_init and get_data_matrices
        rel_nt_idxs - forwarded to linreg_init and get_data_matrices
        rel_struc_idxs - forwarded to get_data_matrices only
        struc_fname - if truthy, loaded with proc.get_struc_dict

    Returns:
        (wts, y_tr_hat, y_te_hat, y_tr, y_te)
    """
    # Initialization step; note that rel_struc_idxs is not forwarded here
    linreg.linreg_init(
        expt_dir, name, gene_seq_fname, gene_len_fname,
        tr_codons_fname, te_codons_fname, outputs_fname,
        rel_cod_idxs=rel_cod_idxs, rel_nt_idxs=rel_nt_idxs)

    # Inputs needed to build the data matrices
    gene_lens = proc.get_len_dict(gene_len_fname)
    cds_by_gene = proc.get_cds_dict(gene_seq_fname, gene_lens)
    # False (not None) is what get_data_matrices receives without a file
    struc_by_gene = proc.get_struc_dict(struc_fname) if struc_fname else False

    X_tr, X_te, y_tr, y_te = proc.get_data_matrices(
        cds_by_gene, tr_codons_fname, te_codons_fname, outputs_fname,
        rel_cod_idxs=rel_cod_idxs, rel_nt_idxs=rel_nt_idxs,
        rel_struc_idxs=rel_struc_idxs, struc_dict=struc_by_gene)

    wts, y_tr_hat, y_te_hat = linreg.train_and_predict(X_tr, X_te, y_tr, y_te)

    # Persist results; order matches the order they were originally written
    model_dir = "{0}/linreg/{1}".format(expt_dir, name)
    artifacts = (
        (wts, "{0}/wts.pkl"),
        (y_tr, "{0}/y_tr.pkl"),
        (y_tr_hat, "{0}/y_tr_hat.pkl"),
        (y_te, "{0}/y_te.pkl"),
        (y_te_hat, "{0}/y_te_hat.pkl"),
    )
    for obj, path_template in artifacts:
        proc.pickle_obj(obj, path_template.format(model_dir))

    return wts, y_tr_hat, y_te_hat, y_tr, y_te
示例#2
0
def make_lasagne_split_nn(name,
                          expt_dir,
                          gene_seq_fname,
                          gene_len_fname,
                          tr_codons_fname,
                          te_codons_fname,
                          outputs_fname,
                          rel_cod_idxs,
                          rel_nt_idxs,
                          nonlinearity="tanh",
                          widths=[200],
                          input_drop_rate=0,
                          hidden_drop_rate=0,
                          num_outputs=1,
                          update_method="sgd",
                          filter_pct=False,
                          rel_struc_idxs=False,
                          struc_fname=False,
                          max_struc_start_idx=None,
                          max_struc_width=None,
                          learning_rate=0.01,
                          momentum=0.9,
                          batch_size=500,
                          filter_max=False):
    """
    Calls setup_lasagne_nn to obtain the data matrices,
        saves the arguments/locals of this call as init data,
        and returns a lasagnenn.SplitMLP built on those matrices

    Args:
        name (str) - name of neural network model
        expt_dir (str) - name of experiment directory
        gene_seq_fname (str) - forwarded to setup_lasagne_nn
        gene_len_fname (str) - forwarded to setup_lasagne_nn
        tr_codons_fname (str) - name of training set codons file
        te_codons_fname (str) - name of test set codons file
        outputs_fname (str) - name of outputs file
        rel_cod_idxs - forwarded to setup_lasagne_nn and SplitMLP
        rel_nt_idxs - forwarded to setup_lasagne_nn and SplitMLP
        nonlinearity, widths, input_drop_rate, hidden_drop_rate,
        num_outputs, update_method, learning_rate, momentum, batch_size -
            forwarded to both setup_lasagne_nn and SplitMLP
        filter_pct, rel_struc_idxs, struc_fname, max_struc_start_idx,
        max_struc_width - forwarded to setup_lasagne_nn only
        filter_max - forwarded to setup_lasagne_nn only; appended last in
            the signature so existing positional calls keep their meaning

    Returns:
        my_nn (lasagnenn.SplitMLP) - neural network object
    """
    # filter_max used to be referenced here without being defined anywhere
    # in this function (NameError); it is now a keyword parameter.
    X_tr, y_tr, X_te, y_te = setup_lasagne_nn(
        name,
        expt_dir,
        gene_seq_fname,
        gene_len_fname,
        tr_codons_fname,
        te_codons_fname,
        outputs_fname,
        rel_cod_idxs=rel_cod_idxs,
        rel_nt_idxs=rel_nt_idxs,
        nonlinearity=nonlinearity,
        widths=widths,
        input_drop_rate=input_drop_rate,
        hidden_drop_rate=hidden_drop_rate,
        num_outputs=num_outputs,
        update_method=update_method,
        filter_max=filter_max,
        filter_pct=filter_pct,
        rel_struc_idxs=rel_struc_idxs,
        struc_fname=struc_fname,
        max_struc_start_idx=max_struc_start_idx,
        max_struc_width=max_struc_width,
        learning_rate=learning_rate,
        momentum=momentum,
        batch_size=batch_size)

    out_dir = expt_dir + "/lasagne_nn"
    # Copy the frame's locals into a plain dict before pickling: on newer
    # interpreters f_locals is a live proxy object rather than a dict.
    # NOTE(review): this snapshot includes X_tr/y_tr/X_te/y_te, so the data
    # matrices are written into init_data.pkl -- confirm that is wanted.
    params = dict(inspect.getargvalues(inspect.currentframe()).locals)
    proc.pickle_obj(params,
                    out_dir + "/{0}/init_data/init_data.pkl".format(name))

    my_nn = lasagnenn.SplitMLP(X_tr,
                               y_tr,
                               X_te,
                               y_te,
                               rel_cod_idxs,
                               rel_nt_idxs,
                               name=name,
                               out_dir=out_dir,
                               learning_rate=learning_rate,
                               update_method=update_method,
                               widths=widths,
                               nonlinearity=nonlinearity,
                               input_drop_rate=input_drop_rate,
                               hidden_drop_rate=hidden_drop_rate,
                               num_outputs=num_outputs,
                               momentum=momentum,
                               batch_size=batch_size)

    return my_nn
示例#3
0
def make_lasagne_feedforward_nn(name,
                                expt_dir,
                                gene_seq_fname,
                                gene_len_fname,
                                tr_codons_fname,
                                te_codons_fname,
                                outputs_fname,
                                rel_cod_idxs=[],
                                rel_nt_idxs=[],
                                nonlinearity="tanh",
                                widths=[200],
                                input_drop_rate=0,
                                hidden_drop_rate=0,
                                num_outputs=1,
                                update_method="sgd",
                                filter_max=False,
                                filter_test=False,
                                filter_pct=False,
                                rel_struc_idxs=False,
                                struc_fname=False,
                                max_struc_start_idx=None,
                                max_struc_width=None,
                                aa_feats=False,
                                learning_rate=0.01,
                                lr_decay=16,
                                momentum=0.9,
                                batch_size=500,
                                log_y=False,
                                scaled_psct=0,
                                raw_psct=False,
                                loss_fn="L2",
                                drop_zeros=False,
                                nonnegative=True):
    """
    Sets up neural network model directory,
        initializes neural network model,
        saves initial parameters,
        and returns neural network model

    Args:
        name (str) - name of neural network model
        expt_dir (str) - name of experiment directory
        gene_seq_fname (str) - name of transcriptome fasta file
        gene_len_fname (str) - name of gene lengths file
        tr_codons_fname (str) - name of training set codons file
        te_codons_fname (str) - name of test set codons file
        outputs_fname (str) - name of outputs file
        rel_cod_idxs (list of ints) - indices of codon features in model
        rel_nt_idxs (list of ints) - indices of nucleotide features in model
        nonlinearity (str) - name of nonlinearity fn [tanh|rectify|linear]
        widths (list of ints) - # of units in each hidden layer, in order
        input_drop_rate (float) - dropout rate for inputs
        hidden_drop_rate (float) - dropout rate for hidden unit
        num_outputs (int) - number of units in output layer
        update_method (str) - name of update method [sgd|momentum|nesterov]
        filter_max, filter_test, filter_pct, rel_struc_idxs, struc_fname,
        max_struc_start_idx, max_struc_width, aa_feats -
            forwarded unchanged to proc.load_lasagne_data
        learning_rate, lr_decay, momentum, batch_size, loss_fn, nonnegative -
            forwarded unchanged to lasagnenn.FeedforwardMLP
        log_y (bool) - enable the y-value handling below; requires at least
            one of scaled_psct > 0, raw_psct, or drop_zeros
        scaled_psct (number) - if > 0, y becomes log(y + scaled_psct) for
            both training and test sets
        raw_psct (bool) - if set (and scaled_psct is not > 0), y becomes
            log(y) for both sets; per the original comment the pseudocount
            is expected to be in the outputs file already
        drop_zeros (bool) - if set (and neither option above applies),
            training rows with y <= 0 are removed from X_tr and y_tr

    Returns:
        my_nn (lasagnenn.FeedforwardMLP) - neural network object

    Raises:
        ValueError - if log_y is set but scaled_psct <= 0 and neither
            raw_psct nor drop_zeros is set
    """
    # Validate the log_y options up front so that an invalid combination
    # fails before any directories are created or data is loaded
    if log_y and scaled_psct <= 0 and not raw_psct and not drop_zeros:
        raise ValueError(
            "log_y requires scaled_psct > 0, raw_psct, or drop_zeros")

    # Initialize neural network directories
    setup_lasagne_nn(name, expt_dir)

    # Load neural network data matrices
    X_tr, y_tr, X_te, y_te = proc.load_lasagne_data(
        gene_len_fname,
        gene_seq_fname,
        tr_codons_fname,
        te_codons_fname,
        outputs_fname,
        rel_cod_idxs=rel_cod_idxs,
        rel_nt_idxs=rel_nt_idxs,
        rel_struc_idxs=rel_struc_idxs,
        struc_fname=struc_fname,
        max_struc_start_idx=max_struc_start_idx,
        max_struc_width=max_struc_width,
        aa_feats=aa_feats,
        filter_max=filter_max,
        filter_pct=filter_pct,
        filter_test=filter_test)

    # NOTE: Should I remove this?
    # Handle log transformation of y values
    # Must have either a scaled psct to add, or a raw psct that has already
    # been put in the counts_by_codon when making the outputs file
    # Maybe change this scheme for raw pscts in the future?
    if log_y:
        # The three modes are mutually exclusive, in this priority order
        if scaled_psct > 0:
            y_tr = np.log(y_tr + scaled_psct)
            y_te = np.log(y_te + scaled_psct)
        elif raw_psct:
            y_tr = np.log(y_tr)
            y_te = np.log(y_te)
        elif drop_zeros:
            # NOTE(review): this branch only removes non-positive training
            # rows; no log is applied to y_tr or y_te and the test set is
            # left as is -- confirm whether that is intended.
            positive = (y_tr > 0).ravel()
            y_tr = y_tr[positive]
            X_tr = X_tr[positive]

    # Save initial parameters
    out_dir = expt_dir + "/lasagne_nn"
    # Work on a dict copy of the frame's locals: deleting entries from the
    # frame's own mapping is not safe on interpreters where f_locals is a
    # live proxy of the local variables.
    params = dict(inspect.getargvalues(inspect.currentframe()).locals)
    # Keep the feature matrices out of the saved init data
    del params["X_tr"]
    del params["X_te"]
    proc.pickle_obj(params,
                    out_dir + "/{0}/init_data/init_data.pkl".format(name))

    # Make neural network object
    my_nn = lasagnenn.FeedforwardMLP(X_tr,
                                     y_tr,
                                     X_te,
                                     y_te,
                                     name=name,
                                     out_dir=out_dir,
                                     learning_rate=learning_rate,
                                     lr_decay=lr_decay,
                                     update_method=update_method,
                                     widths=widths,
                                     nonlinearity=nonlinearity,
                                     input_drop_rate=input_drop_rate,
                                     hidden_drop_rate=hidden_drop_rate,
                                     num_outputs=num_outputs,
                                     momentum=momentum,
                                     batch_size=batch_size,
                                     loss_fn=loss_fn,
                                     nonnegative=nonnegative)

    # Return neural network object
    return my_nn