Example #1
def mbgcbuild(prot_alignment, prot_family_name, cohort_name,
              nucl_seq_directory, prot_seq_directory, seq_fmt, pair_fmt,
              r1_file_suffix, r2_file_suffix, tp_genes_nucl,
              blast_db_directory, blastn_search_directory,
              hmm_search_directory, f1_thresh, output_directory, cpu):
    startTime = time.time()
    if cpu is not None:
        CPU_THREADS = int(cpu)
    else:
        CPU_THREADS = 1  # fall back to a single thread when no CPU count is given

    # Set up output paths
    build_op_dir = os.path.join(output_directory, "build")
    hmm_directory = os.path.join(build_op_dir, 'spHMMs')
    tp_genes_prot = os.path.join(build_op_dir, "TPGenes.faa")
    alnOutput = os.path.join(build_op_dir, "TP_Homolog_Alignment.afa")
    gene_pos_file = os.path.join(build_op_dir, 'Gene_Interval_Pos.txt')
    gene_pos_file_aa = os.path.join(build_op_dir, 'Gene_Interval_Pos_AA.txt')
    if hmm_search_directory is None:
        hmm_search_directory = os.path.join(build_op_dir, 'hmm_result')
    allHMMResult = os.path.join(build_op_dir, "CombinedHmmSearch.txt")
    if blastn_search_directory is None:
        blastn_search_directory = os.path.join(build_op_dir, 'blastn_result')
    allBLASTResult = os.path.join(build_op_dir, "CombinedBLASTSearch.txt")

    # Create OP dirs
    os.makedirs(hmm_directory, 0o777, True)

    # Translate TP genes from nucleotide to protein
    runTranSeq(tp_genes_nucl, "1", tp_genes_prot)

    # Join true positives in the sample with the BGC proteins
    tmpFile = os.path.join(build_op_dir, "TP_Homolog.faa")
    joinedSeqs = []
    tpGeneSeqs = list(SeqIO.parse(tp_genes_prot, "fasta"))
    # Removing _1 added by TranSeq
    for seq in tpGeneSeqs:
        seq.id = seq.id[:-2]
        seq.description = ""
        joinedSeqs.append(seq)
    SeqIO.write(joinedSeqs, tp_genes_prot, "fasta")
    protAlnSeqs = list(SeqIO.parse(prot_alignment, "fasta"))
    for seq in protAlnSeqs:
        joinedSeqs.append(seq)
    SeqIO.write(joinedSeqs, tmpFile, "fasta")

    # MUSCLE align TP genes with markers
    runMUSCLE(tmpFile, alnOutput)

    # Gen spHMMs and interval pos
    # Extract spHMM coordinates from MUSCLE alignment
    hmmDict = gensphmmfiles(prot_family_name, alnOutput, tp_genes_prot,
                            hmm_directory, gene_pos_file, gene_pos_file_aa)

    if r1_file_suffix is None:
        r1_file_suffix = ""
    if r2_file_suffix is None:
        r2_file_suffix = ""

    # Preprocess synthetic reads
    nucl_seq_directory = PreProcessReadsPar(nucl_seq_directory, seq_fmt,
                                            pair_fmt, r1_file_suffix.strip(),
                                            r2_file_suffix.strip(),
                                            build_op_dir, CPU_THREADS)

    # Check whether a BLAST DB directory was provided
    if blast_db_directory is None:
        blast_db_directory = ""

    # Translate nucleotide seq
    if not os.path.isdir(prot_seq_directory):
        prot_seq_directory = TranseqReadsDir(build_op_dir, nucl_seq_directory,
                                             CPU_THREADS)

    # HMMER Search
    if not os.path.exists(allHMMResult):
        os.makedirs(hmm_search_directory, 0o777, True)
        for hmmSeqPosKey, hmmFileObj in hmmDict.items():
            hmmInterval = str(hmmFileObj.intervalStart) + "_" + str(
                hmmFileObj.intervalEnd)
            RunHMMDirectoryParallel(prot_seq_directory, hmmFileObj.hmmFile,
                                    cohort_name, prot_family_name, "30_10",
                                    hmmInterval, hmm_search_directory,
                                    CPU_THREADS)

        with open(allHMMResult, 'w') as outfile:
            for subdir, dirs, files in os.walk(hmm_search_directory):
                for file in files:
                    filePath = os.path.join(subdir, file)
                    if re.match(r".*txt$",
                                file) and os.path.getsize(filePath) > 0:
                        with open(filePath) as infile:
                            for line in infile:
                                outfile.write(line)

    # BLAST Alignment
    if not os.path.exists(allBLASTResult):
        if not os.path.isdir(blastn_search_directory):
            print("Constructing BLAST Search Dir:" + blastn_search_directory)
            os.makedirs(blastn_search_directory, 0o777, True)
            RunBLASTNDirectoryPar(
                nucl_seq_directory, blast_db_directory, tp_genes_nucl,
                "-max_target_seqs 10000 -perc_identity 90.0",
                blastn_search_directory, CPU_THREADS)

        with open(allBLASTResult, 'w') as outfile:
            outfile.write(
                "sseqid\tslen\tsstart\tsend\tqseqid\tqlen\tqstart\tqend\tpident\tevalue\tSample\tsampleType\n"
            )
            for subdir, dirs, files in os.walk(blastn_search_directory):
                for file in files:
                    filePath = os.path.join(subdir, file)
                    if re.match(r".*txt$",
                                file) and os.path.getsize(filePath) > 0:
                        with open(filePath) as infile:
                            for line in infile:
                                sampleName = ntpath.basename(filePath).split(
                                    ".txt")[0]
                                outfile.write(line.strip() + "\t" +
                                              sampleName + "\t" + cohort_name +
                                              "\n")

    # Eval spHMMs
    rpackages.importr('base')
    utils = rpackages.importr('utils')
    packageNames = ('tidyverse', 'ggsci', 'ggpubr', 'dplyr', 'ggplot2')
    packnames_to_install = [
        x for x in packageNames if not rpackages.isinstalled(x)
    ]
    if len(packnames_to_install) > 0:
        utils.install_packages(StrVector(packnames_to_install))
    rpackages.importr('tidyverse')
    rpackages.importr('ggsci')
    rpackages.importr('ggpubr')
    rpackages.importr('dplyr')
    rpackages.importr('ggplot2')

    hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
    os.makedirs(hp_hmm_directory, 0o777, True)
    module_dir = os.path.dirname(os.path.abspath(createhmm.__file__))
    print("\nR-script path : " + module_dir)
    r_script = os.path.join(module_dir, 'EvaluateSpHMMs.R')

    with open(r_script, 'r') as f:
        rStr = f.read()
    myfunc = STAP(rStr, "EvaluateSpHMM")
    myfunc.EvaluateSpHMM(allHMMResult,
                         allBLASTResult, gene_pos_file, prot_family_name,
                         float(f1_thresh), hmm_directory, hp_hmm_directory)
    timeTaken = time.time() - startTime
    mins = int(timeTaken / 60)
    secs = int(timeTaken) % 60
    print("\nTotal time taken : " + str(mins) + " mins " + str(secs) +
          " seconds")
    return hp_hmm_directory
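The core rpy2 pattern in this example is the final STAP call: read an R script into a string, compile it into a package-like namespace, and invoke one of its functions with Python arguments. A minimal, self-contained sketch of that pattern (the inline R body below is an illustrative stand-in, not MetaBGC's EvaluateSpHMMs.R):

from rpy2.robjects.packages import STAP

# Illustrative R source; a stand-in for reading EvaluateSpHMMs.R from disk
r_src = """
evaluate <- function(threshold) {
    # toy check standing in for the real spHMM evaluation
    return(threshold > 0.5)
}
"""
pkg = STAP(r_src, "demo_pkg")   # compile the string into an R namespace
result = pkg.evaluate(0.9)      # call the R function like a Python one
print(list(result))             # R logical vector -> [True]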
Example #2
        if type(val) not in [str, int, float, list]:
            sys.exit(1)

    return initial_config


if __name__ == '__main__':

    # Our application currently contains only one pipeline
    p = Pipeline()

    # -------------------------- Stage 1 ---------------------------------------
    # Read initial configuration from R function
    with open('setup.R', 'r') as f:
        R_code = f.read()
    initial_config = STAP(R_code, 'initial_config')
    config = initial_config.initial_config(False)
    initial_config = dict(zip(config.names, list(config)))

    if not test_initial_config(initial_config):
        sys.exit(1)

    initial_config = process_initial_config(initial_config)

    #############################################
    # additional conversions for the dictionary #
    #############################################

    # First stage corresponds to the AnEn computation
    s1 = Stage()
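The dict(zip(config.names, list(config))) idiom above is how a named R list returned through STAP comes back into Python. A small sketch, with a toy R function standing in for setup.R's initial_config:

from rpy2.robjects.packages import STAP

# Toy stand-in for the initial_config function defined in setup.R
r_src = """
initial_config <- function(verbose) {
    list(n_members = 10L, model = "AnEn")
}
"""
cfg_pkg = STAP(r_src, "cfg_pkg")
config = cfg_pkg.initial_config(False)
# Named R list -> Python dict; each value is still an R vector
initial_config = dict(zip(config.names, list(config)))
print(initial_config["model"][0])   # -> 'AnEn'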
Example #3

user_name = getpass.getuser()

# a function uniting all the other R functions needed for annotation
fullAnnotation = """
fullAnnotationInGRanges <- function(resistance_table){
  source("/home/%s/bin/makeAnnotation.R")
  source("/home/%s/bin/resist2GRanges.R")
  resist_ranges <- resist2GRanges(resistance_table)
  annots <- makeAnnotation(withRanges=TRUE, inRanges=resist_ranges, outTable="annotation_rpoABC")
  
}
""" % (user_name, user_name)

R_Annot = STAP(fullAnnotation, "R_Annot")

help_message = "Script for annotating AA-changes in AB-resistance regions, v 1.2\n" \
               "VCF files are taken from the CWD\n" \
               "Resistance is taken from resistance_SNPs_withoutrpoAC.csv table\n" \
               "Three R functions should be in ~/bin\n" \
               "Four arguments have to be passed:\n" \
               "1 - reference genome accession (NC_000962)\n" \
               "2 - reference genome fasta (H37Rv.fna)\n" \
               "3 - reference genome annotation (H37Rv.gff)"

if len(sys.argv) < 4:
    print(help_message)
    sys.exit()

# check how many vcf files have SNPs in resistance table
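A note on the % templating used above: interpolating the user name into the R source works as long as any literal % inside the R code is escaped as %%. A self-contained sketch of the same pattern:

import getpass
from rpy2.robjects.packages import STAP

user_name = getpass.getuser()
# The path is baked into the R source before STAP compiles it; a literal %
# in the R code itself would need to be written as %%.
r_src = """
show_bin <- function() {
    print("/home/%s/bin")
}
""" % user_name
annot = STAP(r_src, "annot")
annot.show_bin()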
Example #4
def main_function(params):
    """
    Function to calculate BIC or AIC of model
    Input: tuple of (individual defining model parameters to be used,
    dataframe with independent variables, dataframe with all data,
    metric to be used ('aic' or 'bic'), dependent variable string)
    Output: optimisation metric and the individual
    """
    # Unpack the single tuple argument (Python 2 tuple parameters are
    # invalid syntax in Python 3)
    individ, X_df, df, objective_str, trgt = params
    #----------------------------------------------------------------------
    # Import necessary modules
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    #----------------------------------------------------------------------
    # Remove variables from dataframe that are not part of the individual's genome
    vars_lst = list(X_df.columns)  # Get list of the available variables
    gen_ind_0 = np.where(individ == 0)  # Find which elements are set to 0
    gen_ind_0 = gen_ind_0[0]  # Get indices of variables to be removed
    vars_lst2 = vars_lst[:]  # Copy list of all variables
    # Remove variables
    for i in sorted(gen_ind_0, reverse=True):
        del vars_lst2[i]
    #----------------------------------------------------------------------
    # Fit model in R
    # Create formula to be used in R
    myString = "+".join(vars_lst2)
    stable_str = 'as.ordered(' + trgt + ') ~ '
    formula = stable_str + myString
    # Transform Pandas dataframe to R
    rdf = pandas2ri.py2ri(df)
    # Define R function as string
    string = """
    mdl_func <- function(formula,df) {
            library(VGAM)
            mdl1=vglm(formula,family=propodds, data=df)  
            ll=logLik(mdl1)    	
        return(ll)
    }
    """
    ord_ll = STAP(string, "ord_ll")
    # Calculate AIC and BIC based on LogLikelihood (ll_)
    try:
        ll_ = ord_ll.mdl_func(formula, rdf)
        ll_ = ll_[0]
    # In case LogLikelihood calculation fails
    except Exception:
        ll_ = -1000.0
    k = float(len(vars_lst2))
    n = float(len(df))
    aic_ = (2.0 * k) - (2 * ll_)
    bic_ = (np.log(n) * k) - (2 * ll_)
    # Return AIC or BIC depending on used choice
    if objective_str == 'aic':
        obj_ = aic_
    elif objective_str == 'bic':
        obj_ = bic_
    else:
        obj_ = np.nan
    # Return optimisation metric
    return obj_, individ
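The information-criterion arithmetic at the end is the textbook AIC = 2k - 2*ll and BIC = k*ln(n) - 2*ll. A quick numeric check with illustrative values:

import numpy as np

ll_ = -120.0   # illustrative log-likelihood
k = 5.0        # number of fitted parameters
n = 200.0      # number of observations

aic_ = (2.0 * k) - (2 * ll_)          # 10 + 240 = 250.0
bic_ = (np.log(n) * k) - (2 * ll_)    # ~26.49 + 240 = ~266.49
print(aic_, bic_)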
Example #5
    '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR/states.csv'
)

thetas_init = np.array(thetas_init.iloc[0, :6], dtype=np.float64)
thetas_sd_init = np.array(thetas_sd_init.iloc[0, :6], dtype=np.float64)
#%%
#thetas_updated = np.zeros((2,6))
numpy2ri.activate()
pandas2ri.activate()

cwd = os.getcwd()
with open(
        '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR/update_thetas_weekly.R',
        'r') as f:
    string = f.read()
seir_theta = STAP(string, "seir_theta")
seir_theta = seir_theta.getThetas

modelOut = seir_theta(thetas_so_far=thetas_init,
                      thetas_sd_so_far=thetas_sd_init,
                      pred=1)

os.chdir(cwd)
print(modelOut)
#%%
with open(
        '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR/seir_r_weekly.R',
        'r') as f:
    string = f.read()
seir_r_weekly = STAP(string, "seir_r_weekly")
seir_r_weekly = seir_r_weekly.seirPredictions
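With numpy2ri.activate() in effect, numpy arrays pass to and from R as plain vectors, which is what makes the keyword-argument call to getThetas above possible. A self-contained sketch of the same call style, with a toy R function in place of the SEIR script:

import numpy as np
from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import STAP

numpy2ri.activate()   # numpy arrays <-> R vectors automatically

# Toy stand-in for the getThetas function in update_thetas_weekly.R
r_src = """
scale_thetas <- function(thetas_so_far, factor) {
    return(thetas_so_far * factor)
}
"""
demo = STAP(r_src, "demo")
thetas = np.array([0.1, 0.2, 0.3], dtype=np.float64)
out = np.asarray(demo.scale_thetas(thetas_so_far=thetas, factor=2.0))
print(out)   # [0.2 0.4 0.6]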
Example #6
base = importr('base')
rpart = importr('rpart')
stats = importr('stats')
base64enc = importr('base64enc')
C50 = importr('C50')
caret = importr('caret')

# pandas2ri.activate()
path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                    'scripts/graphvizC50.R')

with open(path, 'r') as f:
    graphvizC50_string = f.read()

graphvizC50 = STAP(graphvizC50_string, 'graphvizC50')


def r_options(*args, **kwargs):
    return base.options(*args, **kwargs)


def r_c(*args) -> RVector:
    """
    Generic function wrapper around the c function.
    """
    return base.c(*args)


def r_data_frame(*args, **kwargs) -> RDataFrame:
    """
Example #7
def compare(pred_Y,
            test_Y,
            predict_binary_output,
            peaks=None,
            save_curves=True,
            save_data=False):
    """
    Evaluates performance for predictions pred_Y relative to true labels test_Y. 
    If predict_binary_output, pred_Y should be a set of scores and test_Y should be 0, 1 labels. 
    Otherwise, both pred_Y and test_Y should be continuous values. 
    Returns squared error and Pearson correlation between the predicted output and the actual output.
    
    Both pred_Y and test_Y must be matrices of shape num_examples x num_histone_marks, 
    or they must both be matrices of shape num_examples x seq_length x num_histone_marks.
    If the latter, examples are concatenated together before correlations are computed.

    peaks is a list. Each element of this list corresponds to one mark and is a N x 2 matrix 
    where each row contains the (start, end) coordinates of a peak in that mark.
    If passing in peaks, make sure the coordinate system matches that of pred_Y and test_Y!
    For example, if your peaks start at the start of the chromosome, then pred_Y and test_Y have
    to start at the start of the chromosome as well.

    If save_curves is True, it saves the full precision-recall curve. save_curves cannot be True if 
    predict_binary_output is False. Right now it saves recalls @10, 20...90% precision. 

    If save_data is True, it saves the first mark of pred_Y and test_Y.
        
    Returns results, a dictionary containing:
        'AUC' (if predict_binary_output)
        'AUPRC' (if predict_binary_output)
        'precision_curves' (if save_curves)
        'recall_curves' (if save_curves)
        'threshold_curves' (if save_curves)
        'MSE' (if not predict_binary_output)
        'true_var' (if not predict_binary_output)
        'pearsonR' (if not predict_binary_output)
        'pred_Y' (if save_data)
        'test_Y' (if save_data)

    AUC, AUPRC, MSE, true_var, and pearsonR are each vectors of length num_histone_marks.
    true_var is the variance of the true data; it is useful for interpreting whether a given
    MSE is good or bad.
    """

    # save_curves has to be False if predict_binary_output is also False
    if not predict_binary_output: save_curves = False

    pred_Y_is_binary = is_binary(pred_Y)
    test_Y_is_binary = is_binary(test_Y)
    assert pred_Y.shape == test_Y.shape, \
        "pred_Y.shape = %s doesn't match test_Y.shape = %s" % (str(pred_Y.shape), str(test_Y.shape))
    assert test_Y_is_binary == predict_binary_output

    #test_Y (the true labels) ought to be binary IFF we're predicting binary output.
    #pred_Y should be a set of continuous scores, regardless of whether we're predicting binary output.
    assert len(pred_Y.shape) == 2 or len(pred_Y.shape) == 3

    # If peaks is not None, then there should be one element in peaks for each mark in pred_Y.
    if peaks:
        assert len(peaks) == pred_Y.shape[-1]

    # If the input matrices are 3D, then squash the first two dimensions together
    if len(pred_Y.shape) == 3:
        pred_Y = np.reshape(
            pred_Y, [pred_Y.shape[0] * pred_Y.shape[1], pred_Y.shape[2]])
        test_Y = np.reshape(
            test_Y, [test_Y.shape[0] * test_Y.shape[1], test_Y.shape[2]])

    num_histone_marks = pred_Y.shape[len(pred_Y.shape) - 1]

    true_var = []
    MSE = []
    pearsonR = []

    precision_curves = []
    recall_curves = []
    threshold_curves = []
    auc = []
    auprc = []
    Y_pos_frac = []

    with open('PRROC.R', 'r') as f:  #load in the R code.
        r_fxn_string = f.read()
    r_auc_func = STAP(r_fxn_string, "auc_func")

    for mark_idx in range(num_histone_marks):
        ### Sub-select only peak regions
        if peaks:
            # If peaks exists but peaks[mark_idx] is set to None, we should skip this mark.
            # This mark should correspond to INPUT, which has no peaks of its own.
            if peaks[mark_idx] is None:
                if predict_binary_output:
                    precision_curves.append(None)
                    recall_curves.append(None)
                    threshold_curves.append(None)
                    auprc.append(None)
                    auc.append(None)
                else:
                    true_var.append(None)
                    MSE.append(None)
                    pearsonR.append(None)
                continue

            # Initialize peak_idxs to all False
            num_bins = pred_Y.shape[0]
            peak_idxs = np.zeros(num_bins, dtype=bool)

            # Set peak_idx such that it is True in each peak
            # Simultaneously get the average signal density in each peak
            for peak_counter, peak in enumerate(peaks[mark_idx]):
                # We have to check for this, because pred_Y and test_Y might only represent
                # a fraction of any given chromosome
                if peak[1] > num_bins:
                    continue

                peak_idxs[peak[0]:peak[1]] = True

            pred_Y_mark = pred_Y[peak_idxs, mark_idx]
            test_Y_mark = test_Y[peak_idxs, mark_idx]
        else:
            pred_Y_mark = pred_Y[:, mark_idx]
            test_Y_mark = test_Y[:, mark_idx]

        ### Run evaluations on (selected) regions
        if predict_binary_output:
            precisions, recalls, thresholds = precision_recall_curve(
                test_Y_mark, pred_Y_mark)
            precisions, recalls = compute_recalls_at_precision(
                precisions, recalls)

            precision_curves.append(list(precisions))
            recall_curves.append(list(recalls))

            if len(test_Y_mark) < 100000:
                downsample_idxs = range(len(test_Y_mark))
            else:
                downsample_idxs = sample(range(len(test_Y_mark)), 100000)

            r_auprc_results = r_auc_func.pr_curve(
                scores_class0=robjects.vectors.FloatVector(
                    pred_Y_mark[downsample_idxs]),
                weights_class0=robjects.vectors.FloatVector(
                    test_Y_mark[downsample_idxs]))

            auprc.append(float(r_auprc_results.rx('auc.davis.goadrich')[0][0]))
            r_auc_results = r_auc_func.roc_curve(
                scores_class0=robjects.vectors.FloatVector(
                    pred_Y_mark[downsample_idxs]),
                weights_class0=robjects.vectors.FloatVector(
                    test_Y_mark[downsample_idxs]))
            auc.append(float(r_auc_results.rx('auc')[0][0]))
            Y_pos_frac.append(test_Y_mark.mean())
            print("AUC %2.3f; AUPRC %2.3f" % (auc[mark_idx], auprc[mark_idx]))
        else:
            true_var.append(np.var(test_Y_mark))
            MSE.append(get_MSE(pred_Y_mark, test_Y_mark))
            pearsonR.append(get_pearsonR(pred_Y_mark, test_Y_mark))

            print("MSE %2.3f (true var %2.3f), pearsonR %2.3f" %
                  (MSE[mark_idx], true_var[mark_idx], pearsonR[mark_idx]))

    if predict_binary_output:
        assert ((len(precisions) > 0) and (len(recalls) > 0))
        results = {'AUC': auc, 'AUPRC': auprc, 'Y_pos_frac': Y_pos_frac}
        results['precision_curves'] = precision_curves
        results['recall_curves'] = recall_curves

    else:
        results = {'MSE': MSE, 'true_var': true_var, 'pearsonR': pearsonR}

    if save_data:
        results['pred_Y'] = list(pred_Y[..., 0])
        results['test_Y'] = list(test_Y[..., 0])

    return results
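PRROC.R itself is not shown, but the calls suggest thin wrappers around the PRROC package's pr.curve and roc.curve. A hedged sketch calling PRROC directly through importr (assuming the R package is installed); rpy2 exposes the dotted R names with underscores:

import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

prroc = importr('PRROC')   # assumes install.packages('PRROC') was run in R

scores = robjects.vectors.FloatVector(np.random.rand(1000))
labels = robjects.vectors.FloatVector(
    np.random.randint(0, 2, 1000).astype(float))

# pr.curve / roc.curve become pr_curve / roc_curve on the Python side
pr = prroc.pr_curve(scores_class0=scores, weights_class0=labels)
auprc = float(pr.rx('auc.davis.goadrich')[0][0])
roc = prroc.roc_curve(scores_class0=scores, weights_class0=labels)
auc = float(roc.rx('auc')[0][0])
print(auprc, auc)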
Example #8
    def step(self, action):
        # Check for valid action
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        # R <--> python conversions
        numpy2ri.activate()  # automatic conversion of numpy objects to rpy2 objects

        # Update model based on actions
        action = (action - 1) / 2
        #reduction_factor = np.reshape(action,(self.num_cities,self.num_cities))

        # Get thetas updated for later calculations
        cwd = os.getcwd()
        input_path = '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR'
        with open(input_path + '/update_thetas_weekly.R', 'r') as f:
            string = f.read()
        seir_theta = STAP(string, "seir_theta")
        seir_theta = seir_theta.getThetas

        modelOut = seir_theta(thetas_so_far=self.thetas_init,
                              thetas_sd_so_far=self.thetas_sd_init,
                              pred=self.pred)

        os.chdir(cwd)
        self.thetas_updated = np.array(modelOut, dtype=np.float64)

        # Get the SEIR model from r script

        with open(input_path + '/seir_r_weekly.R', 'r') as f:
            string = f.read()
        seir_r_weekly = STAP(string, 'seir_r_weekly')
        seir_r_weekly = seir_r_weekly.seirPredictions

        statesOut = seir_r_weekly(
            reduction_control=action,
            reduction_time_series=self.reduction_time_curr,
            thetas_data=self.thetas_updated,
            latent=self.latent,
            gamma=self.gamma,
            St_data=self.St_data,
            Et_data=self.Et_data,
            It_data=self.It_data,
            Rt_data=self.Rt_data,
            popu=self.popu,
            current=self.current,
            pred=self.pred)

        # Unpack output
        S = statesOut[0][0]
        E = statesOut[1][0]
        I = statesOut[2][0]
        R = statesOut[3][0]
        beta = statesOut[4][0]

        # Update state
        self.state = np.array((S, E, I, R))
        self.daynum += self.pred
        self.St_data = S
        self.Et_data = E
        self.It_data = I
        self.Rt_data = R
        self.beta = beta

        # Print states
        print('States:', self.state)
        # =============================================================================
        #     print(sum(self.state))
        # =============================================================================
        print('Beta:', beta)
        print('Action picked:', action)

        # Reward
        economicCost = np.sum(action) + np.sum(self.reduction_time_curr)
        publichealthCost = -0.00001 * abs(self.It_data)

        reward = self.weight * economicCost + (1 -
                                               self.weight) * publichealthCost
        print('Reward:', reward)
        print(self.daynum)

        # Observation
        observation = np.reshape(self.state, (4, ))

        # Check if episode is over
        done = bool(I < 0.5 or self.daynum >= 199)

        return observation, reward, done, {}
Example #9
    def get_ml(self):
        ml = STAP(self.get_r(), "r_fct_string")
        return ml
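This is STAP reduced to its essence: one string of R code in, one callable namespace out. A usage sketch with a stand-in for self.get_r(), which is not shown in this excerpt:

from rpy2.robjects.packages import STAP

# Stand-in for the R source that self.get_r() would return
r_fct_string = """
add_one <- function(x) {
    return(x + 1)
}
"""
ml = STAP(r_fct_string, "r_fct_string")
print(ml.add_one(41)[0])   # -> 42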
Example #10
def mdl_fit(model_vars, df, y_param, ci_level=0.95):
    """
    Function to fit final model and extract modelling statistics
    Input: model variables as a list, dataframe holding all the data,
    dependent variable, confidence level for reporting statistics, e.g. 0.95 for 95%
    Output: dataframe with model coefficients and statistics
    """
    #----------------------------------------------------------------------
    # Import necessary modules
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    import scipy.stats as stats
    #----------------------------------------------------------------------
    # Fit R model
    # Set R function as string to fit model and return results
    string_ord_mdl = """
    mdl_func <- function(formula,df) {
    	library(VGAM)
    	mdl1=vglm(formula,family=propodds, data=df)
    
    	ll=logLik(mdl1)
        coefficients_df=coef(summary(mdl1))
        coefficient_cols=colnames(coefficients_df)
        coefficient_rows=rownames(coefficients_df)
    	output<-list(ll,coefficients_df,coefficient_cols,coefficient_rows)
        return(output)
    }
        """
    # Transform pandas dataframe to R format
    rdf = pandas2ri.py2ri(df)
    # Set R formula as string using the model parameters and dependent variable
    formula = 'as.ordered(' + y_param + ') ~ ' + "+".join(model_vars)
    # Define R function to be used in Python
    ord_ll = STAP(string_ord_mdl, "ord_ll")
    # Fit model
    output_R = ord_ll.mdl_func(formula, rdf)
    # Extract data and place them in Pandas dataframe
    coeff_df_temp = output_R[1]
    coeff_df = pandas2ri.ri2py_dataframe(coeff_df_temp)
    cols_df = list(output_R[2])
    rows_df = list(output_R[3])
    coeff_df.columns = cols_df
    coeff_df.index = rows_df
    #----------------------------------------------------------------------
    # Calculate statistics
    # Number of parameters
    n_vars = len(coeff_df)
    # Degrees of freedom for the t-distribution
    deg_free = len(df) - n_vars
    # Calculate alpha value from confidence interval
    alpha_ = 1.0 - ci_level
    # array to hold the low % confidence intervals
    low_arr = np.zeros(len(coeff_df))
    # array to hold the high % confidence intervals
    high_arr = np.zeros(len(coeff_df))
    # array to hold the Wald test p-values
    p_val_arr = np.zeros(len(coeff_df))
    # array to hold the t statistic
    t_value_arr = np.zeros(len(coeff_df))
    # loop counter variable
    index_arr = 0
    for index, row in coeff_df.iterrows():
        # Get standard error for variable coefficient from R model fit data
        std_error = row['Std. Error']
        # Get variable coefficient value from R model fit data
        coeff_value = row['Estimate']
        # Calculate t_critical statistic for desired confidence interval
        t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)
        # Calculate low - high confidence interval limits
        low_arr[index_arr] = coeff_value - (t_critical * std_error)
        high_arr[index_arr] = coeff_value + (t_critical * std_error)
        # t statistic calculation to get p-value
        t_value = coeff_value / std_error
        t_value_arr[index_arr] = t_value
        # Calculate p-value
        p_val_arr[index_arr] = 2.0 * \
            (1.0 - stats.t.cdf(np.abs(t_value), deg_free))
        index_arr += 1
    # Set arrays to dataframe columns
    coeff_df['Low ' + str((1.0 - alpha_) * 100) + '%'] = low_arr
    coeff_df['High ' + str((1.0 - alpha_) * 100) + '%'] = high_arr
    coeff_df['P Value'] = p_val_arr
    coeff_df['t Value'] = t_value_arr
    # Delete statistics of R model fit referring to normal distribution
    coeff_df.drop(['z value', 'Pr(>|z|)'], axis=1, inplace=True)
    # Return dataframe with model fit coefficients and statistics
    return coeff_df
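The interval arithmetic inside the loop is the usual Wald construction: estimate plus or minus t_critical times the standard error, with a two-sided p-value from the t CDF. A standalone check with illustrative numbers:

import numpy as np
import scipy.stats as stats

coeff_value = 1.20   # illustrative coefficient estimate
std_error = 0.40     # illustrative standard error
deg_free = 95        # observations minus parameters
alpha_ = 0.05        # for a 95% confidence level

t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)      # ~1.985
low = coeff_value - (t_critical * std_error)                  # ~0.41
high = coeff_value + (t_critical * std_error)                 # ~1.99
t_value = coeff_value / std_error                             # 3.0
p_val = 2.0 * (1.0 - stats.t.cdf(np.abs(t_value), deg_free))  # ~0.003
print(low, high, t_value, p_val)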
Example #11
    # parse runtime configuration
    OutputLog().set_path(OUTPUT_DIR)

    data_config = ConfigParser.ConfigParser()
    data_config.read(data_set_config)
    data_parameters = ConfigSectionMap("dataset_parameters", data_config)

    # construct data set
    data_set = Container().create(data_parameters['name'], data_parameters)

    with open(rca_location) as rca_file:
        rca_string = rca_file.read()

    rcca_fit = STAP(rca_string, "rcca_fit")
    rcca_eval = STAP(rca_string, "rcca_eval")

    OutputLog().write('training rcca')
Example #13
def main():
    """
    Performs a Sparse Partial Least Squares (sPLS) analysis over subsets of gene expression and
    metabolomic data. To perform this subsetting, three different methodologies can be used for the
    metabolites:
    - By generic metabolite (sphingomyelin, ...)
    - By MMC cluster
    - By generic metabolite and then by MMC cluster
    and four for the genes:
    - All the genes
    - Genes contained in a list with interesting genes for the analysis
    - Pathway-related genes for a specific generic metabolite
    - Metagenes (PANA approach)

    The outputs depend on the inputs.

    Arguments:
        :param geneDataset metDataset: Gene expression/Metabolomics wide dataset, respectively.
        :type geneDataset metDataset: files

        :param geneId metId: Name of the Genes/metabolites unique identifier column, respectively.
        :type geneId metId: strings

        :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation Dataset.
        :type geneAnnot metAnnot: files

        :param geneAnnotName metAnnotName: annotation file column with gene/metabolite names.
        :type geneAnnotName metAnnotName: strings

        :param design: Design File
        :type design: file

        :param keepX: Number of genes to keep in the sPLS model
        :type keepX: integer

        :param geneOption metOption: Options for metabolite subsetting (one of 'generic', 'mmc' or
        'both') and for gene expression subsetting (one of 'all', 'geneList', 'path' or 'pana')
        :type geneOption metOption: strings

        :param geneKeggAnno metKeggAnno: KEGG Annotation files for gene expression and
        metabolomics, respectively. From Add KEGG Anno Info Tool
        :type geneKeggAnno metKeggAnno: files

        :param geneKeggPath metKeggPath: KEGG Pathway files for gene expression and metabolomics,
        respectively. From Add KEGG Pathway Info Tool
        :type geneKeggPath metKeggPath: files

        :param path2genes: Downloaded KEGG file with this information: pathway_ID "\t" geneKEGG_ID
        :type path2genes: file

    Returns:
        :return figure1: sPLS heatmaps
        :rtype figure1: pdf

        :return splsOut: sif-like correlation matrix including a column describing the comparison.
        :rtype splsOut: file

        :return figure2: MMC plots if mmc or both metabolite subsetting option is selected.
        :rtype figure2: pdf

        :return mmcOut: MMC Output table if mmc or both metabolite subsetting option is selected.
        :rtype mmcOut: file

        :return panaOut: Table describing the genes that form the metagenes (1/0)
        :rtype panaOut: file
    """
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(u"Importing data with the following parameters: "
                "\n\tGene Dataset:  {}"
                "\n\tGene UniqueID:  {}"
                "\n\tGene Option: {}"
                "\n\tMetabolite Dataset:{}"
                "\n\tMetabolite UniqueID:  {}"
                "\n\tMetabolite Option: {}".format(
                    args.geneDataset,
                    args.geneId,
                    args.geneOption,
                    args.metDataset,
                    args.metId,
                    args.metOption,
                ))
    pandas2ri.activate()
    with ires.path("gaitGM.data", "sPLS.R") as my_r_script_path:
        f = open(my_r_script_path, "r")
        rFile = f.read()
    sPLSScript = STAP(rFile, "sPLS")
    rGeneData, rMetData, multipleNames, multipleNamesId = modules.prepareSPLSData(
        args)
    rData = []
    data_counter = 0
    for R_met_df in rMetData:
        R_gene_df = rGeneData[data_counter]
        rData.append(
            sPLSScript.sPLS(geneData=R_gene_df,
                            metData=R_met_df,
                            keepX=args.keepX))
        if args.geneOption == "path":
            data_counter += 1
    if args.metOption == "both":
        sPLSScript.plotInPdf(splsObjects=rData,
                             figurePath=args.figure1,
                             multipleNames=multipleNamesId)
        # Correlation Matrix
        corMatrix = sPLSScript.corrMat(splsObjects=rData,
                                       multipleNames=multipleNamesId,
                                       threshold=args.thres)
        robjects.r["write.table"](
            corMatrix,
            file=args.splsOut,
            sep="\t",
            quote=False,
            row_names=False,
            col_names=True,
        )
    else:
        sPLSScript.plotInPdf(splsObjects=rData,
                             figurePath=args.figure1,
                             multipleNames=multipleNames)
        # Correlation Matrix
        corMatrix = sPLSScript.corrMat(splsObjects=rData,
                                       multipleNames=multipleNames,
                                       threshold=args.thres)
        robjects.r["write.table"](
            corMatrix,
            file=args.splsOut,
            sep="\t",
            quote=False,
            row_names=False,
            col_names=True,
        )
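robjects.r["write.table"] looks up an R function by its exact dotted name, and the underscored keywords (row_names, col_names) map onto R's dotted arguments, as the calls above rely on. A minimal sketch of the same lookup:

import rpy2.robjects as robjects

# Base R functions fetched by their exact (dotted) names
data_frame = robjects.r["data.frame"]
write_table = robjects.r["write.table"]

df = data_frame(x=robjects.IntVector([1, 2, 3]),
                y=robjects.StrVector(["a", "b", "c"]))
# Underscored keywords become dotted R arguments, as in the calls above
write_table(df, file="demo.tsv", sep="\t", quote=False,
            row_names=False, col_names=True)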
Example #14
used_features_payload = api.model('feature_payload', used_features)

immEd_output = api.model(
    'immEd_output', {
        'score': fields.Integer,
        'category': fields.String,
        'guidance': fields.String,
        'is_ready': fields.Boolean,
        'used_features': fields.Nested(used_features_payload)
    })

##

##
from rpy2.robjects.packages import STAP
immunomatch_ed = STAP(immunomatch_ed, "immunomatch_ed")

ns1 = Namespace('Immunomatch Ed')


@api.route('/immunomatch_ed')
class Immunomatch_ed(Resource):
    #@api.marshal_with(a_language, envelope='the_data')
    @api.doc(security='apikey')
    @token_required
    @api.expect(feat_root_objs)
    # @api.doc(body=immEd_output)
    def post(self):
        if not request.get_json():
            return bad_request('No input data provided')
        raw_dict = request.json
Example #15
}

fit_betabinom_w <- function(n, k, w) {
    fit <- vglm(cbind(k, n-k) ~ 1, betabinomialff, weights=w)
    return(coef(fit, matrix = TRUE))
}

eval_betabinom <- function(n, k, a, b) {
    p <- dbetabinom.ab(k, size=n, shape1=a, shape2=b)
    return(p)
}
"""

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    bbfit = STAP(FIT_RFUN_STR, 'bbfit')


def fit_betabinom_ab(n, k, weights=None):
    assert np.all((k >= 0) & (k <= n))
    n_r = ro.FloatVector(n)
    k_r = ro.FloatVector(k)
    if weights is not None:
        assert len(weights) == len(k)
        assert len(weights) == len(n)
        weights_r = ro.FloatVector(weights)
        result_r = bbfit.fit_betabinom_w(n_r, k_r, weights_r)
    else:
        result_r = bbfit.fit_betabinom(n_r, k_r)
    result = rpyn.ri2py(result_r)
    log_a, log_b = result.flatten()
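eval_betabinom is fully visible in the R string above, so it can be exercised directly through the bbfit namespace (assuming the VGAM package is available in the embedded R session, since dbetabinom.ab comes from VGAM):

import rpy2.robjects as ro

# P(K = 5) for a beta-binomial with n = 20 trials, shapes a = 2, b = 6;
# requires VGAM to be loaded on the R side
n_r = ro.FloatVector([20.0])
k_r = ro.FloatVector([5.0])
p = bbfit.eval_betabinom(n_r, k_r, 2.0, 6.0)
print(list(p))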
Example #16
 def fitcoxnet(x, y):
     """
     Fit a cox net model
     Input : data_x : Data frame in pandas with covariates 
               y: Survival time array with time and censored event 
     Output : Coefficients selected 
     """
     pandas2ri.activate()
     x_raw = pandas2ri.py2ri(x)
     y_raw = pandas2ri.py2ri(pd.DataFrame(y))
     ro.globalenv['x_raw'] = x_raw
     ro.globalenv['y_raw'] = y_raw
     r_fct_string = """ 
     fitcoxnet <- function(DATAX,DATAY) {
     # Making Dummy Variables from the clinical Data
     x<- model.matrix( ~ ., data = DATAX)
     # Making a Survival Object 
     surv_obj <- Surv(time=DATAY$Survival_in_days,event=DATAY$Status)
     # create a glmnet cox object using lasso regularization and cross validation
     glmnet.cv <- cv.glmnet (x, surv_obj, family="cox")
     ## get the glmnet model on the full dataset
     glmnet.obj <- glmnet.cv$glmnet.fit
     # find lambda index for the models with least partial likelihood deviance
     optimal.lambda <- glmnet.cv$lambda.min
     lambda.index <- which(glmnet.obj$lambda==optimal.lambda)
     # take beta for optimal lambda 
     optimal.beta  <- glmnet.obj$beta[,lambda.index] 
     # find non zero beta coef 
     nonzero.coef <- abs(optimal.beta)>0 
     selectedBeta <- optimal.beta[nonzero.coef] 
     # take only covariates for which beta is not zero 
     selectedVar   <- x[,nonzero.coef]
     ## create a dataframe for trainSet with time, status and selected
     #variables in binary representation for evaluation in pec
     reformat_dataSet <- as.data.frame(cbind(surv_obj,selectedVar))
     # glmnet.cox only with meaningful features selected by stepwise
     #bidirectional AIC feature selection 
     glmnet.cox.meaningful <- step(coxph(Surv(time,status) ~
     .,data=reformat_dataSet),direction="both",trace=0)
     ## C-Index calculation 100 iter bootstrapping
     cIndexCoxglmnet<-c()
     features<-c()
     for (i in 1:100)
     {
             train <- sample(1:nrow(x), nrow(x), replace = TRUE) 
             reformat_trainSet <- reformat_dataSet [train,]
             glmnet.cox.meaningful.test <- step(coxph(Surv(time,status) ~
     .,data=reformat_trainSet),direction="both",trace=0)
             lbls<- as.vector(attr(glmnet.cox.meaningful.test$terms,"term.labels"))
             features<- c(features,lbls)
             varnames <- lapply(lbls, as.name)
             testdf<- as.data.frame(x)
             selectedVarCox   <-testdf[ names(testdf)[names(testdf) %in% varnames] ] 
             reformat_testSet <- as.data.frame(cbind(surv_obj,selectedVarCox))
             reformat_testSet <- reformat_dataSet [-train,]
         cIndexCoxglmnet <- c(cIndexCoxglmnet,
     1-rcorr.cens(predict(glmnet.cox.meaningful,
     reformat_testSet),Surv(reformat_testSet$time,reformat_testSet$status))[1])
     }
     cIndexm<- mean (unlist(cIndexCoxglmnet),rm.na=TRUE)
     cIndexstd<- sd(unlist(cIndexCoxglmnet))
     Finalresult=list(coefficients=glmnet.cox.meaningful,Average_Concordance=cIndexm,concordance_sd=cIndexstd,lambda=optimal.lambda)
     return(Finalresult)} """
     r_pkg = STAP(r_fct_string, "r_pkg")
     print(r_pkg.fitcoxnet(x_raw, y_raw))
Example #17
def main():
    """
    Perform a correlation analysis of a Gene Expression Dataset and a Metabolomic Dataset.

    Arguments:
        :param geneDataset metDataset: Gene expression/Metabolomics wide dataset, respectively.
        :type geneDataset metDataset: files

        :param geneId metId: Name of the Genes/metabolites unique identifier column, respectively.
        :type geneId metId: strings

        :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation Datasets, respectively.
        :type geneAnnot metAnnot: files

        :param geneAnnotName metAnnotName: Name of the column of the Annotation file that contains
        genes/metabolites names respectively.
        :type geneAnnotName metAnnotName: strings

        :param meth: Methodology for the correlation function. One of 'pearson', 'spearman' or
        'kendall'.
        :type meth: string

        :param thres: PValue Threshold to cut the correlations for the output table.
        :type thres: float

    Returns:
        :return output: Output table with the following information: Metabolite "\t" Gene "\t"
        Correlation "\t" pvalue

        :rtype output: file

        :return corMat: Correlation Matrix
        :rtype corMat: file

        :return fig: Network-like output figure
        :rtype fig: pdf
    """

    warnings.filterwarnings("ignore", category=RRuntimeWarning)
    args = getOptions()
    logger = logging.getLogger()
    sl.setLogger(logger)
    logger.info(u"Importing data with the following parameters: "
                "\n\tGene Dataset:  {}"
                "\n\tGene UniqueID:  {}"
                "\n\tMet Dataset:{}"
                "\n\tMet UniqueID:  {}"
                "\n\tMethod:  {}"
                "\n\tThreshold:  {}".format(
                    args.geneDataset,
                    args.geneId,
                    args.metDataset,
                    args.metId,
                    args.meth,
                    args.thres,
                ))

    modules.checkForDuplicates(args.geneDataset, args.geneId)
    modules.checkForDuplicates(args.metDataset, args.metId)
    pandas2ri.activate()
    with ires.path("gaitGM.data",
                   "all_by_all_correlation.R") as my_r_script_path:
        f = open(my_r_script_path, "r")
        rFile = f.read()
    allByAllCorrScript = STAP(rFile, "corr_main_func")
    # Prepare Gene Expression Data
    geneTable = pd.read_table(args.geneDataset, sep="\t", header=0)
    if args.geneAnnot:
        R_gene_df = modules.Ids2Names(geneTable, args.geneId, args.geneAnnot,
                                      args.geneName)
    else:
        geneTable = geneTable.set_index(args.geneId)
        R_gene_df = pandas2ri.py2rpy(geneTable)

    # Prepare Metabolomics Data
    metTable = pd.read_table(args.metDataset, sep="\t", header=0)
    if args.metAnnot:
        R_met_df = modules.Ids2Names(metTable, args.metId, args.metAnnot,
                                     args.metName)
    else:
        metTable = metTable.set_index(args.metId)
        R_met_df = pandas2ri.py2rpy(metTable)

    allByAllCorrScript.corr_main_func(
        x=R_gene_df,
        y=R_met_df,
        meth=args.meth,
        thres=args.thres,
        corrMatPath=args.corMat,
        outputPath=args.output,
        figurePath=args.fig,
    )
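Note the rpy2 3.x spelling pandas2ri.py2rpy here, where older examples in this collection use the 2.x py2ri. A minimal round-trip sketch under that newer API:

import pandas as pd
from rpy2.robjects import pandas2ri

pandas2ri.activate()

geneTable = pd.DataFrame({"sample1": [1.0, 2.0], "sample2": [3.0, 4.0]},
                         index=["geneA", "geneB"])
r_df = pandas2ri.py2rpy(geneTable)   # pandas -> R data.frame (rpy2 >= 3.0)
back = pandas2ri.rpy2py(r_df)        # and back again
print(back)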
Example #18
    def bin(self, method, target, event, *args):
        import rpy2.robjects as robjects
        from rpy2.robjects.vectors import StrVector
        import rpy2.robjects as ro
        from rpy2.robjects.packages import importr
        import rpy2.robjects.packages as rpackages
        #from rpy2.robjects.lib.dplyr import dplyr
        from rpy2.robjects import pandas2ri
        from rpy2.robjects import default_converter
        from rpy2.robjects.conversion import localconverter
        base = importr('base')
        utils = importr('utils')
        # Install any missing R dependencies, then import them
        try:
            woeBinning = importr('woeBinning')
        except Exception:
            print("woeBinning package is being installed")
            utils.install_packages('woeBinning')
            woeBinning = importr('woeBinning')
        try:
            discretization = importr('discretization')
        except Exception:
            print("discretization package is being installed")
            utils.install_packages('discretization')
            discretization = importr('discretization')

        try:
            lazyeval = importr('lazyeval')
        except Exception:
            print("lazyeval package is being installed")
            utils.install_packages('lazyeval')
            lazyeval = importr('lazyeval')

        column_Name = []
        cl = []
        if method == 'woe':
            print('woe')
            for arg in args:
                print(arg)
                if self.Train[arg].dtypes != 'object':
                    print(self.Train[arg].dtypes)
                    column_Name.append(arg)
                    cl.append(arg)
                    print(column_Name)

                else:
                    print("Column Name " + arg +
                          " is Object so no binning is done for it")
            column_Name.append(target)
            df = self.Train[column_Name]
            try:
                with open('woe.r', 'r') as f:
                    string = f.read()
                woe = STAP(string, "woe")
            except FileNotFoundError:
                path = input("set directory to path: ")
                from os import chdir
                chdir(path)
                with open('woe.r', 'r') as f:
                    string = f.read()
                woe = STAP(string, "woe")
            pandas2ri.activate()

            self.woe_based_bin = pandas2ri.ri2py(
                woe.woe_based_binning(df, target, event, cl))
            pandas2ri.deactivate()
            return self.woe_based_bin
        elif method == 'Chisq':
            print('Chisq')
            cl = []
            s = []
            for arg in args:
                s.append(arg)

            #column_name =[]
            if s[0] == 'ALL':
                print("ALL")
                for j in self.Train.columns:
                    if self.Train[j].dtypes != 'object':
                        cl.append(j)
                        #column_name.append(arg)
                cl.append(target)
                df = self.Train[cl]
                try:
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe')

                except FileNotFoundError:
                    path = input("set directory to path: ")
                    from os import chdir
                    chdir(path)
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe')

                pandas2ri.activate()
                print('pandas2ri is activated')
                self.chisq_based_bin = pandas2ri.ri2py(
                    woe.Chi_sq_based_bin(df))
                pandas2ri.deactivate()
                return self.chisq_based_bin
            else:
                cl = []
                for arg in args:
                    if self.Train[arg].dtypes != 'object':
                        cl.append(arg)
                    else:
                        print(arg + " is not continuous")
                if len(cl) == 0:
                    message = 'Binning cannot be done as none of the variables passed is continuous'
                    return message
                cl.append(target)
                df = self.Train[cl].copy()
                try:
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe module loaded')

                except FileNotFoundError:
                    path = input("set directory to path: ")
                    from os import chdir
                    chdir(path)
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe module loaded')

                pandas2ri.activate()
                print('pandas2ri is activated')
                self.chisq_based_bin = pandas2ri.ri2py(
                    woe.Chi_sq_based_bin(df))
                pandas2ri.deactivate()
                print('pandas2ri is deactivated')
                return self.chisq_based_bin
        elif method == 'Entropy':
            print('Entropy')
            cl = []
            s = []
            for arg in args:
                s.append(arg)
            if s[0] == 'ALL':
                print("ALL")
                for j in self.Train.columns:
                    if self.Train[j].dtypes != 'object':
                        cl.append(j)
                        #column_name.append(arg)
                cl.append(target)
                df = self.Train[cl]
                try:
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe')

                except FileNotFoundError:
                    path = input("set directory to path: ")
                    from os import chdir
                    chdir(path)
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe')

                pandas2ri.activate()
                print('pandas2ri is activated')
                self.entropy_based_bin = pandas2ri.ri2py(
                    woe.entropy_based_bin(df))
                pandas2ri.deactivate()
                return self.entropy_based_bin
            else:
                cl = []
                for arg in args:
                    if self.Train[arg].dtypes != 'object':
                        cl.append(arg)
                    else:
                        print(arg + " is not continuous")
                if len(cl) == 0:
                    message = 'Binning cannot be done as none of the variables passed is continuous'
                    return message
                cl.append(target)
                df = self.Train[cl].copy()
                try:
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe module loaded')

                except FileNotFoundError:
                    path = input("set directory to path: ")
                    from os import chdir
                    chdir(path)
                    with open('woe.r', 'r') as f:
                        string = f.read()
                        woe = STAP(string, "woe")
                        print('woe module loaded')

                pandas2ri.activate()
                print('pandas2ri is activated')
                self.entropy_based_bin = pandas2ri.ri2py(
                    woe.entropy_based_bin(df))
                pandas2ri.deactivate()
                print('pandas2ri is deactivated')
                return self.entropy_based_bin

            def crt():
                pass
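
The ri2py/activate pattern above is rpy2's legacy conversion API (pandas2ri.ri2py was removed in rpy2 3.0). As a minimal sketch, assuming the same woe.r defining Chi_sq_based_bin, the rpy2 3.x equivalent uses a scoped converter instead of the global activate()/deactivate():

import pandas as pd
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import STAP

with open('woe.r') as f:
    woe = STAP(f.read(), 'woe')  # exposes Chi_sq_based_bin to Python

def chisq_bin(df: pd.DataFrame) -> pd.DataFrame:
    # the converter is active only inside this block; the argument and the
    # return value are converted between pandas and R automatically
    with localconverter(default_converter + pandas2ri.converter):
        return woe.Chi_sq_based_bin(df)
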
Example No. 19
import pprint
import numpy as np
import pickle

import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import STAP

rpy2.robjects.numpy2ri.activate()
Rsession = rpy2.robjects.r

TSclust = importr('TSclust')

mfunc = 'myasmatrix <- function(dobj){return(as.matrix(dobj))}'
myasmatrix = STAP(mfunc, "myasmatrix")

mfunc = 'saxconv <- function(x,asize){return(convert.to.SAX.symbol( x, alpha=asize ))}'
saxconv = STAP(mfunc, "saxconv")


def znorm(x):
    return (x - x.mean()) / x.std()


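# Quick usage sketch, assuming the znorm and saxconv wrappers above and
# numpy2ri already activated: a z-normalized series maps to SAX symbols.
x_demo = np.sin(np.linspace(0, 2 * np.pi, 100))
sax_demo = np.asarray(saxconv.saxconv(znorm(x_demo), 10))  # alphabet size 10
print(sax_demo[:10])
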
def compute_and_save_metrics_R(data, n, dname, dirname, njobs=40):
    # Create a SAX symbolic representation with alphabet size 10
    Xsax = [np.asarray(saxconv.saxconv(znorm(_x), 10)) for _x in data]
    Xsax = np.vstack(Xsax)

    # p
Example No. 20
}

c.F4 <- function(data) {

  data <- ovo(data)
  aux <- mapply(function(d) {
    nrow(removing(d))/nrow(d)
  }, d=data)

  #aux <- mean(aux)
  return(aux)
}

"""

stringr_c = STAP(string, "stringr_c")
stringr_c._rpy2r.keys()


def my_evaluate(individual):
    dataFrame['label'] = individual
    robjects.globalenv['dataFrame'] = dataFrame
    fmla = Formula('label ~ .')

    ## -- linearity
    linearityVector = stringr_c.linearity_formula(fmla,
                                                  dataFrame,
                                                  measures="L2",
                                                  summary="return")
    linearity = linearityVector.rx(1)
    fitness = abs(globalLinear - linearity[0][0])
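
The fragment above feeds an R formula plus a data frame into a STAP-wrapped function. A self-contained sketch of that pattern (the fit_lm R function here is a made-up stand-in, not part of the original example):

import pandas as pd
from rpy2.robjects import Formula, default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import STAP

mod = STAP('fit_lm <- function(fmla, df) coef(lm(fmla, data = df))', 'mod')
df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0], 'y': [2.1, 3.9, 6.2, 8.1]})
with localconverter(default_converter + pandas2ri.converter):
    print(mod.fit_lm(Formula('y ~ .'), df))  # intercept and slope of y ~ x
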
Example No. 21
import json
import os

import pandas
import rpy2.rinterface_lib.embedded
import rpy2.robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr, STAP
# SysCall, the project's shell-command wrapper used below, is assumed to be
# imported from this repository's own module.


class EvalAdmix():
    'Class for executing evalAdmix commands'

    def __init__(self, prefix, mc, funcs):
        self.prefix = prefix
        self.mc = mc
        self.mcOnly = False
        if (self.mc != "none"):
            self.mcOnly = True
        self.qfiles = dict()
        self.runs = dict()
        self.qfilePaths = dict()

        if (self.mcOnly == True):
            self.parseMC()

        #import R functions
        self.utils = importr('utils')
        self.base = importr('base')
        self.grdevices = importr('grDevices')

        # import R plotting functions from evalAdmix
        with open(funcs, 'r') as f:
            string = f.read()
        self.myfunc = STAP(string, "myfunc")

    def parseMC(self):
        print("Parsing MC")
        with open(self.mc) as fh:
            newlist = fh.read().splitlines()
            print(newlist)

    def loadJson(self):
        self.loadQ()
        self.loadRuns()
        self.loadQfilePaths()

    def loadQ(self):
        qfn = self.prefix + ".qfiles.json"
        if os.path.isfile(qfn):
            with open(qfn) as fh:
                self.qfiles = json.load(fh)
        else:
            print("ERROR:", qfn, "does not exist.")
            print("Exiting program...")
            raise SystemExit

    def loadRuns(self):
        rfn = "cvRuns.json"
        if os.path.isfile(rfn):
            with open(rfn) as fh:
                self.runs = json.load(fh)
        else:
            print("ERROR:", rfn, "does not exist.")
            print("Exiting program...")
            raise SystemExit

    def loadQfilePaths(self):
        rfn = "qfilePaths.json"
        if os.path.isfile(rfn):
            with open(rfn) as fh:
                self.qfilePaths = json.load(fh)
        else:
            print("ERROR:", rfn, "does not exist.")
            print("Exiting program...")
            raise SystemExit
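
    # The three loaders above repeat one read-or-exit pattern; a hedged
    # refactor sketch of a shared helper (not part of the original class):
    def _loadJsonFile(self, fn):
        if not os.path.isfile(fn):
            print("ERROR:", fn, "does not exist.")
            print("Exiting program...")
            raise SystemExit
        with open(fn) as fh:
            return json.load(fh)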

    def evalAdmix(self, minK, maxK, np):
        ks = range(int(minK), int(maxK) + 1)
        for k in ks:
            for qf in self.qfiles[str(k)]:
                print(qf)
                temp = qf.split(".")

                #make .P file name
                temp[-1] = "P"
                pf = ".".join(temp)

                #make output .corres file name
                temp[-1] = "corres"
                eAf = ".".join(temp)

                #build command for evalAdmix
                evalAdmix_str_com = "evalAdmix -plink " + self.prefix + " -fname " + pf + " -qname " + qf + " -o " + eAf + " -P " + str(
                    np)

                call = SysCall(evalAdmix_str_com)
                call.run_program()
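
    # Hedged alternative to the string-concatenated command above: passing
    # argv as a list sidesteps shell quoting (uses subprocess directly in
    # place of the project's SysCall wrapper; sketch only).
    def evalAdmixSubprocess(self, pf, qf, eAf, np):
        import subprocess
        cmd = ["evalAdmix", "-plink", self.prefix, "-fname", pf,
               "-qname", qf, "-o", eAf, "-P", str(np)]
        subprocess.run(cmd, check=True)  # raises CalledProcessError on failure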

    def averageCorres(self, funcs):

        for k in self.runs:
            matrixList = list()
            print(k)
            for run in self.runs[k]:
                temp = run.split(".")
                temp[-1] = "corres"
                eAf = ".".join(temp)
                if (os.path.isfile(eAf)):
                    cor = self.base.as_matrix(self.utils.read_table(eAf))
                    matrixList.append(cor)
                else:
                    print("ERROR:", eAf, "does not exist.")
                    print("Exiting program...")
                    raise SystemExit
            reducedList = self.base.Reduce('+',
                                           matrixList)  #sum matrices in list
            cor = reducedList.ro / float(
                len(matrixList))  #div by num elements in list to get mean

            #get average corres matrix for major or minor cluster k
            q = self.parseClumpp(self.qfilePaths[k])

            #check if object q is NoneType
            if q is None:
                print("")
                print("")
                print("ERROR from evaladmix.py:")
                print(
                    "Empty matrices (Python NoneType) were returned When trying to create average matrices for Major/Minor clusters."
                )
                print("Check that the paths in qfilePaths.json are valid.")
                print(
                    "This error could occur if you have moved your admixture run folder after running distructRerun.py."
                )
                print(
                    "Alternatively, if you are using the Docker container this could have occurred if you ran distructRerun.py on your own system outside of the container."
                )
                print("")
                print("")
                raise SystemExit

            famf = self.prefix + ".fam"
            pop = self.base.as_matrix(self.utils.read_table(famf))

            # uncomment lines below for debugging of object types
            #print("Type for q is:")
            #print(type(q))

            # uncomment below lines for debugging.
            #print(type(pop))
            #print(type(famf))

            output = k + ".png"
            ordr = self.myfunc.orderInds(pop=self.base.as_vector(
                pop.rx(True, 2)),
                                         q=q)
            title = k

            self.grdevices.png(file=output)
            try:
                self.myfunc.plotCorRes(cor_mat=cor,
                                       pop=self.base.as_vector(pop.rx(True,
                                                                      2)),
                                       ord=ordr,
                                       title=title,
                                       max_z=0.1,
                                       min_z=-0.1)
            except rpy2.rinterface_lib.embedded.RRuntimeError:
                print("Error in R code (plotting functions) from evalAdmix.")
            self.grdevices.dev_off()

    def parseClumpp(self, f):
        if (os.path.isfile(f)):
            df = pandas.read_csv(f,
                                 delimiter="\s+",
                                 header=None,
                                 index_col=False)

            # drop() with inplace=True mutates df and returns None, so the
            # result must not be assigned back to a variable.
            df.drop(df.columns[0:5], axis=1, inplace=True)

            # uncomment below lines for debugging.
            #print("Type for df is:")
            #print(type(df))

            with localconverter(rpy2.robjects.default_converter +
                                pandas2ri.converter):
                Rdf = rpy2.robjects.conversion.py2rpy(df)

            # uncomment lines below for debugging.
            #print("Type for Rdf is:")
            #print(type(Rdf))
            #print(Rdf)

            return Rdf

    def Rcode(self, funcs, minK, maxK):
        #make file names
        famf = self.prefix + ".fam"

        ks = range(int(minK), int(maxK) + 1)
        for k in ks:
            title = "K=" + str(k)
            for qf in self.qfiles[str(k)]:
                temp = qf.split(".")
                temp[-1] = "corres"
                eAf = ".".join(temp)
                output = eAf + ".png"

                # read in files
                pop = self.base.as_matrix(self.utils.read_table(famf))

                print(qf)
                q = self.utils.read_table(qf)
                cor = self.base.as_matrix(self.utils.read_table(eAf))

                # run plotting functions
                ordr = self.myfunc.orderInds(pop=self.base.as_vector(
                    pop.rx(True, 2)),
                                             q=q)

                self.grdevices.png(file=output)
                try:
                    self.myfunc.plotCorRes(cor_mat=cor,
                                           pop=self.base.as_vector(
                                               pop.rx(True, 2)),
                                           ord=ordr,
                                           title=title,
                                           max_z=0.1,
                                           min_z=-0.1)
                except rpy2.rinterface_lib.embedded.RRuntimeError:
                    print("Error in R code (plotting functions) from evalAdmix.")
                self.grdevices.dev_off()
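
A hedged usage sketch of the class above; the file names, K range, and thread count are illustrative assumptions, and the funcs file must provide evalAdmix's orderInds and plotCorRes plotting functions:

ea = EvalAdmix(prefix="mydata", mc="none", funcs="visFuns.R")
ea.loadJson()                       # mydata.qfiles.json, cvRuns.json, qfilePaths.json
ea.evalAdmix(minK=2, maxK=5, np=4)  # one evalAdmix run per Q file
ea.Rcode("visFuns.R", 2, 5)         # one .corres heatmap per run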