def wrapper(args):
    try:
        npar = args.noiseparam.strip('[').strip(']').split(',')
    except AttributeError:
        # no noise parameters were supplied
        npar = []
    nbins = args.nbins
    # Run function
    if args.i:
        df = pd.io.parsers.read_csv(
            args.i, delim_whitespace=True,
            dtype={'seqs': str, 'batch': int})
    else:
        df = pd.io.parsers.read_csv(
            sys.stdin, delim_whitespace=True,
            dtype={'seqs': str, 'batch': int})
    # if the dataframe already contains bin count columns, it has been sorted
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError('Library already sorted!')
    model_df = io.load_model(args.model)
    output_df = main(
        df, model_df, args.noisemodel, npar,
        nbins, start=args.start, end=args.end)
    if args.out:
        outloc = open(args.out, 'w')
    else:
        outloc = sys.stdout
    pd.set_option('max_colwidth', int(1e8))
    # Validate dataframe before writing output
    output_df = qc.validate_dataset(output_df, fix=True)
    io.write(output_df, outloc)
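# --- Hedged usage sketch (added for illustration; not in the original
# source). wrapper() is the command-line entry point for the simulated sort
# and expects an argparse-style namespace. The attribute values below are
# hypothetical placeholders; in the original module, main() resolves to the
# sorting routine defined further below.
def _example_wrapper():
    import argparse
    args = argparse.Namespace(
        i='library.txt',           # whitespace-delimited file with a seq column
        model='model.txt',         # model file readable by io.load_model
        noisemodel='Normal',
        noiseparam='[0.2]',        # parsed above into npar == ['0.2']
        nbins=4,
        start=0,
        end=None,
        out='sorted_library.txt')
    wrapper(args)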
def main(data_df, model_df, start=0, end=None, err=False):
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
    # set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    # Cut the sequences based on start and end, then check that the
    # resulting length matches the model length
    if (start != 0 or end):
        data_df.loc[:, seq_col_name] = \
            data_df.loc[:, seq_col_name].str.slice(start, end)
    if modeltype == 'MAT':
        if len(data_df.loc[0, seq_col_name]) != len(model_df.loc[:, 'pos']):
            raise SortSeqError('model length does not match dataset length')
    elif modeltype == 'NBR':
        if len(data_df.loc[0, seq_col_name]) != len(model_df.loc[:, 'pos']) + 1:
            raise SortSeqError('model length does not match dataset length')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]
    if not end:
        seqL = len(data_df[seq_col_name][0]) - start
    else:
        seqL = end - start
    data_df = data_df[data_df[seq_col_name].apply(len) == seqL]
    # make a numpy array out of the model dataframe
    model_df_headers = ['val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    value = np.transpose(np.array(model_df[model_df_headers]))
    # now evaluate the expression of each sequence according to the model
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        np.array(model_df[model_df_headers]), seq_mat, wtrow)
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)
    # we must divide by the total number of counts in each bin for the MI calculator
    # temp_sorted[col_headers] = temp_sorted[col_headers].div(temp_sorted['ct'],axis=0)
    MI = EstimateMutualInfoforMImax.alt4(temp_sorted, coarse_graining_level=0)
    if not err:
        Std = np.NaN
    else:
        # Estimate the standard error by computing MI on 15 random
        # half-size subsamples of the dataset
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            sub_MI[i], sub_std = main(sub_df, model_df, err=False)
        Std = np.std(sub_MI) / np.sqrt(2)
    return MI, Std
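# --- Hedged usage sketch (added for illustration; not in the original
# source). Alias the predictive-information main() defined above, since
# later definitions in this section reuse the name. The file names below
# are hypothetical placeholders for MPAthic-style inputs.
_predictive_info = main

def _example_predictive_info():
    data_df = pd.io.parsers.read_csv(
        'sorted_library.txt', delim_whitespace=True)  # needs ct_* bin columns
    model_df = io.load_model('model.txt')
    # err=True triggers the subsampling error estimate, at roughly 15x the cost
    MI, Std = _predictive_info(data_df, model_df, err=True)
    return MI, Std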
def Berg_von_Hippel(df, dicttype, foreground=1, background=0, pseudocounts=1):
    '''Learn models using the Berg-von Hippel method. The foreground
    sequences are usually in bin_1 and the background in bin_0; this can
    be changed via flags.'''
    seq_dict, inv_dict = utils.choose_dict(dicttype)
    # check that the chosen foreground and background columns actually exist
    columns_to_check = {'ct_' + str(foreground), 'ct_' + str(background)}
    if not columns_to_check.issubset(set(df.columns)):
        raise SortSeqError('Foreground or Background column does not exist!')
    # get counts of each base at each position
    foreground_counts = utils.profile_counts(df, dicttype, bin_k=foreground)
    background_counts = utils.profile_counts(df, dicttype, bin_k=background)
    binheaders = utils.get_column_headers(foreground_counts)
    # add pseudocounts to each position
    foreground_counts[binheaders] = foreground_counts[binheaders] + pseudocounts
    background_counts[binheaders] = background_counts[binheaders] + pseudocounts
    # make sure there are no zeros in counts after adding pseudocounts
    ct_headers = utils.get_column_headers(foreground_counts)
    if foreground_counts[ct_headers].isin([0]).values.any():
        raise SortSeqError(
            'Some bases have no representation in the foreground data; '
            'use pseudocounts to avoid failure of the learning method')
    if background_counts[ct_headers].isin([0]).values.any():
        raise SortSeqError(
            'Some bases have no representation in the background data; '
            'use pseudocounts to avoid failure of the learning method')
    # normalize to compute frequencies
    foreground_freqs = foreground_counts.copy()
    background_freqs = background_counts.copy()
    foreground_freqs[binheaders] = foreground_freqs[binheaders].div(
        foreground_freqs[binheaders].sum(axis=1), axis=0)
    background_freqs[binheaders] = background_freqs[binheaders].div(
        background_freqs[binheaders].sum(axis=1), axis=0)
    # matrix entries are negative log enrichment ratios
    output_df = -np.log(foreground_freqs / background_freqs)
    # rename columns accordingly (instead of ct_ we want val_)
    rename_dict = {'ct_' + str(inv_dict[i]): 'val_' + str(inv_dict[i])
                   for i in range(len(seq_dict))}
    output_df = output_df.rename(columns=rename_dict)
    return output_df
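# --- Hedged worked example (added for illustration; not in the original
# source). Each Berg-von Hippel matrix entry is a negative log enrichment
# ratio between foreground and background base frequencies, so bases
# enriched in the foreground bin receive lower (more favorable) energies.
def _example_bvh_entry():
    # hypothetical per-position frequencies of one base, after pseudocounts
    f_foreground, f_background = 0.4, 0.25
    return -np.log(f_foreground / f_background)  # approx. -0.47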
def main(df, mp, noisetype, npar, nbins, sequence_library=True,
         start=0, end=None):
    # validate noise parameters
    if not isinstance(npar, list):
        raise SortSeqError('Noise parameters must be given as a list')
    if noisetype == 'Normal':
        if len(npar) != 1:
            raise SortSeqError(
                'For a Normal noise model there must be one input '
                'parameter (the width of the normal distribution)')
    if noisetype == 'LogNormal':
        if len(npar) != 2:
            raise SortSeqError(
                'For a LogNormal noise model there must be 2 input parameters')
    if nbins <= 1:
        raise SortSeqError('number of bins must be greater than 1')
    # generate the predicted energy of each sequence
    df = evaluate_model.main(df, mp, left=start, right=None)
    # determine which noise model to use
    if noisetype == 'LogNormal':
        NoiseModelSort = Models.LogNormalNoise(npar)
    elif noisetype == 'Normal':
        NoiseModelSort = Models.NormalNoise(npar)
    elif noisetype == 'None':
        # use an effectively zero-width normal distribution
        NoiseModelSort = Models.NormalNoise([1e-16])
    else:
        NoiseModelSort = Models.CustomModel(noisetype, npar)
    # apply noise to our calculated energies
    noisyexp, listnoisyexp = NoiseModelSort.genlist(df)
    # determine expression cutoffs for the bins (equal-count quantiles)
    noisyexp.sort()
    cutoffs = list(
        noisyexp[np.linspace(0, len(noisyexp), nbins, endpoint=False, dtype=int)])
    cutoffs.append(np.inf)
    seqs_arr = np.zeros([len(listnoisyexp), nbins], dtype=int)
    # split each sequence's counts into bins based on the calculated cutoffs
    for i, entry in enumerate(listnoisyexp):
        seqs_arr[i, :] = np.histogram(entry, bins=cutoffs)[0]
    col_labels = ['ct_' + str(i + 1) for i in range(nbins)]
    if sequence_library:
        df['ct_0'] = utils.sample(df['ct'], int(df['ct'].sum() / nbins))
    output_df = pd.concat([df, pd.DataFrame(seqs_arr, columns=col_labels)], axis=1)
    col_labels = utils.get_column_headers(output_df)
    output_df['ct'] = output_df[col_labels].sum(axis=1)
    output_df = output_df.drop('val', axis=1)
    return output_df
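# --- Hedged usage sketch (added for illustration; not in the original
# source). Alias the sorting main() defined above, since later definitions
# in this section reuse the name. `library_df` and `model_df` are assumed
# to be MPAthic-style dataframes; the noise width 0.2 is an arbitrary example.
_simulate_sort = main

def _example_simulate_sort(library_df, model_df):
    # sort the library into 4 bins under Normal noise of width 0.2
    return _simulate_sort(library_df, model_df, 'Normal', [0.2], 4)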
def main(df, lm='IM', modeltype='MAT', LS_means_std=None,
         db=None, iteration=30000, burnin=1000, thin=10,
         runnum=0, initialize='LS', start=0, end=None, foreground=1,
         background=0, alpha=0, pseudocounts=1, test=False,
         drop_library=False, verbose=False):
    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
    # Check that the chosen dictionary type correctly describes the
    # sequences. A limitation of this test: DNA sequence checked against a
    # protein dictionary will still pass, because A, C, G, T are also valid
    # amino acids.
    # set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype, modeltype='MAT')
    # wtseq = utils.profile_counts(df.copy(),dicttype,return_wtseq=True,start=start,end=end)
    # wt_seq_dict_list = [{inv_dict[np.mod(i+1+seq_dict[w],len(seq_dict))]:i for i in range(len(seq_dict)-1)} for w in wtseq]
    par_seq_dict = {base: idx for base, idx in seq_dict.items()
                    if idx != (len(seq_dict) - 1)}
    # drop any rows with ct = 0
    df = df[df.loc[:, 'ct'] != 0]
    df.reset_index(drop=True, inplace=True)
    # If there are sequences of different lengths, print an error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
        sys.stderr.write('Lengths of all sequences are not the same!')
    # select target sequence region
    df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    # make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    # create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True, drop=True)
    # Drop any sequences with incorrect length
    if not end:
        # if no value for the end of the sequence was supplied, assume the
        # first sequence has the correct length
        seqL = len(df[seq_col_name][0]) - start
    else:
        seqL = end - start
    df = df[df[seq_col_name].apply(len) == seqL]
    df.reset_index(inplace=True, drop=True)
    # Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df, dicttype, foreground=foreground, background=background,
            pseudocounts=pseudocounts)
    if lm == 'LS':
        # If we don't have a penalty for ridge regression, first check that
        # we at least have all possible base values so that the analysis
        # will not fail.
        if LS_means_std:  # user supplied preset means and stds for each bin
            means_std_df = io.load_meanstd(LS_means_std)
            # change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels
            # reweight each sequence by dividing counts by the bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None
        # drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]
        df.reset_index(inplace=True, drop=True)
        # For sort-seq experiments, bin_0 is the library only and is not the
        # lowest expression bin, even though it will be treated as such if we
        # proceed. Therefore, if drop_library is passed, drop this column
        # from the analysis.
        if drop_library:
            if 'ct_0' not in df.columns:
                raise SortSeqError(
                    'drop_library option was passed, but no ct_0 column exists')
            df.drop('ct_0', axis=1, inplace=True)
            col_headers = utils.get_column_headers(df)
            if len(col_headers) < 2:
                raise SortSeqError(
                    'After dropping the library there are no longer '
                    'enough columns to run the analysis')
        # parameterize sequences into 3xL vectors
        raveledmat, batch, sw = utils.genweightandmat(
            df, par_seq_dict, dicttype, means=means, modeltype=modeltype)
        # Use ridge regression to find the matrix.
        emat = Compute_Least_Squares(raveledmat, batch, sw, alpha=alpha)
    if lm == 'IM':
        seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)
        # this is an MCMC routine; initialize either randomly or from a
        # least-squares fit
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]), len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(len(df['seq'][0]) - 1, len(seq_dict))
        elif initialize == 'LS':
            emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
            emat_0_df = main(df.copy(), lm='LS', modeltype=modeltype,
                             alpha=alpha, start=0, end=None, verbose=verbose)
            emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
            # pymc doesn't take a sparse matrix
        emat = MaximizeMI_memsaver(
            seq_mat, df.copy(), emat_0, wtrow, db=db, iteration=iteration,
            burnin=burnin, thin=thin, runnum=runnum, verbose=verbose)
    # now format the energy matrices to get them ready for output
    if (lm == 'IM' or lm == 'memsaver'):
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat))
    elif lm == 'ER':
        # the emat for this format is currently transposed compared to the
        # other formats; it is also already a dataframe with columns
        # [pos, val_...]
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_typical = emat[emat_cols]
        emat_typical = gauge.fix_matrix(np.array(emat_typical))
    else:  # must be least squares
        emat_typical = utils.emat_typical_parameterization(emat, len(seq_dict))
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat_typical))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
    em = pd.DataFrame(emat_typical)
    em.columns = val_cols
    # add position column
    if modeltype == 'NBR':
        pos = pd.Series(range(start, start - 1 + len(df[seq_col_name][0])),
                        name='pos')
    else:
        pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                        name='pos')
    output_df = pd.concat([pos, em], axis=1)
    # Validate model and return
    output_df = qc.validate_model(output_df, fix=True)
    return output_df
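# --- Hedged usage sketch (added for illustration; not in the original
# source). Alias the model-learning main() defined above. `df` is assumed
# to be a sorted dataset with a seq column and ct_* bin counts; parameter
# values are arbitrary examples.
_learn_model = main

def _example_learn_model(df):
    # quick least-squares (ridge regression) fit of a matrix model
    ls_model = _learn_model(df.copy(), lm='LS', modeltype='MAT', alpha=0.1)
    # information-maximization fit, initialized from a least-squares model
    im_model = _learn_model(df.copy(), lm='IM', modeltype='MAT',
                            initialize='LS', iteration=30000, burnin=1000,
                            thin=10)
    return ls_model, im_model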