def main(dataset_df, model_df, left=None, right=None):
    """Evaluate a model on a window of every sequence in a dataset.

    The window [start, end) is chosen in one of three ways:
      * ``left`` given:  the window starts at ``left``;
      * ``right`` given: the window ends at ``right``;
      * neither given:   the window is taken from the model's 'pos' column.
    For 'NBR' models the window is one character longer than the number of
    model rows.

    Parameters
    ----------
    dataset_df : dataset dataframe; must pass ``qc.validate_dataset``.
    model_df   : model dataframe; must pass ``qc.validate_model``.
    left       : optional window start (mutually exclusive with ``right``).
    right      : optional window end (mutually exclusive with ``left``).

    Returns
    -------
    The result of ``qc.validate_dataset(out_df, fix=True)``, where ``out_df``
    is a copy of ``dataset_df`` whose sequence column has been cropped to the
    window and which carries the model values in a 'val' column.

    Raises
    ------
    SortSeqError if both ``left`` and ``right`` are given, if the window
    falls outside the first sequence, or if the model type is unrecognized.
    """
    # Validate dataframes
    qc.validate_dataset(dataset_df)
    qc.validate_model(model_df)

    # Detect model type based on columns
    seqtype, modeltype = qc.get_model_type(model_df)
    seqcol = qc.seqtype_to_seqcolname_dict[seqtype]

    # Set start and end based on left or right
    if not ((left is None) or (right is None)):
        raise SortSeqError('Cannot set both left and right at same time.')

    # An NBR model spans one more character than it has rows.
    extra = 1 if modeltype == 'NBR' else 0
    num_rows = model_df.shape[0]
    if left is not None:
        start = left
        end = start + num_rows + extra
    elif right is not None:
        end = right
        start = end - num_rows - extra
    else:
        positions = model_df['pos'].values
        start = positions[0]
        end = positions[-1] + 1 + extra
    assert start < end

    # Validate start and end positions against the first sequence.
    # .iloc is used so that a dataset whose index does not start at 0
    # (e.g. a filtered dataframe) is handled as well.
    seq_length = len(dataset_df[seqcol].iloc[0])
    if start < 0:
        raise SortSeqError('Invalid start=%d'%start)
    if end > seq_length:
        raise SortSeqError('Invalid end=%d for seq_length=%d'%(end,seq_length))

    # Select target sequence region. The column that matches the model's
    # sequence type is cropped (not a hard-coded 'seq' column), so that RNA
    # and protein datasets work too.
    out_df = dataset_df.copy()
    out_df.loc[:, seqcol] = out_df.loc[:, seqcol].str.slice(start, end)

    # Create model object of correct type
    if modeltype == 'MAT':
        mymodel = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        mymodel = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s'%modeltype)

    # Compute values
    out_df['val'] = mymodel.evaluate(out_df)

    # Validate dataframe and return
    return qc.validate_dataset(out_df, fix=True)
def __init__(self,model_df):
    """
    Build the model from a model dataframe.

    A copy of model_df is validated (with fix=True) and must be of model
    type 'NBR'; otherwise SortSeqError is raised. The instance stores the
    sequence type, the character dictionaries returned by
    utils.choose_dict, the validated dataframe, the site length (number
    of model rows plus one) and the transposed array of value columns.
    """
    validated_df = qc.validate_model(model_df.copy(), fix=True)
    seqtype, modeltype = qc.get_model_type(validated_df)
    if modeltype != 'NBR':
        raise SortSeqError('Invalid modeltype: %s'%modeltype)

    dictionaries = utils.choose_dict(seqtype, modeltype=modeltype)
    forward_dict, inverse_dict = dictionaries

    self.seqtype = seqtype
    self.seq_dict = forward_dict
    self.inv_dict = inverse_dict
    self.df = validated_df

    # One more character than there are rows in the model dataframe
    num_rows = validated_df.shape[0]
    self.length = num_rows + 1

    # Matrix part of the model dataframe, one row per value column
    value_cols = qc.get_cols_from_df(validated_df, 'vals')
    self.matrix = np.array(validated_df[value_cols]).T
def main(df,lm='IM',modeltype='MAT',LS_means_std=None,
         db=None,iteration=30000,burnin=1000,thin=10,
         runnum=0,initialize='LS',start=0,end=None,foreground=1,
         background=0,alpha=0,pseudocounts=1,test=False,drop_library=False,
         verbose=False):
    """Learn a model dataframe from a dataset dataframe.

    Parameters
    ----------
    df           : dataset dataframe with exactly one sequence column, a
                   'ct' column and the count columns reported by
                   ``utils.get_column_headers``.
    lm           : learning method: 'ER' (Berg_von_Hippel), 'LS'
                   (Compute_Least_Squares) or 'IM' (MaximizeMI_memsaver).
    modeltype    : 'MAT' or 'NBR'.
    LS_means_std : if truthy, passed to ``io.load_meanstd`` to obtain
                   per-bin means and stds (used only when lm == 'LS').
    db, iteration, burnin, thin, runnum :
                   forwarded to ``MaximizeMI_memsaver`` (lm == 'IM').
    initialize   : 'rand' or 'LS'; how the starting matrix for 'IM' is made.
    start, end   : slice applied to every sequence before learning.
    foreground, background, pseudocounts :
                   forwarded to ``Berg_von_Hippel`` (lm == 'ER').
    alpha        : forwarded to ``Compute_Least_Squares``.
    test         : unused in this function.
    drop_library : if true (lm == 'LS'), drop the 'ct_0' column first.
    verbose      : forwarded to ``MaximizeMI_memsaver`` and to the nested
                   'LS' call used for initialization.

    Returns
    -------
    The result of ``qc.validate_model(output_df, fix=True)`` where
    ``output_df`` has a 'pos' column followed by one 'val_*' column per
    dictionary entry.

    Raises
    ------
    SortSeqError if the dataframe does not have exactly one sequence
    column, if ``drop_library`` is requested but there is no 'ct_0' column
    or too few columns remain, or if ``initialize`` (or ``modeltype`` with
    ``initialize='rand'``) is not recognized.
    """
    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)

    # NOTE: nothing here checks that the chosen dictionary type really
    # describes the sequences. DNA sequences would also pass as protein,
    # because A, C, G, T are valid amino acids as well.

    # Set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    # Same mapping as seq_dict, without the entry whose value is the last index
    last_index = len(seq_dict) - 1
    par_seq_dict = {v:k for v,k in seq_dict.items() if k != last_index}

    # Drop any rows with ct = 0
    df = df[df.loc[:,'ct'] != 0]
    df.reset_index(drop=True,inplace=True)

    # If there are sequences of different lengths, then print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
        sys.stderr.write('Lengths of all sequences are not the same!')

    # Select target sequence region
    df.loc[:,seq_col_name] = df.loc[:,seq_col_name].str.slice(start,end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    # Make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    # Create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True,drop=True)

    # Drop any sequences with incorrect length
    if end is None:
        # No end was supplied: assume the first sequence has the correct
        # length. The sequences were already sliced from `start` above, so
        # `start` must not be subtracted a second time here.
        seqL = len(df[seq_col_name][0])
    else:
        seqL = end-start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True,drop=True)

    # Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df,dicttype,foreground=foreground,background=background,
            pseudocounts=pseudocounts)

    if lm == 'LS':
        if LS_means_std:
            # User supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            # Change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels
            # Change weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None

        # Drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]
        df.reset_index(inplace=True,drop=True)

        # For sort-seq experiments, bin_0 is library only and isn't the
        # lowest expression even though it would be treated as such if we
        # proceeded. Therefore, if drop_library is passed, drop this column
        # from the analysis.
        if drop_library:
            if 'ct_0' not in df.columns:
                raise SortSeqError('''drop_library option was passed, but no ct_0 column exists''')
            # axis=1: 'ct_0' is a column, not a row label
            df = df.drop('ct_0',axis=1)
            col_headers = utils.get_column_headers(df)
            if len(col_headers) < 2:
                raise SortSeqError(
                    '''After dropping library there are no longer enough columns to run the analysis''')

        # Parameterize sequences into 3xL vectors
        raveledmat,batch,sw = utils.genweightandmat(
            df,par_seq_dict,dicttype,means=means,modeltype=modeltype)
        # Use ridge regression to find matrix.
        emat = Compute_Least_Squares(raveledmat,batch,sw,alpha=alpha)

    if lm == 'IM':
        seq_mat,wtrow = numerics.dataset2mutarray(df.copy(),modeltype)
        # This is also an MCMC routine, do the same as above.
        if initialize == 'rand':
            first_seq_length = len(df[seq_col_name][0])
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(first_seq_length,len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(first_seq_length-1,len(seq_dict))
            else:
                raise SortSeqError('Unrecognized model type %s'%modeltype)
        elif initialize == 'LS':
            emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
            # df is already sliced, hence start=0 and end=None
            emat_0_df = main(df.copy(),lm='LS',modeltype=modeltype,alpha=alpha,start=0,end=None,verbose=verbose)
            # pymc doesn't take sparse mat
            emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
        else:
            raise SortSeqError('Unrecognized initialize option %s'%initialize)
        emat = MaximizeMI_memsaver(
            seq_mat,df.copy(),emat_0,wtrow,db=db,iteration=iteration,burnin=burnin,
            thin=thin,runnum=runnum,verbose=verbose)

    # Now format the energy matrices to get them ready to output
    if (lm == 'IM' or lm == 'memsaver'):
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat))
    elif lm == 'ER':
        # The emat for this format is currently transposed compared to other
        # formats; it is also already a data frame with columns [pos,val_...]
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_typical = emat[emat_cols]
        emat_typical = (gauge.fix_matrix((np.array(emat_typical))))
    else: # must be Least squares
        emat_typical = utils.emat_typical_parameterization(emat,len(seq_dict))
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat_typical))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat_typical))

    em = pd.DataFrame(emat_typical)
    em.columns = val_cols

    # Add position column (one fewer position for NBR models)
    if modeltype == 'NBR':
        pos = pd.Series(range(start,start - 1 + len(df[seq_col_name][0])),name='pos')
    else:
        pos = pd.Series(range(start,start + len(df[seq_col_name][0])),name='pos')
    output_df = pd.concat([pos,em],axis=1)

    # Validate model and return
    output_df = qc.validate_model(output_df,fix=True)
    return output_df
def main(model_df, contig_list, numsites=10, verbose=False):
    """Scan contigs with a model and return the highest-valued sites.

    Parameters
    ----------
    model_df    : model dataframe; must pass ``qc.validate_model``.
    contig_list : iterable of ``(contig_str, contig_name, pos_offset)``
                  tuples. Contigs shorter than the model length are skipped.
    numsites    : maximum number of sites kept in the returned dataframe.
    verbose     : if true, write a '.' to stdout after each scanned contig
                  and a newline at the end.

    Returns
    -------
    The result of ``qc.validate_sitelist(sitelist_df, fix=True)``, where
    ``sitelist_df`` has columns 'val', the sequence column, 'left', 'right',
    'ori' and 'contig', sorted by descending 'val' and cropped to at most
    ``numsites`` rows. For DNA both orientations ('+' and '-') are scanned.

    Raises
    ------
    SortSeqError if a contig contains a character outside the alphabet for
    the model's sequence type, if the model type is unrecognized, or if no
    full-length site is found.
    """
    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    seq_dict,inv_dict = utils.choose_dict(seqtype,modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    invalid_char = re.compile(r"[^%s]"%alphabet)
    for contig_str, contig_name, pos_offset in contig_list:
        if invalid_char.search(contig_str):
            raise SortSeqError(
                'Invalid character for seqtype %s found in %s.'%
                (seqtype,contig_name))

    # Create model object to evaluate on seqs
    if modeltype == 'MAT':
        model_obj = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        model_obj = Models.NeighborModel(model_df)
    else:
        # Without this branch an unknown type surfaced as a NameError below.
        raise SortSeqError('Unrecognized model type %s'%modeltype)

    # Create list of dataframes, one for each contig
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    columns = ['val',seq_col,'left','right','ori','contig']
    sitelist_df = pd.DataFrame(columns=columns)
    for contig_str, contig_name, pos_offset in contig_list:
        if len(contig_str) < L:
            continue
        this_df = pd.DataFrame(columns=columns)
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df['left'] = poss + pos_offset
        this_df['right'] = poss + pos_offset + L - 1
        this_df[seq_col] = fast.seq2sitelist(contig_str,L) #Cython
        this_df['ori'] = '+'
        this_df['contig'] = contig_name
        this_df['val'] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df,this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well
        if seqtype=='dna':
            this_df[seq_col] = fast.seq2sitelist(contig_str,L,rc=True) #Cython
            this_df['ori'] = '-'
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df,this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by='val', ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True,inplace=True)

        # Crop list at numsites (done per contig, so the running list never
        # holds more than numsites rows plus one contig's worth of sites)
        if sitelist_df.shape[0]>numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            # Emits the same bytes as the Python 2 statement "print '.',"
            # (dot plus separating space), but is also valid Python 3.
            sys.stdout.write('. ')
            sys.stdout.flush()

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0]==0:
        raise SortSeqError(
            'No full-length sites found within provided contigs.')

    sitelist_df = qc.validate_sitelist(sitelist_df,fix=True)
    return sitelist_df