def main(data_df, model_df, start=0, end=None, err=False,
         coarse_graining_level=0):
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    # Set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    # Cut the sequences based on start and end, then check that the
    # resulting length matches the model
    if (start != 0 or end):
        data_df.loc[:, seq_col_name] = \
            data_df.loc[:, seq_col_name].str.slice(start, end)
    if modeltype == 'MAT':
        if len(data_df.loc[0, seq_col_name]) != len(model_df.loc[:, 'pos']):
            raise SortSeqError('model length does not match dataset length')
    elif modeltype == 'NBR':
        if len(data_df.loc[0, seq_col_name]) != len(model_df.loc[:, 'pos']) + 1:
            raise SortSeqError('model length does not match dataset length')

    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]

    # Determine target sequence length; if no end was supplied, assume the
    # first sequence has the correct length
    if not end:
        seqL = len(data_df[seq_col_name][0]) - start
    else:
        seqL = end - start
    data_df = data_df[data_df[seq_col_name].apply(len) == seqL]

    # Make a numpy array out of the model dataframe
    model_df_headers = ['val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    value = np.transpose(np.array(model_df[model_df_headers]))

    # Now evaluate the expression of each sequence according to the model
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        np.array(model_df[model_df_headers]), seq_mat, wtrow)

    # Sort sequences by model prediction
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    # We must divide by the total number of counts in each bin for the MI
    # calculator
    # temp_sorted[col_headers] = temp_sorted[col_headers].div(temp_sorted['ct'],axis=0)
    MI = EstimateMutualInfoforMImax.alt4(
        temp_sorted, coarse_graining_level=coarse_graining_level)

    # Estimate the error by subsampling, if requested
    if not err:
        Std = np.NaN
    else:
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            sub_MI[i], sub_std = main(sub_df, model_df, err=False)
        Std = np.std(sub_MI) / np.sqrt(2)
    return MI, Std
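# A minimal usage sketch (illustrative, not from the original source):
# 'dataset.txt' and 'model.txt' are hypothetical whitespace-delimited files
# in the expected dataset/model formats. With err=True the routine also
# returns the subsampling-based error estimate computed above.
if __name__ == '__main__':
    import pandas as pd
    data_df = pd.read_csv('dataset.txt', delim_whitespace=True)  # hypothetical file
    model_df = pd.read_csv('model.txt', delim_whitespace=True)   # hypothetical file
    MI, Std = main(data_df, model_df, err=True)
    print 'MI = %f +/- %f bits' % (MI, Std)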
def __init__(self, model_df):
    """
    Constructor takes model parameters in the form of a model dataframe
    """
    model_df = qc.validate_model(model_df.copy(), fix=True)
    seqtype, modeltype = qc.get_model_type(model_df)
    if not modeltype == 'NBR':
        raise SortSeqError('Invalid modeltype: %s' % modeltype)

    seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)
    self.seqtype = seqtype
    self.seq_dict = seq_dict
    self.inv_dict = inv_dict
    self.df = model_df
    self.length = model_df.shape[0] + 1

    # Extract matrix part of model dataframe
    headers = qc.get_cols_from_df(model_df, 'vals')
    self.matrix = np.transpose(np.array(model_df[headers]))
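# A minimal usage sketch (illustrative, not from the original source): the
# scanning code elsewhere in this repo constructs this class as
# Models.NeighborModel(model_df), so given a valid NBR model dataframe
# (loaded here from a hypothetical 'nbr_model.txt') construction looks like:
if __name__ == '__main__':
    import pandas as pd
    model_df = pd.read_csv('nbr_model.txt', delim_whitespace=True)  # hypothetical file
    model_obj = Models.NeighborModel(model_df)
    print model_obj.length        # site length handled by this model (positions + 1)
    print model_obj.matrix.shape  # (num value columns, num neighbor positions)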
def Berg_von_Hippel(df, dicttype, foreground=1, background=0, pseudocounts=1):
    '''Learn models using the Berg-von Hippel method. The foreground
    sequences are usually in bin_1 and the background in bin_0; this can
    be changed via flags.'''
    seq_dict, inv_dict = utils.choose_dict(dicttype)

    # Check that the chosen foreground and background columns actually exist
    columns_to_check = {'ct_' + str(foreground), 'ct_' + str(background)}
    if not columns_to_check.issubset(set(df.columns)):
        raise SortSeqError('Foreground or Background column does not exist!')

    # Get counts of each base at each position
    foreground_counts = utils.profile_counts(df, dicttype, bin_k=foreground)
    background_counts = utils.profile_counts(df, dicttype, bin_k=background)
    binheaders = utils.get_column_headers(foreground_counts)

    # Add pseudocounts to each position
    foreground_counts[binheaders] = foreground_counts[binheaders] + pseudocounts
    background_counts[binheaders] = background_counts[binheaders] + pseudocounts

    # Make sure there are no zeros in counts after addition of pseudocounts
    ct_headers = utils.get_column_headers(foreground_counts)
    if foreground_counts[ct_headers].isin([0]).values.any():
        raise SortSeqError(
            'There are some bases without any representation in the '
            'foreground data; you should use pseudocounts to avoid failure '
            'of the learning method')
    if background_counts[ct_headers].isin([0]).values.any():
        raise SortSeqError(
            'There are some bases without any representation in the '
            'background data; you should use pseudocounts to avoid failure '
            'of the learning method')

    # Normalize to compute frequencies
    foreground_freqs = foreground_counts.copy()
    background_freqs = background_counts.copy()
    foreground_freqs[binheaders] = foreground_freqs[binheaders].div(
        foreground_freqs[binheaders].sum(axis=1), axis=0)
    background_freqs[binheaders] = background_freqs[binheaders].div(
        background_freqs[binheaders].sum(axis=1), axis=0)

    # Energy values are the negative log ratio of foreground to background
    output_df = -np.log(foreground_freqs / background_freqs)

    # Change column names accordingly (instead of ct_ we want val_)
    rename_dict = {
        'ct_' + str(inv_dict[i]): 'val_' + str(inv_dict[i])
        for i in range(len(seq_dict))}
    output_df = output_df.rename(columns=rename_dict)
    return output_df
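# A minimal usage sketch (illustrative, not from the original source): given
# a dataset dataframe with ct_0 (background) and ct_1 (foreground) count
# columns, loaded from a hypothetical 'dataset.txt'. Each val_* entry is
# -log(f_fg/f_bg), so more negative values mark enrichment in the foreground.
if __name__ == '__main__':
    import pandas as pd
    df = pd.read_csv('dataset.txt', delim_whitespace=True)  # hypothetical file
    emat_df = Berg_von_Hippel(df, 'dna', foreground=1, background=0,
                              pseudocounts=1)
    print emat_df.head()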
x = fast.seq2sitelist(seq, site_length, rc=True, safe=False)
c_time = time.time() - t
print 'cython, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\
    (c_time, len(x))
print '%.1f-fold speedup.' % (p_time / c_time)
print '-----------------------------'

# Test seqs2array_for_matmodel
sites = fast.seq2sitelist(seq, 20)
site_length = len(sites[0])
num_sites = len(sites)

t = time.time()
seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype='MAT')
sitearray_p = np.zeros([num_sites, (site_length * len(seq_dict))], dtype=int)
for i, site in enumerate(sites):
    sitearray_p[i, :] = utils.seq2mat(site, seq_dict).ravel()
p_time = time.time() - t
print 'python, seqs2array_for_matmodel: %f sec to convert %d %s seqs of length %d'%\
    (p_time, num_sites, seqtype, site_length)

t = time.time()
sitearray = fast.seqs2array_for_matmodel(sites, seqtype, safe=False)
c_time = time.time() - t
print 'cython, seqs2array_for_matmodel: %f sec to convert %d %s seqs of length %d'%\
    (c_time, num_sites, seqtype, site_length)
print '%.1f-fold speedup.' % (p_time / c_time)
def main(df, lm='IM', modeltype='MAT', LS_means_std=None,
         db=None, iteration=30000, burnin=1000, thin=10,
         runnum=0, initialize='LS', start=0, end=None, foreground=1,
         background=0, alpha=0, pseudocounts=1, test=False,
         drop_library=False, verbose=False):

    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
    '''Check to make sure the chosen dictionary type correctly describes
    the sequences. A limitation of this test is that if you have DNA
    sequences but choose a protein dictionary, you will still pass,
    because A, C, G, and T are also valid amino acids.'''

    # Set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype, modeltype='MAT')
    # wtseq = utils.profile_counts(df.copy(),dicttype,return_wtseq=True,start=start,end=end)
    # wt_seq_dict_list = [{inv_dict[np.mod(i+1+seq_dict[w],len(seq_dict))]:i for i in range(len(seq_dict)-1)} for w in wtseq]
    par_seq_dict = {v: k for v, k in seq_dict.items() if k != (len(seq_dict) - 1)}

    # Drop any rows with ct = 0
    df = df[df.loc[:, 'ct'] != 0]
    df.reset_index(drop=True, inplace=True)

    # If there are sequences of different lengths, print an error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
        sys.stderr.write('Lengths of all sequences are not the same!')

    # Select target sequence region
    df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)

    # Make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)

    # Create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True, drop=True)

    # Drop any sequences with incorrect length
    if not end:
        '''If no value for the end of the sequence was supplied, assume
        the first sequence has the correct length'''
        seqL = len(df[seq_col_name][0]) - start
    else:
        seqL = end - start
    df = df[df[seq_col_name].apply(len) == seqL]
    df.reset_index(inplace=True, drop=True)

    # Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df, dicttype, foreground=foreground, background=background,
            pseudocounts=pseudocounts)
    if lm == 'LS':
        '''First check that, if we don't have a penalty for ridge
        regression, we at least have all possible base values so that the
        analysis will not fail'''
        if LS_means_std:  # If user supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            # Change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels

            # Change weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None

        # Drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]
        df.reset_index(inplace=True, drop=True)

        '''For sort-seq experiments, bin_0 is library only and isn't the
        lowest expression, even though it will be calculated as such if we
        proceed. Therefore, if drop_library is passed, drop this column
        from the analysis.'''
        if drop_library:
            try:
                df.drop('ct_0', axis=1, inplace=True)
                col_headers = utils.get_column_headers(df)
                if len(col_headers) < 2:
                    raise SortSeqError(
                        '''After dropping library there are no longer
                        enough columns to run the analysis''')
            except KeyError:
                raise SortSeqError('''drop_library option was passed, but
                    no ct_0 column exists''')

        # Parameterize sequences into 3xL vectors
        raveledmat, batch, sw = utils.genweightandmat(
            df, par_seq_dict, dicttype, means=means, modeltype=modeltype)

        # Use ridge regression to find the matrix
        emat = Compute_Least_Squares(raveledmat, batch, sw, alpha=alpha)
    if lm == 'IM':
        seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)

        # This is also an MCMC routine; do the same as above
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]), len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(len(df['seq'][0]) - 1, len(seq_dict))
        elif initialize == 'LS':
            emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
            emat_0_df = main(df.copy(), lm='LS', modeltype=modeltype,
                             alpha=alpha, start=0, end=None, verbose=verbose)
            emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))

        # pymc doesn't take a sparse matrix
        emat = MaximizeMI_memsaver(
            seq_mat, df.copy(), emat_0, wtrow, db=db, iteration=iteration,
            burnin=burnin, thin=thin, runnum=runnum, verbose=verbose)

    # Now format the energy matrices to get them ready to output
    if (lm == 'IM' or lm == 'memsaver'):
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat))
    elif lm == 'ER':
        '''The emat for this format is currently transposed compared to
        other formats; it is also already a dataframe with columns
        [pos, val_...]'''
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_typical = emat[emat_cols]
        emat_typical = gauge.fix_matrix(np.array(emat_typical))
    else:  # Must be least squares
        emat_typical = utils.emat_typical_parameterization(emat, len(seq_dict))
        if modeltype == 'NBR':
            emat_typical = gauge.fix_neighbor(np.transpose(emat_typical))
        elif modeltype == 'MAT':
            emat_typical = gauge.fix_matrix(np.transpose(emat_typical))

    em = pd.DataFrame(emat_typical)
    em.columns = val_cols

    # Add position column
    if modeltype == 'NBR':
        pos = pd.Series(range(start, start - 1 + len(df[seq_col_name][0])),
                        name='pos')
    else:
        pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                        name='pos')
    output_df = pd.concat([pos, em], axis=1)

    # Validate model and return
    output_df = qc.validate_model(output_df, fix=True)
    return output_df
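# A minimal usage sketch (illustrative, not from the original source):
# learning a matrix model with the enrichment-ratio method (lm='ER') from a
# hypothetical dataset file, then saving the validated model.
if __name__ == '__main__':
    import pandas as pd
    df = pd.read_csv('dataset.txt', delim_whitespace=True)  # hypothetical file
    model_df = main(df, lm='ER', modeltype='MAT',
                    foreground=1, background=0, pseudocounts=1)
    model_df.to_csv('learned_model.txt', sep='\t', index=False)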
def main(model_df, contig_list, numsites=10, verbose=False):

    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    search_string = r"[^%s]" % alphabet
    for contig_str, contig_name, pos_offset in contig_list:
        if re.search(search_string, contig_str):
            raise SortSeqError(
                "Invalid character for seqtype %s found in %s." %
                (seqtype, contig_name))

    # Create model object to evaluate on seqs
    if modeltype == "MAT":
        model_obj = Models.LinearModel(model_df)
    elif modeltype == "NBR":
        model_obj = Models.NeighborModel(model_df)

    # Create list of dataframes, one for each contig
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    sitelist_df = pd.DataFrame(
        columns=["val", seq_col, "left", "right", "ori", "contig"])
    for contig_str, contig_name, pos_offset in contig_list:
        if len(contig_str) < L:
            continue
        this_df = pd.DataFrame(
            columns=["val", seq_col, "left", "right", "ori", "contig"])
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df["left"] = poss + pos_offset
        this_df["right"] = poss + pos_offset + L - 1
        # this_df[seq_col] = [contig_str[i:(i+L)] for i in poss]
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
        this_df["ori"] = "+"
        this_df["contig"] = contig_name
        this_df["val"] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well
        if seqtype == "dna":
            # this_df[seq_col] = [qc.rc(s) for s in this_df[seq_col]]
            this_df[seq_col] = fast.seq2sitelist(contig_str, L, rc=True)  # Cython
            this_df["ori"] = "-"
            this_df["val"] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by="val", ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True, inplace=True)

        # Crop list at numsites
        if sitelist_df.shape[0] > numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            print ".",
            sys.stdout.flush()
    if verbose:
        print ""
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0] == 0:
        raise SortSeqError("No full-length sites found within provided contigs.")

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    return sitelist_df
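# A minimal usage sketch (illustrative, not from the original source):
# contig_list entries are (contig_str, contig_name, pos_offset) tuples, as
# unpacked in the loops above; the contig sequence here is hypothetical.
if __name__ == '__main__':
    import pandas as pd
    model_df = pd.read_csv('model.txt', delim_whitespace=True)  # hypothetical file
    contig_list = [('ACGTTGACAGCTAGCTTGACA' * 40, 'contig_1', 0)]
    sitelist_df = main(model_df, contig_list, numsites=10, verbose=True)
    print sitelist_df.head()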
def main(wtseq=None, mutrate=0.10, numseq=10000, dicttype='dna',
         probarr=None, tags=False, tag_length=10):

    # Generate sequence dictionary
    seq_dict, inv_dict = utils.choose_dict(dicttype)

    mutrate = float(mutrate)
    if (mutrate < 0.0) or (mutrate > 1.0):
        raise SortSeqError('Invalid mutrate==%f' % mutrate)

    numseq = int(numseq)
    if (numseq <= 0):
        raise SortSeqError('numseq must be positive. Is %d' % numseq)

    tag_length = int(tag_length)
    if (tag_length <= 0):
        raise SortSeqError('tag_length must be positive. Is %d' % tag_length)

    if isinstance(probarr, np.ndarray):
        L = probarr.shape[1]
        # Generate bases according to provided probability matrix
        letarr = np.zeros([numseq, L])
        for z in range(L):
            letarr[:, z] = np.random.choice(
                range(len(seq_dict)), numseq, p=probarr[:, z])
    else:
        parr = []
        wtseq = wtseq.upper()
        L = len(wtseq)
        letarr = np.zeros([numseq, L])

        # Check to make sure the wtseq uses the correct bases
        lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype,
                                                       modeltype='MAT')

        def check_sequences(s):
            return set(s).issubset(lin_seq_dict)

        if not check_sequences(wtseq):
            raise SortSeqError(
                'wtseq can only contain bases in ' + str(lin_seq_dict.keys()))

        # Find wtseq array
        wtarr = seq2arr(wtseq, seq_dict)
        mrate = mutrate / (len(seq_dict) - 1)  # Prob of non-wildtype

        # Generate sequences by mutating away from wildtype
        '''Probabilities away from wildtype (0 = stays the same; a 3, for
        example, means a C becomes an A; a 1 means C -> G)'''
        parr = np.array(
            [1 - (len(seq_dict) - 1) * mrate] +
            [mrate for i in range(len(seq_dict) - 1)])

        # Generate random movements from wtseq
        letarr = np.random.choice(
            range(len(seq_dict)), [numseq, len(wtseq)], p=parr)

        # Find sequences
        letarr = np.mod(letarr + wtarr, len(seq_dict))

    # Convert back to letters
    seqs = []
    for i in range(numseq):
        seqs.append(arr2seq(letarr[i, :], inv_dict))

    seq_col = qc.seqtype_to_seqcolname_dict[dicttype]
    seqs_df = pd.DataFrame(seqs, columns=[seq_col])

    # If simulating tags, each generated seq gets a unique tag
    if tags:
        tag_seq_dict, tag_inv_dict = utils.choose_dict('dna')
        tag_alphabet_list = tag_seq_dict.keys()

        # Make sure tag_length is long enough for the number of tags needed
        if len(tag_alphabet_list)**tag_length < 2 * numseq:
            raise SortSeqError(
                'tag_length=%d is too short for num_tags_needed=%d' %
                (tag_length, numseq))

        # Generate a unique tag for each unique sequence
        tag_set = set([])
        while len(tag_set) < numseq:
            num_tags_left = numseq - len(tag_set)
            new_tags = [''.join(choice(tag_alphabet_list, size=tag_length))
                        for i in range(num_tags_left)]
            tag_set = tag_set.union(new_tags)

        df = seqs_df.copy()
        df.loc[:, 'ct'] = 1
        df.loc[:, 'tag'] = list(tag_set)

    # If not simulating tags, list only unique seqs w/ corresponding counts
    else:
        seqs_counts = seqs_df[seq_col].value_counts()
        df = seqs_counts.reset_index()
        df.columns = [seq_col, 'ct']

    # Convert into valid dataset dataframe and return
    return qc.validate_dataset(df, fix=True)
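# A minimal usage sketch (illustrative, not from the original source):
# simulating a small library around a hypothetical wild-type sequence at a
# 10% per-base mutation rate; the result is a validated dataset dataframe
# with sequence and count columns.
if __name__ == '__main__':
    library_df = main(wtseq='TTGACAATTAATCATCGGCTCGTATAATGT',
                      mutrate=0.10, numseq=1000, dicttype='dna')
    print library_df.head()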
def main(data_df, model_df, start=0, end=None, err=False,
         coarse_graining_level=0, rsquared=False, return_freg=False):

    # Determine whether you are working with RNA, DNA, or protein.
    # This also determines the modeltype (MAT, NBR, PAIR).
    dicttype, modeltype = qc.get_model_type(model_df)

    # Get column header for the sequence column
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))

    # Create dictionary that goes from, for example, nucleotide to number
    # and vice versa
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    # Set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    # Determine target sequence length
    if not end:
        seqL = len(data_df[seq_col_name][0]) - start
    else:
        seqL = end - start

    # Cut the sequences based on start and end, then throw out
    # wrong-length sequences
    if (start != 0 or end):
        data_df.loc[:, seq_col_name] = \
            data_df.loc[:, seq_col_name].str.slice(start, end)
    right_length = data_df.loc[:, seq_col_name].apply(len) == seqL
    if not right_length.all():
        sys.stderr.write('Not all sequences are the same length! '
                         'Throwing out incorrect sequences!')
        data_df = data_df.loc[right_length, :]
        data_df = data_df.reset_index(drop=True)

    # Check that the model length matches the dataset length
    if modeltype == 'MAT':
        if seqL != len(model_df.loc[:, 'pos']):
            raise SortSeqError('model length does not match dataset length')
    elif modeltype == 'NBR':
        if seqL != len(model_df.loc[:, 'pos']) + 1:
            raise SortSeqError('model length does not match dataset length')
    elif modeltype == 'PAIR':
        if int(scipy.misc.comb(seqL, 2)) != len(model_df.loc[:, 'pos']):
            raise SortSeqError('model length does not match dataset length')

    # Get column names of the counts columns (excluding total counts 'ct')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)

    # Remove empty rows
    data_df = data_df[data_df.ct != 0]

    # Make a numpy array out of the model dataframe
    model_df_headers = ['val_' + str(inv_dict[i])
                        for i in range(len(seq_dict))]
    value = np.array(model_df[model_df_headers])

    # Now evaluate the model on each sequence.
    # First convert to matrix representation of sequences
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()

    # Evaluate energy of each sequence
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        value, seq_mat, wtrow)

    # Sort based on value
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    # freg is a regularized plot which shows how sequences are distributed
    # in energy space
    if return_freg:
        fig, ax = plt.subplots()
        MI, freg = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)
        plt.imshow(freg, interpolation='nearest', aspect='auto')
        plt.savefig(return_freg)
    else:
        MI = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)

    # If we want to calculate the error, use bootstrapping
    if not err:
        Std = np.NaN
    else:
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(
                int(len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            sub_MI[i], sub_std = main(sub_df, model_df, err=False)
        Std = np.std(sub_MI) / np.sqrt(2)

    # We can return the Linfoot correlation (rsquared) or return MI
    if rsquared:
        return (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
    else:
        return MI, Std
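# A minimal usage sketch (illustrative, not from the original source):
# computing the Linfoot correlation (rsquared=True) with a bootstrapped
# error estimate; the file names are hypothetical.
if __name__ == '__main__':
    import pandas as pd
    data_df = pd.read_csv('dataset.txt', delim_whitespace=True)  # hypothetical file
    model_df = pd.read_csv('model.txt', delim_whitespace=True)   # hypothetical file
    r2, r2_err = main(data_df, model_df, err=True, rsquared=True)
    print 'Linfoot r^2 = %f +/- %f' % (r2, r2_err)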
def main(dataset_df, bin=None, start=0, end=None, bins_df=None,
         pseudocounts=1, return_profile=False):
    """
    Computes the activity of each character at each position relative to
    wild type, by weighting per-bin character counts by per-bin activities.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        bins_df (pd.DataFrame): Optional dataframe with 'bins' and
            'activity' columns; if None, activity is assumed to equal
            the bin number
        pseudocounts (int): Pseudocount added to each bin's counts
        return_profile (bool): If True, return a per-position
            mutation-weighted activity profile

    Returns:
        output_df (pd.DataFrame): A dataframe containing the delta
            activity of each character at each position, or the
            per-position profile if return_profile is True.
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]
    seq_dict, inv_dict = utils.choose_dict(dicttype)

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # For each bin we need to find the character frequency profile, then
    # sum over all bins to get activity.
    # First make sure we have activities for each bin
    if bins_df is None:
        bins = utils.get_column_headers(dataset_df)
        # In this case no activity was specified, so just assume the
        # activity equals the bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    # Initialize dataframe for total counts in all bins
    output_ct_df = pd.DataFrame()
    # Initialize dataframe for running activity calculation
    output_activity_df = pd.DataFrame()

    for i, b in enumerate(bins):
        bin_num = int(b.split('_')[-1])

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num, start=start,
                                    end=end)

        # Create columns for profile_freqs table
        ct_cols = utils.get_column_headers(counts_df)

        # Add pseudocounts
        counts_df[ct_cols] = counts_df[ct_cols] + pseudocounts

        # Add to all previous bin counts
        if i == 0:
            output_ct_df = counts_df[ct_cols]
            output_activity_df = counts_df[ct_cols] * activity[i]
        else:
            output_ct_df = output_ct_df + counts_df[ct_cols]
            output_activity_df = (output_activity_df +
                                  counts_df[ct_cols] * activity[i])

    # Now normalize by each character at each position; this is the
    # activity profile
    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    mut_rate = profile_mut.main(dataset_df, bin=bin)
    freq = profile_freq.main(dataset_df, bin=bin)
    freq_cols = [x for x in freq.columns if 'freq_' in x]

    # Now normalize by the wild-type activity
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)
    wt_activity = np.transpose(wtarr) * np.array(output_activity_df[ct_cols])

    # Sum this to get the total
    wt_activity2 = wt_activity.sum(axis=1)
    delta_activity = output_activity_df.subtract(
        pd.Series(wt_activity2), axis=0)

    if return_profile:
        # First find the mutation rate according to the formula in the
        # SI text
        profile_delta_activity = mut_rate['mut'] * np.sum(
            (1 - np.transpose(wtarr)) *
            np.array(freq[freq_cols]) *
            np.array(delta_activity), axis=1)

        # Format into dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = range(start,
                                 start + len(profile_delta_activity.index))
        output_df['mut_activity'] = profile_delta_activity
        return output_df
    else:
        # Just add pos column and rename counts columns to activity columns
        output_df = pd.DataFrame(delta_activity)
        output_df.insert(0, 'pos',
                         range(start, start + len(delta_activity.index)))

        # Rename columns
        activity_col_dict = {x: 'activity_' + x.split('_')[-1]
                             for x in delta_activity.columns if 'ct_' in x}
        output_df = output_df.rename(columns=activity_col_dict)
        return output_df
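# A minimal usage sketch (illustrative, not from the original source):
# computing a per-position mutation-activity profile, supplying explicit
# per-bin activities through a bins_df with the 'bins' and 'activity'
# columns the code above expects; the dataset file is hypothetical.
if __name__ == '__main__':
    import pandas as pd
    dataset_df = pd.read_csv('dataset.txt', delim_whitespace=True)  # hypothetical file
    bins_df = pd.DataFrame({'bins': ['ct_0', 'ct_1', 'ct_2'],
                            'activity': [0.0, 1.0, 2.0]})
    profile_df = main(dataset_df, bins_df=bins_df, return_profile=True)
    print profile_df.head()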