def evaluate(self, seqs):
    """
    Evaluate the matrix model on a collection of sequences.

    Parameters
    ----------
    seqs : list, pd.Series, or pd.DataFrame
        Sequences to score. For a DataFrame, the sequences are taken from
        the first column reported by qc.get_cols_from_df(seqs, 'seqs').

    Returns
    -------
    Whatever self.evaluate_on_seqarray returns for the encoded sequences.

    Raises
    ------
    SortSeqError
        If seqs is not one of the supported containers, if it holds no
        sequences, or if the first sequence's length differs from
        self.length.
    """
    # Check seqs container
    if isinstance(seqs, pd.DataFrame):
        seq_col = qc.get_cols_from_df(seqs, 'seqs')[0]
        seqs_to_use = list(seqs[seq_col])
    elif isinstance(seqs, (list, pd.Series)):
        seqs_to_use = list(seqs)
    else:
        raise SortSeqError('Sequences must be input as a list, pd.Series, or pd.DataFrame')

    # An empty container would otherwise fail below with a bare IndexError
    if not seqs_to_use:
        raise SortSeqError('No sequences were provided')

    # Check length (only the first sequence is inspected here)
    if len(seqs_to_use[0]) != self.length:
        raise SortSeqError(
            'Energy Matrix Length does not equal Sequence Length')

    # fast.seqs2array_for_matmodel expects seqtype to be bytes. Encode into a
    # local so that evaluating does not change the type of self.seqtype.
    seqtype = self.seqtype
    if not isinstance(seqtype, bytes):
        seqtype = str(seqtype).encode('utf-8')

    # Encode each element on its own merits, so a list mixing str and bytes
    # is not corrupted by calling str() on elements that are already bytes.
    seqs_as_bytes = [
        seq if isinstance(seq, bytes) else str(seq).encode('utf-8')
        for seq in seqs_to_use
    ]

    # Compute seqmats
    seqarray = fast.seqs2array_for_matmodel(seqs_as_bytes, seqtype)

    # Compute and return values
    return self.evaluate_on_seqarray(seqarray)
def dataset2mutarray_withwtseq(dataset_df, modeltype, wtseq, chunksize=1000):
    """
    Encode a dataset as a sparse array of features that differ from wild type.

    The sequence column of dataset_df is encoded chunk by chunk with the
    encoder matching modeltype; in every chunk the columns that are set in
    the encoded wild-type sequence are zeroed before the chunk is stored.

    Parameters
    ----------
    dataset_df : pd.DataFrame
        Dataset whose sequence column is located with
        qc.get_cols_from_df(dataset_df, 'seqs').
    modeltype : str
        'MAT' or 'NBR'; selects the fast.seqs2array_for_* encoder.
    wtseq :
        Wild-type sequence, passed to the encoder as a one-element list.
    chunksize : int
        Number of rows encoded per pass. Must be at least 1.

    Returns
    -------
    (mutarray_csr, wtrow)
        mutarray_csr is the CSR matrix of shape (number of rows, number of
        features); wtrow is the flattened boolean encoding of wtseq.

    Raises
    ------
    SortSeqError
        If modeltype is not recognised or chunksize is smaller than 1.
    """
    # Determine the type of model and set seq2array function appropriately
    if modeltype == 'MAT':
        seqs2array = fast.seqs2array_for_matmodel
    elif modeltype == 'NBR':
        seqs2array = fast.seqs2array_for_nbrmodel
    else:
        raise SortSeqError('Unknown model type: %s' % modeltype)

    # A non-positive chunk size can never advance through the rows
    if chunksize < 1:
        raise SortSeqError(
            'chunksize must be a positive integer; got %s' % chunksize)

    # Determine seqtype, etc.
    seqcol = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    seqtype = qc.colname_to_seqtype_dict[seqcol]

    # Compute the wt sequence
    wtrow = seqs2array([wtseq], seq_type=seqtype).ravel().astype(bool)
    numfeatures = len(wtrow)
    numrows = dataset_df.shape[0]

    # Fill in mutarray (a lil matrix) chunk by chunk
    mutarray_lil = lil_matrix((numrows, numfeatures), dtype=int)
    for startrow in range(0, numrows, chunksize):
        stoprow = min(startrow + chunksize, numrows)

        # Compute seqarray
        seqlist = list(dataset_df[seqcol][startrow:stoprow])
        seqarray = seqs2array(seqlist, seq_type=seqtype)

        # Remove wt entries (on a copy, leaving the encoder's output as is)
        tmp = seqarray.copy()
        tmp[:, wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:stoprow, :] = tmp

    # Return the csr matrix as well as binary representation of wt seq
    return mutarray_lil.tocsr(), wtrow
def main(filelist_df, tags_df=None, indir='./', seq_type=None):
    """
    Merges datasets listed in the filelist_df dataframe

    Parameters
    ----------
    filelist_df : pd.DataFrame
        Validated with qc.validate_filelist; each row's 'file' and 'bin'
        entries are read.
    tags_df : pd.DataFrame or None
        Optional tag key. When given, it is validated with
        qc.validate_tagkey and the sequence for each tag in the merged
        dataset is added as a new column.
    indir : str
        Prefix prepended verbatim to every file name (no path separator is
        inserted).
    seq_type :
        Forwarded to io.load_dataset.

    Returns
    -------
    The merged dataframe, after qc.validate_dataset has been called on it.

    Raises
    ------
    SortSeqError
        If tags_df does not have exactly one sequence column, or if any
        looked-up sequence is not a string.
    """
    # Validate filelist
    qc.validate_filelist(filelist_df)

    # Read datasets into dictionary indexed by bin number
    dataset_df_dict = {}
    for _, row in filelist_df.iterrows():
        fn = indir + row['file']
        b = row['bin']

        # Autodetect fasta, fastq, or text file based on file extension
        if re.search(fasta_filename_patterns, fn):
            file_type = 'fasta'
        elif re.search(fastq_filename_patterns, fn):
            file_type = 'fastq'
        else:
            file_type = 'text'
        dataset_df_dict[b] = io.load_dataset(fn, file_type=file_type,
                                             seq_type=seq_type)

    # Merge datasets into one
    out_df = merge_datasets(dataset_df_dict)

    # Add seqs if given tags_df
    if tags_df is not None:
        qc.validate_tagkey(tags_df)
        tag_col = 'tag'

        # Test to make sure all tags in dataset are a subset of tags
        data_tags = set(out_df[tag_col])
        all_tags = set(tags_df[tag_col])
        if not (data_tags <= all_tags):
            sys.stderr.write('Some tags probably could not be identified.')

        # Get name of seq column
        seq_cols = qc.get_cols_from_df(tags_df, 'seqs')
        if len(seq_cols) != 1:
            raise SortSeqError('Multiple seq columns; exaclty 1 required.')
        seq_col = seq_cols[0]

        # Set tag to be index column of dataframe
        tags_df = tags_df.set_index(tag_col)

        # Look up the seq for each tag once; the same values are both
        # checked and stored.
        tags = out_df[tag_col]
        seqs = tags_df[seq_col][tags].values
        if not all(isinstance(x, str) for x in seqs):
            raise SortSeqError('Some looked-up seqs are not strings.')
        out_df[seq_col] = seqs

    qc.validate_dataset(out_df)
    return out_df
def dataset2seqarray(dataset_df, modeltype):
    """
    Encode every sequence in dataset_df with the encoder for modeltype.

    The sequence column is the first one reported by
    qc.get_cols_from_df(dataset_df, 'seqs'); its sequence type is looked up
    in qc.colname_to_seqtype_dict and passed to the encoder as seq_type.

    Raises SortSeqError when modeltype is neither 'MAT' nor 'NBR'.
    """
    # Reject unknown model types up front
    if not (modeltype == 'MAT' or modeltype == 'NBR'):
        raise SortSeqError('Unknown model type: %s' % modeltype)

    # Pick the encoder that matches the model type
    if modeltype == 'MAT':
        encoder = fast.seqs2array_for_matmodel
    else:
        encoder = fast.seqs2array_for_nbrmodel

    column = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    column_seqtype = qc.colname_to_seqtype_dict[column]
    sequences = list(dataset_df[column])
    return encoder(sequences, seq_type=column_seqtype)
def __init__(self, model_df):
    """
    Build the model from a model dataframe.

    A copy of model_df is passed through qc.validate_model(..., fix=True);
    the validated frame must describe a 'MAT' model, otherwise SortSeqError
    is raised. The sequence type, the two dictionaries returned by
    utils.choose_dict, the validated frame, its row count and the
    transposed array of its value columns are stored on the instance.
    """
    validated_df = qc.validate_model(model_df.copy(), fix=True)
    seqtype, modeltype = qc.get_model_type(validated_df)
    if modeltype != 'MAT':
        raise SortSeqError('Invalid modeltype: %s' % modeltype)

    forward_dict, reverse_dict = utils.choose_dict(seqtype,
                                                   modeltype=modeltype)
    self.seqtype = seqtype
    self.seq_dict = forward_dict
    self.inv_dict = reverse_dict
    self.df = validated_df
    self.length = validated_df.shape[0]

    # Matrix part of the model dataframe, stored transposed relative to
    # the frame's (row, value-column) layout
    value_columns = qc.get_cols_from_df(validated_df, 'vals')
    self.matrix = np.array(validated_df[value_columns]).T
def evaluate(self, seqs):
    """
    Evaluate the neighbor model on a collection of sequences.

    Parameters
    ----------
    seqs : list, pd.Series, or pd.DataFrame
        Sequences to score. For a DataFrame, the sequences are taken from
        the first column reported by qc.get_cols_from_df(seqs, 'seqs').

    Returns
    -------
    Whatever self.evaluate_on_seqarray returns for the encoded sequences.

    Raises
    ------
    SortSeqError
        If seqs is not one of the supported containers, if it holds no
        sequences, or if the first sequence's length differs from
        self.length.
    """
    # Check seqs container
    if isinstance(seqs, pd.DataFrame):
        seq_col = qc.get_cols_from_df(seqs, 'seqs')[0]
        seqs_to_use = list(seqs[seq_col])
    elif isinstance(seqs, (list, pd.Series)):
        seqs_to_use = list(seqs)
    else:
        raise SortSeqError('Sequences must be input as a list, pd.Series, or pd.DataFrame')

    # An empty container would otherwise fail below with a bare IndexError
    if not seqs_to_use:
        raise SortSeqError('No sequences were provided')

    # Check length (only the first sequence is inspected here)
    if len(seqs_to_use[0]) != self.length:
        raise SortSeqError(
            'Energy Matrix Length does not equal Sequence Length')

    # python 3 fast.c update for string to bytes conversion: encode into a
    # local so that evaluating does not change the type of self.seqtype.
    seqtype = self.seqtype
    if not isinstance(seqtype, bytes):
        seqtype = str(seqtype).encode('utf-8')

    # Encode each element on its own merits, so a list mixing str and bytes
    # is not corrupted by calling str() on elements that are already bytes.
    seqs_as_bytes = [
        seq if isinstance(seq, bytes) else str(seq).encode('utf-8')
        for seq in seqs_to_use
    ]

    # Compute seqmats
    seqarray = fast.seqs2array_for_nbrmodel(seqs_as_bytes, seqtype)

    # Compute and return values
    return self.evaluate_on_seqarray(seqarray)
def __init__(self, data_df, model_df, start=0, end=None, err=False,
             coarse_graining_level=0, rsquared=False, return_freg=False):
    """
    Estimate the mutual information between model predictions and bin counts.

    The model in model_df is evaluated on every sequence of data_df, the
    rows are sorted by predicted value, and the sorted frame is handed to
    EstimateMutualInfoforMImax.alt4. Results are stored in self.out_MI and
    self.out_std.

    Parameters
    ----------
    data_df : pd.DataFrame
        Dataset with exactly one sequence column. Note that it is modified
        in place: sequences are overwritten with their [start:end] slice
        when slicing is requested, and a 'ct' column is added if missing.
    model_df : pd.DataFrame
        Model dataframe; its type is read with qc.get_model_type.
    start, end : int, optional
        Slice applied to every sequence before evaluation.
    err : bool
        When True, out_std is estimated from 15 random half-size
        sub-samples; otherwise it is NaN.
    coarse_graining_level :
        Forwarded to EstimateMutualInfoforMImax.alt4.
    rsquared : bool
        When True, both outputs are transformed as 1 - 2**(-2 * x).
    return_freg : bool
        Forwarded to alt4; when True alt4 is expected to return a pair.

    Raises
    ------
    SortSeqError
        If data_df does not have exactly one sequence column, or if the
        sliced sequence length does not match the model length.
    """
    self.data_df = data_df
    self.model_df = model_df
    self.start = start
    self.end = end
    self.err = err
    self.coarse_graining_level = coarse_graining_level
    self.out_MI = None
    self.out_std = None
    self._input_checks()

    dicttype, modeltype = qc.get_model_type(self.model_df)
    seq_cols = qc.get_cols_from_df(self.data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    # set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    # Cut the sequences based on start and end, and then check if it makes
    # sense
    if (self.start != 0 or self.end):
        self.data_df.loc[:, seq_col_name] = \
            self.data_df.loc[:, seq_col_name].str.slice(self.start, self.end)
        if modeltype == 'MAT':
            if len(self.data_df.loc[0, seq_col_name]) != len(
                    self.model_df.loc[:, 'pos']):
                print('predictive info class: BP lengths: ',
                      len(self.data_df.loc[0, seq_col_name]), " ",
                      len(self.model_df.loc[:, 'pos']))
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'NBR':
            # a neighbor model has one fewer position than the sequence
            if len(self.data_df.loc[0, seq_col_name]) != len(
                    self.model_df.loc[:, 'pos']) + 1:
                raise SortSeqError(
                    'model length does not match dataset length')

    col_headers = utils.get_column_headers(self.data_df)
    if 'ct' not in self.data_df.columns:
        self.data_df['ct'] = data_df[col_headers].sum(axis=1)
    self.data_df = self.data_df[self.data_df.ct != 0]

    # Keep only sequences of the expected length. The sequences have
    # already been sliced above, so when no end is given the expected
    # length is simply that of the first remaining sequence; start must not
    # be subtracted a second time. iloc is used because the row labelled 0
    # may have been removed by the ct filter.
    if not self.end:
        seqL = len(self.data_df[seq_col_name].iloc[0])
    else:
        seqL = self.end - self.start
    self.data_df = self.data_df[
        self.data_df[seq_col_name].apply(len) == (seqL)]

    # column names of the model values, ordered by character index
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))
    ]

    # now we evaluate the expression of each sequence according to the
    # model.
    seq_mat, wtrow = numerics.dataset2mutarray(self.data_df.copy(),
                                               modeltype)
    temp_df = self.data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        np.array(self.model_df[model_df_headers]), seq_mat, wtrow)
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    if return_freg:
        MI, freg = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)
    else:
        MI = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)

    if not self.err:
        # np.nan is the spelling available in every NumPy release
        Std = np.nan
    else:
        data_df_for_sub = self.data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(
                int(len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            # The constructor stores its result on the instance instead of
            # returning a tuple, so read out_MI from the new object.
            # NOTE(review): assumes PredictiveInfo is the class this
            # constructor belongs to - confirm.
            sub_MI[i] = PredictiveInfo(sub_df, model_df, err=False).out_MI
        Std = np.std(sub_MI) / np.sqrt(2)

    if rsquared:
        self.out_MI, self.out_std = (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
    else:
        self.out_MI, self.out_std = MI, Std
def __init__(self, df, lm='ER', modeltype='MAT', LS_means_std=None, db=None,
             iteration=30000, burnin=1000, thin=10, runnum=0,
             initialize='LS', start=0, end=None, foreground=1, background=0,
             alpha=0.0, pseudocounts=1, drop_library=False, verbose=False,
             tm=None):
    """
    Learn a model from a dataset and store it in self.output_df.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset with exactly one sequence column and a 'ct' column.
    lm : str
        Learning method. 'ER', 'PR', 'LS' and 'IM' each trigger a fitting
        step below; any value other than 'IM', 'memsaver', 'ER', 'MK' and
        'PR' is formatted as a least-squares result.
    modeltype : str
        'MAT' or 'NBR'.
    LS_means_std :
        Optional argument for io.load_meanstd (used by lm='LS').
    db, iteration, burnin, thin, runnum :
        Forwarded to self.MaximizeMI_memsaver (lm='IM').
    initialize : str
        'rand', 'LS' or 'PR'; how the starting matrix is built for lm='IM'.
    start, end : int, optional
        Slice applied to every sequence before learning.
    foreground, background, pseudocounts :
        Forwarded to self.Markov / self.Berg_von_Hippel (lm='ER').
    alpha :
        Forwarded to self.Compute_Least_Squares.
    drop_library : bool
        When True (lm='LS'), the 'ct_0' column is removed before fitting.
    verbose : bool
        Forwarded to the lm='IM' routines.
    tm :
        Forwarded to self.convex_opt (lm='PR').

    Raises
    ------
    SortSeqError
        If df does not have exactly one sequence column, if drop_library
        is set but no 'ct_0' column exists, or if too few count columns
        remain after dropping it.
    """
    # set attributes
    self.df = df
    self.lm = lm
    self.modeltype = modeltype
    self.LS_means_std = LS_means_std
    self.db = db
    self.iteration = iteration
    self.burnin = burnin
    self.thin = thin
    self.runnum = runnum
    self.initialize = initialize
    self.start = start
    self.end = end
    self.foreground = foreground
    self.background = background
    self.alpha = alpha
    self.pseudocounts = pseudocounts
    self.drop_library = drop_library
    self.verbose = verbose
    self.tm = tm

    # output df
    self.output_df = None

    # validate parameters
    self._input_checks()

    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    # Check to make sure the chosen dictionary type correctly describes the
    # sequences. An issue with this test is that if you have DNA sequence
    # but choose a protein dictionary, you will still pass this test bc
    # A,C,G,T are also valid amino acids.

    # set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    # seq_dict without the entry whose value is the last index
    par_seq_dict = {
        v: k
        for v, k in seq_dict.items() if k != (len(seq_dict) - 1)
    }

    # drop any rows with ct = 0
    df = df[df.loc[:, 'ct'] != 0]
    df.reset_index(drop=True, inplace=True)

    # If there are sequences of different lengths, then print error but
    # continue
    if len(set(df[seq_col_name].apply(len))) > 1:
        sys.stderr.write('Lengths of all sequences are not the same!')

    # select target sequence region
    df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)

    # make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)

    # create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True, drop=True)

    # Drop any sequences with incorrect length
    if not end:
        # If no value for end of sequence was supplied, assume the first
        # seq is the correct length. The sequences were already sliced
        # above, so start must not be subtracted a second time.
        seqL = len(df[seq_col_name][0])
    else:
        seqL = end - start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True, drop=True)

    # Do something different for each type of learning method (lm)
    if lm == 'ER':
        if modeltype == 'NBR':
            emat = self.Markov(df,
                               dicttype,
                               foreground=foreground,
                               background=background,
                               pseudocounts=pseudocounts)
        else:
            emat = self.Berg_von_Hippel(df,
                                        dicttype,
                                        foreground=foreground,
                                        background=background,
                                        pseudocounts=pseudocounts)

    if lm == 'PR':
        emat = self.convex_opt(df, seq_dict, inv_dict, col_headers, tm=tm,
                               dicttype=dicttype, modeltype=modeltype)

    if lm == 'LS':
        # First check that if we don't have a penalty for ridge regression,
        # that we at least have all possible base values so that the
        # analysis will not fail
        if LS_means_std:
            # If user supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            # change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(self.add_label))
            std = means_std_df['std']
            std.index = labels

            # Change weighting of each sequence by dividing counts by bin
            # std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None

        # drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]
        df.reset_index(inplace=True, drop=True)

        # For sort-seq experiments, bin_0 is library only and isn't the
        # lowest expression even though it will be calculated as such if we
        # proceed. Therefore if drop_library is passed, drop this column
        # from analysis.
        if drop_library:
            if 'ct_0' not in df.columns:
                raise SortSeqError(
                    '''drop_library option was passed, but no ct_0 column exists''')
            # axis=1 removes the column; the default axis would look for a
            # row labelled 'ct_0'.
            df = df.drop('ct_0', axis=1)
            col_headers = utils.get_column_headers(df)
            if len(col_headers) < 2:
                raise SortSeqError(
                    '''After dropping library there are no longer enough columns to run the analysis''')

        # parameterize sequences into 3xL vectors
        print('init learn model: \n')
        print(par_seq_dict)
        print('dict: ', dicttype)
        raveledmat, batch, sw = utils.genweightandmat(df,
                                                      par_seq_dict,
                                                      dicttype,
                                                      means=means,
                                                      modeltype=modeltype)

        # Use ridge regression to find matrix.
        emat = self.Compute_Least_Squares(raveledmat, batch, sw, alpha=alpha)

    if lm == 'IM':
        seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)

        # this is also an MCMC routine, do the same as above.
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]),
                                        len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(
                    len(df[seq_col_name][0]) - 1, len(seq_dict))
        elif initialize == 'LS':
            # df is already sliced, hence start=0 and end=None
            emat_0_df = LearnModel(df.copy(),
                                   lm='LS',
                                   modeltype=modeltype,
                                   alpha=alpha,
                                   start=0,
                                   end=None,
                                   verbose=verbose).output_df
            emat_0 = np.transpose(np.array(emat_0_df[val_cols]))
            # pymc doesn't take sparse mat
        elif initialize == 'PR':
            emat_0_df = LearnModel(df.copy(),
                                   lm='PR',
                                   modeltype=modeltype,
                                   start=0,
                                   end=None).output_df
            emat_0 = np.transpose(np.array(emat_0_df[val_cols]))

        emat = self.MaximizeMI_memsaver(seq_mat,
                                        df.copy(),
                                        emat_0,
                                        wtrow,
                                        db=db,
                                        iteration=iteration,
                                        burnin=burnin,
                                        thin=thin,
                                        runnum=runnum,
                                        verbose=verbose)

    # We have inferred our matrix.
    # now format the energy matrices to get them ready to output
    if (lm == 'IM' or lm == 'memsaver'):
        if modeltype == 'NBR':
            try:
                emat_typical = gauge.fix_neighbor(np.transpose(emat))
            except Exception:
                # best effort: fall back to the unfixed matrix
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat)
        elif modeltype == 'MAT':
            try:
                emat_typical = gauge.fix_matrix(np.transpose(emat))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat)

    elif lm == 'ER':
        # the emat for this format is currently transposed compared to
        # other formats; it is also already a data frame with columns
        # [pos,val_...]
        emat_typical = emat[val_cols]
        if modeltype != 'NBR':
            try:
                emat_typical = gauge.fix_matrix(np.array(emat_typical))
            except Exception:
                # keep the un-fixed value columns
                sys.stderr.write('Gauge Fixing Failed')

    elif (lm == 'MK'):
        # The model is a first order markov model and its gauge does not
        # need to be changed.
        # NOTE(review): emat_typical is not assigned in this branch.
        pass

    elif lm == 'PR':
        emat_typical = np.transpose(emat)

    else:  # must be Least squares
        emat_typical = utils.emat_typical_parameterization(
            emat, len(seq_dict))
        if modeltype == 'NBR':
            try:
                emat_typical = gauge.fix_neighbor(
                    np.transpose(emat_typical))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat_typical)
        elif modeltype == 'MAT':
            try:
                emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat_typical)

    em = pd.DataFrame(emat_typical)
    em.columns = val_cols

    # add position column (a neighbor model has one fewer position)
    if modeltype == 'NBR':
        pos = pd.Series(range(start, start - 1 + len(df[seq_col_name][0])),
                        name='pos')
    else:
        pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                        name='pos')
    output_df = pd.concat([pos, em], axis=1)

    # Validate model and store it
    output_df = qc.validate_model(output_df, fix=True)
    self.output_df = output_df