def main(data_df, model_df, start=0, end=None, err=False,
         coarse_graining_level=0):
    """Estimate the mutual information between model predictions and data.

    The model in ``model_df`` is evaluated on every sequence of ``data_df``;
    the rows are then sorted by that value and passed to
    ``EstimateMutualInfoforMImax.alt4``.

    Args:
        data_df: DataFrame with exactly one sequence column ('seq',
            'seq_rna' or 'seq_pro', chosen from the model's alphabet) and
            the count columns reported by ``utils.get_column_headers``.
            A total-count column 'ct' is computed when it is absent. The
            caller's frame is never modified.
        model_df: model DataFrame with a 'pos' column and one
            'val_<character>' column per character of the alphabet.
        start: first sequence position to keep.
        end: position at which to stop (exclusive); a falsy value keeps
            everything from ``start`` onwards.
        err: when true, also estimate the uncertainty of the result from
            15 random half-size subsamples of the data.
        coarse_graining_level: forwarded unchanged to ``alt4``.

    Returns:
        Tuple ``(MI, Std)``. ``Std`` is ``nan`` unless ``err`` is true.

    Raises:
        SortSeqError: if ``data_df`` does not have exactly one sequence
            column, if no usable rows are present, or if the sequence
            length is inconsistent with the number of model rows.
    """
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if len(seq_cols) != 1:
        raise SortSeqError(
            'Dataframe has multiple seq cols: %s' % str(seq_cols))
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    #set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    #work on a private copy: the trimming and the 'ct' column below must not
    #leak into the caller's dataframe.
    data_df = data_df.copy()
    if len(data_df.index) == 0:
        raise SortSeqError('Dataframe contains no sequences')

    #Cut the sequences based on start and end, and then check if it makes sense
    if start != 0 or end:
        data_df[seq_col_name] = data_df[seq_col_name].str.slice(start, end)

    #positional lookup (.iloc) so a non-default index cannot raise KeyError
    first_len = len(data_df[seq_col_name].iloc[0])
    if modeltype == 'MAT':
        if first_len != len(model_df.loc[:, 'pos']):
            raise SortSeqError('model length does not match dataset length')
    elif modeltype == 'NBR':
        if first_len != len(model_df.loc[:, 'pos']) + 1:
            raise SortSeqError('model length does not match dataset length')

    #get the count columns and build the total-count column if needed
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)

    #remove rows without any counts
    data_df = data_df[data_df['ct'] != 0]
    if len(data_df.index) == 0:
        raise SortSeqError(
            'Dataframe contains no sequences with nonzero counts')

    #expected sequence length. The sequences have already been trimmed, so
    #start must not be subtracted a second time; row 0 may have been
    #filtered out above, hence the positional lookup.
    if end:
        seqL = end - start
    else:
        seqL = len(data_df[seq_col_name].iloc[0])

    #throw out wrong length sequences
    data_df = data_df[data_df[seq_col_name].apply(len) == seqL]

    #make a numpy array out of the model data frame
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    model_matrix = np.array(model_df[model_df_headers])

    #now we evaluate the expression of each sequence according to the model.
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        model_matrix, seq_mat, wtrow)
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    #NOTE: the count columns are handed to the estimator as they are; an
    #earlier per-row division by 'ct' is intentionally not applied.
    MI = EstimateMutualInfoforMImax.alt4(
        temp_sorted, coarse_graining_level=coarse_graining_level)

    if not err:
        Std = np.nan
    else:
        n_subsamples = 15
        half_size = len(data_df.index) // 2
        sub_MI = np.zeros(n_subsamples)
        for i in range(n_subsamples):
            sub_df = data_df.sample(half_size)
            sub_df.reset_index(inplace=True, drop=True)
            #sub_df is already trimmed, so start/end keep their defaults;
            #the coarse graining must match the estimate it is an error
            #bar for.
            sub_MI[i], _ = main(
                sub_df, model_df, err=False,
                coarse_graining_level=coarse_graining_level)
        #presumably the 1/sqrt(2) rescales the spread seen on half-size
        #samples to the full data set -- TODO confirm
        Std = np.std(sub_MI) / np.sqrt(2)
    return MI, Std
def emat(p=pymcdf, value=emat_0):
    """Score the model matrix ``value`` against the data frame ``p``.

    The transpose of ``value`` is evaluated on ``seq_mat``/``wtrow`` (names
    taken from the enclosing scope) and the result is stored in
    ``p['val']``, so ``p`` is modified in place. A copy of ``p`` is then
    passed to ``EstimateMutualInfoforMImax.alt4`` and the estimate is
    scaled by ``n_seqs``.

    Both defaults are bound when the function is defined, to whatever
    ``pymcdf`` and ``emat_0`` refer to at that moment.

    Returns:
        ``n_seqs`` multiplied by the mutual information estimate.
    """
    model_matrix = np.transpose(value)
    p['val'] = numerics.eval_modelmatrix_on_mutarray(
        model_matrix, seq_mat, wtrow)
    # New and improved
    mutual_info = EstimateMutualInfoforMImax.alt4(p.copy())
    return n_seqs * mutual_info
def evaluate_on_mutarray(self, mutarray, wtrow):
    """Evaluate this model on a mutation array.

    Hands the transpose of ``self.matrix`` together with ``mutarray`` and
    ``wtrow`` to ``numerics.eval_modelmatrix_on_mutarray`` and returns
    whatever that function returns.
    """
    transposed_matrix = self.matrix.T
    return numerics.eval_modelmatrix_on_mutarray(
        modelmatrix=transposed_matrix,
        mutarray=mutarray,
        wtrow=wtrow,
    )
def main(data_df, model_df, start=0, end=None, err=False,
         coarse_graining_level=0, rsquared=False, return_freg=False):
    """Estimate the mutual information between model predictions and data.

    The model in ``model_df`` is evaluated on every sequence of ``data_df``;
    the rows are then sorted by that value and passed to
    ``EstimateMutualInfoforMImax.alt4``.

    Args:
        data_df: DataFrame with exactly one sequence column ('seq',
            'seq_rna' or 'seq_pro', chosen from the model's alphabet) and
            count columns. A total-count column 'ct' is computed when it
            is absent. The caller's frame is never modified.
        model_df: model DataFrame with a 'pos' column and one
            'val_<character>' column per character of the alphabet.
        start: first sequence position to keep.
        end: position at which to stop (exclusive); a falsy value keeps
            everything from ``start`` onwards.
        err: when true, also estimate the uncertainty of the result from
            15 random half-size subsamples of the data.
        coarse_graining_level: forwarded unchanged to ``alt4``.
        rsquared: when true, return ``1 - 2**(-2 * x)`` of both results
            instead of the raw values.
        return_freg: falsy to skip plotting. Otherwise it is forwarded to
            ``alt4`` and also used as the ``savefig`` target for an image
            of the ``freg`` array that ``alt4`` returns.

    Returns:
        Tuple ``(MI, Std)``, or their ``rsquared`` transforms. ``Std`` is
        ``nan`` unless ``err`` is true.

    Raises:
        SortSeqError: if ``data_df`` does not have exactly one sequence
            column, if it has no rows, or if the sequence length is
            inconsistent with the number of model rows.
    """
    #determine whether you are working with RNA, DNA, or protein.
    #this also should determine modeltype (MAT, NBR, PAIR).
    dicttype, modeltype = qc.get_model_type(model_df)

    #get column header for the sequence column.
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if len(seq_cols) != 1:
        raise SortSeqError(
            'Dataframe has multiple seq cols: %s' % str(seq_cols))

    #create dictionary that goes from, for example, nucleotide to number and
    #vice versa.
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    #set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    if len(data_df.index) == 0:
        raise SortSeqError('Dataframe contains no sequences')

    #determine sequence length. Positional lookup (.iloc) so a non-default
    #index cannot raise KeyError.
    if not end:
        seqL = len(data_df[seq_col_name].iloc[0]) - start
    else:
        seqL = end - start

    #Cut the sequences based on start and end, and then check if it makes sense
    if start != 0 or end:
        #trim on a copy so the caller's dataframe is left untouched
        data_df = data_df.copy()
        data_df[seq_col_name] = data_df[seq_col_name].str.slice(start, end)

    #throw out wrong length sequences.
    right_length = data_df[seq_col_name].apply(len) == seqL
    if not right_length.all():
        sys.stderr.write(
            'Not all sequences are the same length! \n'
            'Throwing out incorrect sequences!')
    data_df = data_df.loc[right_length, :]
    data_df = data_df.reset_index(drop=True)

    if modeltype == 'MAT':
        if seqL != len(model_df.loc[:, 'pos']):
            raise SortSeqError(
                'model length does not match dataset length')
    elif modeltype == 'NBR':
        if seqL != len(model_df.loc[:, 'pos']) + 1:
            raise SortSeqError(
                'model length does not match dataset length')
    elif modeltype == 'PAIR':
        #number of position pairs, "seqL choose 2", in exact integer
        #arithmetic (scipy.misc.comb is gone from current SciPy releases)
        n_pairs = seqL * (seqL - 1) // 2
        if n_pairs != len(model_df.loc[:, 'pos']):
            raise SortSeqError(
                'model length does not match dataset length')

    #get column names of the counts columns (excluding total counts 'ct')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)

    #remove empty rows.
    data_df = data_df[data_df['ct'] != 0]

    #make a numpy array out of the model data frame
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    value = np.array(model_df[model_df_headers])

    #now we evaluate the expression of each sequence according to the model.
    #first convert to matrix representation of sequences
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()

    #evaluate energy of each sequence
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        value, seq_mat, wtrow)

    #sort based on value
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    #freg is a regularized plot which shows how sequences are distributed
    #in energy space.
    if return_freg:
        MI, freg = EstimateMutualInfoforMImax.alt4(
            temp_sorted, coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)
        #draw on an explicit figure and always close it, so repeated calls
        #do not pile up open figures.
        fig, ax = plt.subplots()
        try:
            ax.imshow(freg, interpolation='nearest', aspect='auto')
            fig.savefig(return_freg)
        finally:
            plt.close(fig)
    else:
        MI = EstimateMutualInfoforMImax.alt4(
            temp_sorted, coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)

    #if we want to calculate error then use random half-size subsamples.
    if not err:
        Std = np.nan
    else:
        n_subsamples = 15
        half_size = len(data_df.index) // 2
        sub_MI = np.zeros(n_subsamples)
        for i in range(n_subsamples):
            sub_df = data_df.sample(half_size)
            sub_df.reset_index(inplace=True, drop=True)
            #sub_df is already trimmed, so start/end keep their defaults.
            #rsquared and return_freg are deliberately not forwarded: the
            #raw MI is needed here and the plot must not be overwritten.
            sub_MI[i], _ = main(
                sub_df, model_df, err=False,
                coarse_graining_level=coarse_graining_level)
        #presumably the 1/sqrt(2) rescales the spread seen on half-size
        #samples to the full data set -- TODO confirm
        Std = np.std(sub_MI) / np.sqrt(2)

    #we can return linfoot correlation (rsquared) or return MI.
    if rsquared:
        return (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
    else:
        return MI, Std