def update(self, min_yield):
    '''
    Accept every trial sequence whose yield beats the current threshold.

    Rows of self.test_seq whose self.yield2optimize value is strictly greater
    than min_yield overwrite the matching rows of self.df (both the 'Ordinal'
    sequence and the yield column). Afterwards self.test_seq is reset to a
    copy of the 'Ordinal' column of the updated self.df.

    :param min_yield: current threshold
    :return: fraction of trial sequences that were accepted
    '''
    # work on numpy arrays so the acceptance step needs no python loop
    trial_seqs = sm.convert2numpy(self.test_seq)
    kept_seqs = sm.convert2numpy(self.df)
    trial_yield = sm.convert2numpy(self.test_seq, self.yield2optimize)
    kept_yield = sm.convert2numpy(self.df, self.yield2optimize)

    # strict comparison: a trial that merely ties the threshold is rejected
    accepted = trial_yield > min_yield
    kept_seqs[accepted, :] = np.copy(trial_seqs[accepted, :])
    kept_yield[accepted] = np.copy(trial_yield[accepted])

    # NOTE: instance state is modified in place from here on
    self.df['Ordinal'] = sm.convert2pandas(kept_seqs)
    self.df[self.yield2optimize] = kept_yield
    self.test_seq = self.df.copy()[['Ordinal']]

    nb_accepted = np.count_nonzero(accepted)
    return nb_accepted / accepted.shape[0]
def violin_loop_plots(c, loops_2_show=None, nb_strings=None):
    '''
    Make violin plots of the yield distribution for saved loops.

    When loops_2_show is not supplied, the completed loops are read from the
    run's pickle, thinned to roughly nb_strings evenly spaced entries, and the
    highest completed loop is appended if the thinning dropped it.

    :param c: inputs() object
    :param loops_2_show: ndarray of loops that were saved in run specified by 'c'
    :param nb_strings: number of strings to have in the violin plot [default: 6]
    :return: saves the violin plot
    '''
    print('making violin plots')
    if loops_2_show is None:
        # loops_done only exists in this branch, so all the thinning logic
        # that depends on it lives here as well
        loops_done = dm.read_pickle(c=c, file_description=fn.loops_done_fn)
        loops_2_show = sm.convert2numpy(df=loops_done,
                                        field=fn.loops_done_fn) + 1
        if nb_strings is None:
            nb_strings = 6
        # stride that leaves about nb_strings loops; a zero stride becomes 1
        stride = (loops_2_show.shape[0] // nb_strings) or 1
        loops_2_show = loops_2_show[::stride]
        completed = sm.convert2numpy(df=loops_done,
                                     field=fn.loops_done_fn) + 1
        last_loop = np.max(completed)
        if last_loop not in loops_2_show:
            loops_2_show = np.hstack((loops_2_show, np.array([last_loop])))
    pm.violin_saved_dataset(c=c, loops_2_show=loops_2_show)
def update(self, min_yield):
    '''
    Accept every trial sequence whose 'Developability' beats the threshold.

    Rows of self.test_seq whose 'Developability' is strictly greater than
    min_yield replace the matching rows taken from self.original_seq; the
    merged arrays are then handed to self.save_testseq_2_original_seq.

    :param min_yield: current threshold
    :return: fraction of trial sequences that were accepted
    '''
    print('updating the sequences based on last minimum yield')
    print('current minimum yield is %0.2f' % min_yield)

    # work on numpy arrays so the acceptance step needs no python loop
    trial_seqs = sm.convert2numpy(self.test_seq)
    kept_seqs = sm.convert2numpy(self.original_seq)
    trial_dev = sm.convert2numpy(self.test_seq, 'Developability')
    kept_dev = sm.convert2numpy(self.original_seq, 'Developability')

    # strict comparison: a trial that merely ties the threshold is rejected
    accepted = trial_dev > min_yield
    kept_seqs[accepted, :] = np.copy(trial_seqs[accepted, :])
    kept_dev[accepted] = np.copy(trial_dev[accepted])

    # NOTE: the helper below changes instance state
    self.save_testseq_2_original_seq(kept_dev, kept_seqs)

    nb_accepted = np.count_nonzero(accepted)
    return nb_accepted / accepted.shape[0]
def walk(self, min_yield, nb_mutations):
    '''
    Run self.nb_steps mutate / evaluate / accept steps.

    After the walk, self.idx is set to the position of the lowest value in
    the self.yield2optimize column of self.df.

    :param min_yield: threshold handed to self.update at every step
    :param nb_mutations: forwarded to self.multiple_mutate
    :return: (lowest value of the self.yield2optimize column after the walk,
              mean of the values returned by self.update over all steps)
    '''
    step_results = []
    for _ in np.arange(self.nb_steps):
        # mutate first, then score, then keep what passes the threshold
        self.multiple_mutate(nb_mutations=nb_mutations)
        self.get_yield()
        step_results.append(self.update(min_yield))

    lowest = np.min(
        sm.convert2numpy(df=self.df, field=self.yield2optimize))
    self.idx = np.argmin(
        sm.convert2numpy(df=self.df, field=self.yield2optimize))
    return lowest, np.mean(step_results)
def mutate(self, random_AA_pos=None):
    '''
    Apply one point mutation to every sequence in self.test_seq['Ordinal'].

    :param random_AA_pos: ndarray [Number of sequences x 1] specifies which
        position to change for each sequence; drawn from self.g when omitted
    :return: None; self.test_seq['Ordinal'] is overwritten with the mutated
        sequences
    '''
    nb_seq = self.nb_of_sequences

    if random_AA_pos is None:
        # position to mutate, drawn from [0, 16)
        random_AA_pos = self.g.uniform(shape=[nb_seq],
                                       minval=0,
                                       maxval=16,
                                       dtype=tf.int64).numpy()

    # replacement value, drawn from [0, 21); note that the same generator is
    # used for positions and replacements
    random_AA = self.g.uniform(shape=[nb_seq],
                               minval=0,
                               maxval=21,
                               dtype=tf.int64).numpy()

    sequences = sm.convert2numpy(df=self.test_seq, field='Ordinal')
    with dm.suppress_stdout():
        # blank handling is delegated to sm.remove_blanks, which returns the
        # replacement values actually used
        random_AA = sm.remove_blanks(generator=self.g,
                                     random_AA_pos=random_AA_pos,
                                     random_AA=random_AA,
                                     seq=sequences)

    # one assignment per sequence; each row is modified in place
    mutated = []
    for row, new_AA, pos in zip(sequences, random_AA, random_AA_pos):
        row[pos] = new_AA
        mutated.append(row)
    self.test_seq['Ordinal'] = mutated
def compare(C, field2Show):
    '''
    Compare one field across runs on a single plot.

    All runs must share the same Nb_sequences and nb_loops. The figure is
    saved under ./sampling_data/comparisons/, which is created if missing.

    :param C: list of runs
    :param field2Show: what thing to show ?
    :return: None
    :raises SystemError: if any run differs from C[0] in Nb_sequences or
        nb_loops
    '''
    reference = C[0]

    # validate every run before drawing anything, so a mismatch cannot leave
    # a half-drawn figure behind on pyplot's current axes
    for c in C:
        if (reference.Nb_sequences != c.Nb_sequences
                or reference.nb_loops != c.nb_loops):
            raise SystemError('Your sequences and loops are not the same for comparing models.')

    for c in C:
        # path is passed positionally: the keyword name of read_pickle's
        # first parameter differs between pandas versions
        df = pd.read_pickle(dm.make_file_name(c=c, file_description=field2Show))
        stat = sm.convert2numpy(df=df, field=field2Show)
        if c.mutation_type == 'dynamic':
            nm = ''
        else:
            nm = ' ,# mutations: %i' % c.nb_mutations
        plt.plot(np.arange(stat.shape[0]).tolist(), stat,
                 label=c.mutation_type + ' ' + nm)

    plt.title('%s vs nested sample loop :%i' % (field2Show, reference.Nb_sequences))
    plt.ylabel(field2Show)
    plt.xlabel('loop number')
    plt.legend()

    # portable, idempotent replacement for shelling out to `mkdir`
    os.makedirs('./sampling_data/comparisons', exist_ok=True)
    out_path = './sampling_data/comparisons/%s_nb_loops_%i_nb_sequences_%i' % (
        field2Show, reference.nb_loops, reference.Nb_sequences)
    print('saving ' + out_path)
    plt.savefig(out_path)
    plt.close()
def comparisons(C, field2Show):
    '''
    Plot one field for several runs on a single figure and save it.

    The figure is written to the file name built by dm.make_file_name for
    the first run in C, with description '<field2Show>_plot'.

    :param C: list of runs
    :param field2Show: what thing to show ?
    :return: None
    '''
    for c in C:
        # path is passed positionally: the keyword name of read_pickle's
        # first parameter differs between pandas versions
        df = pd.read_pickle(dm.make_file_name(c=c, file_description=field2Show))
        stat = sm.convert2numpy(df=df, field=field2Show)
        if c.mutation_type == 'dynamic':
            nm = ''
        else:
            nm = ' ,# mutations: %i' % c.nb_mutations
        plt.plot(
            np.arange(stat.shape[0]).tolist(),
            stat,
            label=c.mutation_type + ' ' + nm,
        )
    plt.title('%s vs nested sample loop' % field2Show)
    plt.ylabel(field2Show)
    plt.xlabel('loop number')
    plt.legend()
    # the description used to be the literal '%_plot' (format never applied),
    # so every field was saved to the same file
    plt.savefig(
        dm.make_file_name(c=C[0],
                          file_description='%s_plot' % field2Show,
                          fileformat='png'))
    plt.close()
def change_lowest_yield_sequence_configuration(self):
    '''
    Overwrite the sequence at self.idx with a copy of another sequence.

    A donor row different from self.idx is drawn from self.g; both the
    'Ordinal' entry and the self.yield2optimize value of row self.idx in
    self.df are replaced by the donor's.

    :return: None; updates self.df['Ordinal'] and self.df[self.yield2optimize]
    :raises ValueError: if fewer than two sequences exist, because no donor
        different from self.idx can then be drawn
    '''
    # without this guard the rejection loop below would never terminate
    if self.nb_of_sequences < 2:
        raise ValueError(
            'need at least two sequences to replace the lowest-yield one')

    # rejection-sample a donor index in [0, nb_of_sequences) until it
    # differs from the row being replaced
    change_2_seq = self.idx
    while change_2_seq == self.idx:
        change_2_seq = self.g.uniform(shape=[1],
                                      minval=0,
                                      maxval=self.nb_of_sequences,
                                      dtype=tf.int64).numpy()[0]

    orginal_array = sm.convert2numpy(df=self.df)
    orginal_array[self.idx, :] = orginal_array[change_2_seq, :].copy()
    # TODO : optimize in pandas to change one sequence without changing everything
    self.df['Ordinal'] = sm.convert2pandas(orginal_array)

    # keep the yield column consistent with the copied sequence
    dev = sm.convert2numpy(df=self.df, field=self.yield2optimize)
    dev[self.idx] = dev[change_2_seq]
    self.df[self.yield2optimize] = sm.convert2pandas(dev)
def twinAxisvsLoops(c, fields2show):
    '''
    Plot two saved fields against the loop number on twin y axes.

    fields2show[0] is drawn in blue on the right-hand axis, fields2show[1]
    in red on the left-hand axis.

    :param c: inputs() object
    :param fields2show: list whose first two entries name the fields to show
    :return: None; saves the twin axis plot as a png
    '''
    right_field = fields2show[0]
    right_series = sm.convert2numpy(
        df=dm.read_pickle(c=c, file_description=right_field),
        field=right_field)
    left_field = fields2show[1]
    left_series = sm.convert2numpy(
        df=dm.read_pickle(c=c, file_description=left_field),
        field=left_field)

    # x axis follows the length of the first field
    loop_range = np.arange(right_series.shape[0]).tolist()

    fig, ax_left = plt.subplots(1, 1, figsize=[5, 3], dpi=300)

    left_color = 'tab:red'
    ax_left.set_xlabel('loop nb')
    ax_left.set_ylabel(left_field, color=left_color)
    # last sample of the second field is dropped; presumably that series is
    # one entry longer than the first -- TODO confirm against the writer
    ax_left.plot(loop_range, left_series[0:-1], color=left_color)
    ax_left.tick_params(axis='y', labelcolor=left_color)

    ax_right = ax_left.twinx()
    right_color = 'tab:blue'
    # the x label was already set on the left axis
    ax_right.set_ylabel(right_field, color=right_color)
    ax_right.plot(loop_range, right_series, color=right_color)
    ax_right.tick_params(axis='y', labelcolor=right_color)

    # otherwise the right y-label is slightly clipped
    fig.tight_layout()
    out_name = dm.make_file_name(
        c=c,
        file_description='%s_vs_%s_plot' % (fields2show[0], fields2show[1]),
        fileformat='png')
    plt.savefig(out_name)
    plt.close(fig)
def make_heat_map(df, c, loop_nb):
    '''
    Build a position-by-value frequency table and hand it to heat_map_plot.

    For each of the 16 positions the share of sequences holding each of the
    21 ordinal values is computed from the 'Ordinal' field.

    :param df: dataframe with ordinal field
    :param c: inputs() object
    :param loop_nb: the loop number in the run
    :return: None; plotting and saving is done by heat_map_plot
    '''
    ordinals = sm.convert2numpy(df=df, field='Ordinal')
    nb_AA = 21
    nb_positions = 16

    # counts[p, a] = number of sequences with value a at position p
    counts = np.zeros((nb_positions, nb_AA))
    for aa in np.arange(nb_AA):
        counts[:, aa] = (ordinals == aa).sum(axis=0)

    # normalise each position (row) by its total count
    frequency = counts / counts.sum(axis=1, keepdims=True)
    heat_map_plot(frequency=frequency, c=c, loop_nb=loop_nb)
def showFieldvsLoops(c, field2Show):
    '''
    Plot one saved field against the loop number and save the figure.

    :param c: inputs() object
    :param field2Show: what thing to show ?
    :return: None; the plot is saved as '<field2Show>_plot' png through
        dm.make_file_name
    '''
    # path is passed positionally: the keyword name of read_pickle's first
    # parameter differs between pandas versions
    df = pd.read_pickle(dm.make_file_name(c=c, file_description=field2Show))
    stat = sm.convert2numpy(df=df, field=field2Show)

    # only non-dynamic runs carry a fixed mutation count worth showing
    if c.mutation_type == 'dynamic':
        nm = ''
    else:
        nm = ' ,# mutations: %i' % c.nb_mutations

    plt.plot(np.arange(stat.shape[0]).tolist(), stat,
             label=c.mutation_type + ' ' + nm)
    plt.title('%s vs nested sample loop' % field2Show)
    plt.ylabel(field2Show)
    plt.xlabel('loop number')
    plt.legend()
    plt.savefig(dm.make_file_name(c=c,
                                  file_description='%s_plot' % field2Show,
                                  fileformat='png'))
    plt.close()
def init_yield(self):
    '''
    Evaluate the yield of self.df and report the lowest value.

    self.df is replaced by a copy of what self.get_yield returns for it.

    :return: minimum of the self.yield2optimize column of the updated self.df
    '''
    evaluated = self.get_yield(self.df)
    self.df = evaluated.copy()
    yields = sm.convert2numpy(df=self.df, field=self.yield2optimize)
    return np.min(yields)
def cys_stuff(c):
    '''
    Plot, for every saved loop of a run, how many Cys each sequence carries
    and at which positions.

    For each 'sequences_loop*.pkl' file in the run directory a two-panel
    figure is saved: a histogram of the number of Cys per sequence, and a
    table ('LeaderBoard') listing every observed combination of Cys
    positions with its share of the sequences, sorted by share.

    Note this function can be tweaked in order to observe any of the amino
    acids: it only needs a parameter replacing the hard-coded c_pos.

    :param c: inputs() object
    :return: None; one png per loop is written through dm.make_file_name
    '''
    print('starting cystine for job : ')
    print(c)

    # constants do not depend on the loop, so they are built once
    TRU_LABELS = np.array([
        '7', '8', '9', '9b', '9c', '10', '11', '12', '34', '35', '36', '36b',
        '36c', '37', '38', '39'
    ])
    COLORS = np.array([
        'w', 'r', 'hotpink', 'c', 'm', 'y', 'b', 'silver', 'lime',
        'lightsalmon', 'pink', 'violet', 'aqua', 'peachpuff', 'lightcyan',
        'moccasin'
    ])
    c_pos = 1  # ordinal value that is counted as Cys

    N = []
    loops = []
    for sl in os.listdir(path='./sampling_data/' + dm.make_directory(c=c)):
        if sl.startswith('sequences_loop') and sl.endswith('pkl'):
            df = dm.read_pickle(c=c, file_description=sl[0:-4])
            N.append(sm.convert2numpy(df=df, field='Ordinal'))
            # strip the 'sequences_loop_' prefix and the '.pkl' suffix
            loops.append(sl[15:-4])

    # largest per-sequence Cys count over all loops, so that every
    # histogram shares the same bins
    c_max = -1
    for n in N:
        loop_max = np.max(np.sum(n == c_pos, axis=1))
        if loop_max > c_max:
            c_max = loop_max

    for n, loop in zip(N, loops):
        b = n == c_pos
        c_nb = np.sum(b, axis=1)
        fig, ax = plt.subplots(1, 2, figsize=[10, 4], dpi=300)
        # bin edges at -0.5, 0.5, ..., c_max + 0.5: one bin centred on each
        # count 0..c_max (stopping at c_max - 0.5 would drop the sequences
        # that hold the maximum count)
        ax[0].hist(x=c_nb,
                   bins=np.arange(0, c_max + 2) - 0.5,
                   ec='k',
                   rwidth=0.7,
                   color='k')
        ax[0].set_xlabel('Number of Cys in Sequence')
        ax[0].set_ylabel('count')
        if c.mutation_type == 'dynamic':
            title = c.mutation_type + ' Loop %s' % loop
        else:
            title = c.mutation_type + ':%i Loop %s' % (c.nb_mutations, loop)
        ax[0].set_title(title)

        label = []
        percentage = []
        nb_seq = []
        # TODO: put average yield in here too, plus standard of deviation
        for n_c in np.unique(b, axis=0):
            # share of sequences showing exactly this Cys pattern
            share = np.count_nonzero((b == n_c).all(axis=1)) / n.shape[0] * 100
            percentage.append('%0.2f' % share)
            if n_c.any():
                label.append(','.join(TRU_LABELS[n_c].tolist()))
                nb_seq.append(str(np.count_nonzero(n_c)))
            else:
                label.append('Non-Cys')
                nb_seq.append('0')

        # make the table, most frequent pattern first
        rows = sorted(([l, p, ns]
                       for l, p, ns in zip(label, percentage, nb_seq)),
                      key=lambda row: float(row[1]),
                      reverse=True)
        col_labels = ['label', '% of total', '# cys in sequence']
        # last column is coloured by the number of Cys; the modulo keeps the
        # lookup inside the palette even if every position is Cys
        colors = [['w', 'w', COLORS[int(row[2]) % len(COLORS)]]
                  for row in rows]

        # set_title must be called; assigning to it only replaced the method
        ax[1].set_title('LeaderBoard')
        ax[1].set_axis_off()
        ax[1].table(cellText=rows,
                    colLabels=col_labels,
                    cellColours=colors,
                    colColours=['palegreen'] * len(col_labels),
                    cellLoc='center',
                    loc='upper left')
        fig.savefig(
            dm.make_file_name(c=c,
                              file_description='cys_loop_%s' % loop,
                              fileformat='png'))
        plt.close(fig)
import ns_nested_sampling as ns
from input_deck import inputs
import numpy as np
import ns_main_sampling as ms
import ns_data_modules as dm
import pandas as pd
import ns_sampling_modules as sm
import matplotlib.pyplot as plt
import matplotlib as mpl

# file-only backend: the script just saves a png
mpl.use('Agg')

# Benchmark: average time of a single step, for the first and second loop,
# as a function of the number of sequences.
A = np.array([1000, 5000, 10000, 20000, 30000, 40000, 50000])
t = []
t2 = []
for a in A:
    c = inputs(Nb_sequences=a, nb_loops=2, nb_steps=8, nb_snapshots=2)
    df = ms.driver(c=c)
    # the driver's result is read by loop, one field per loop
    times = sm.convert2numpy(df=df, field='1th loop')
    times2 = sm.convert2numpy(df=df, field='2th loop')
    t.append(np.average(times))
    t2.append(np.average(times2))

# todo : include standard of deviation.
plt.plot(A.tolist(), t, label='loop 1')
plt.plot(A.tolist(), t2, label='loop 2')
plt.title('number of sequences vs runtime for a single step')
plt.ylabel('runtime for a single step (sec)')
plt.xlabel('number of sequences')
# without a legend the two labelled curves cannot be told apart
plt.legend()
plt.savefig('./sampling_data/time_stats_improved_get_yield.png')
plt.close()
def violin_saved_dataset(c, loops_2_show, y_lim=None):
    '''
    Draw one violin per requested loop, with its threshold, and save the plot.

    Each violin shows the distribution of c.yield2optimize in the saved
    'sequences_loop_<n>' file; a red line and a grey number mark the minimum
    yield stored for that loop.

    :param c: inputs() object
    :param loops_2_show: Nx1 ndarray array of loops to show in violin plot
    :param y_lim: 2x1 list . Max and min of y limits [default: [-1.5, 3]]
    :return: None; saves the violin plot of the loops specified in
        loops_2_show
    '''
    # nb strings is the number of strings in the violin plot
    if y_lim is None:
        y_lim = [-1.5, 3]

    # paths are passed positionally: the keyword name of read_pickle's first
    # parameter differs between pandas versions
    df_pp = pd.read_pickle(dm.make_file_name(c=c, file_description=fn.pp_fn))
    # NOTE(review): pp is loaded but not used anywhere below
    pp = sm.convert2numpy(df=df_pp, field=fn.pp_fn)
    df_min_yield = pd.read_pickle(
        dm.make_file_name(c=c, file_description=fn.min_yield_fn))
    min_yield = sm.convert2numpy(df=df_min_yield, field=fn.min_yield_fn)

    fig, ax = plt.subplots(1, 1, figsize=[5, 3], dpi=300)

    labels = []
    for slot, n in enumerate(loops_2_show):
        df = dm.read_pickle(c=c, file_description='sequences_loop_' + str(n))
        dev = sm.convert2numpy(df=df, field=c.yield2optimize)
        violin_parts = ax.violinplot([dev],
                                     positions=[slot],
                                     showmedians=False,
                                     showextrema=False,
                                     points=100,
                                     widths=.9)
        for pc in violin_parts['bodies']:
            pc.set_color('k')
        # TODO: figure out why min yield and pp are off by 2 idexes ,look to nested_sampling
        labels.append('Loop: %i' % n)

    nb_strings = loops_2_show.shape[0]
    ax.set_xticks(np.arange(nb_strings))
    ax.set_ylim(y_lim)
    ax.set_xticklabels(labels)
    ax.set_ylabel('Yield', fontsize=6)
    ax.tick_params(axis='both', which='major', labelsize=6)
    ax.set_title('Nested Sampling Loops: %i, Random Walk Steps: %i ' %
                 (np.max(loops_2_show), c.nb_steps))

    # threshold marker spanning the full width of each violin slot
    for slot, k in enumerate(loops_2_show):
        x = [slot - 0.5, slot + 0.5]
        y = [min_yield[k - 1], min_yield[k - 1]]
        ax.plot(x, y, '-r', linewidth=0.5, label='Threshold')
        ax.text(np.average(x),
                np.average(y) - 0.02,
                '%0.2f' % min_yield[k - 1],
                fontsize=6,
                color='grey',
                horizontalalignment='center',
                verticalalignment='top')
    ax.legend(['Threshold'])
    fig.tight_layout()

    out_name = dm.make_file_name(
        c=c,
        file_description='violin_plot_nb_strings_%i' % nb_strings,
        fileformat='png')
    print('saving ' + out_name)
    fig.savefig(out_name)
    plt.close(fig)
import pandas as pd
import ns_sampling_modules as sm
import numpy as np

# Print the 'Ordinal' sequence with the highest 'Developability' found in
# one saved loop of a specific run.
df = pd.read_pickle(
    './sampling_data/Nb_sequences_1000_Nbsteps_5_Nb_loops_100000_dynamic_10/sequences_loop_60001.pkl'
)
y = sm.convert2numpy(df, 'Developability')
idx = np.argmax(y)
# np.argmax yields a row position, so the lookup has to be positional
# (.iloc); label-based .loc only agrees when the index is 0..n-1
print(df['Ordinal'].iloc[idx])