Exemplo n.º 1
0
    def update(self, min_yield):
        '''
        Accept every trial sequence whose yield is strictly above the threshold.

        :param min_yield: current threshold compared against the
            ``self.yield2optimize`` values of ``self.test_seq``
        :return: fraction of sequences that were accepted in this call

        Side effects: the accepted 'Ordinal' rows and yield values are written
        into ``self.df``; ``self.test_seq`` is then reset to a copy of the
        'Ordinal' column of the updated ``self.df``.
        '''
        # work on numpy arrays so acceptance is a single masked assignment
        trial_seqs = sm.convert2numpy(self.test_seq)
        kept_seqs = sm.convert2numpy(self.df)
        trial_yield = sm.convert2numpy(self.test_seq, self.yield2optimize)
        kept_yield = sm.convert2numpy(self.df, self.yield2optimize)

        # True where the trial sequence beats the threshold
        accepted = trial_yield > min_yield

        kept_seqs[accepted, :] = np.copy(trial_seqs[accepted, :])
        kept_yield[accepted] = np.copy(trial_yield[accepted])

        # write the merged state back onto the instance
        self.df['Ordinal'] = sm.convert2pandas(kept_seqs)
        self.df[self.yield2optimize] = kept_yield
        self.test_seq = self.df.copy()[['Ordinal']]

        nb_accepted = np.count_nonzero(accepted)
        return nb_accepted / accepted.shape[0]
Exemplo n.º 2
0
def violin_loop_plots(c, loops_2_show=None, nb_strings=None):
    '''
    make violin plots of distribution of yield for sequences

    When ``loops_2_show`` is not given, the completed loops are read from the
    run's ``fn.loops_done_fn`` pickle, thinned to roughly ``nb_strings`` evenly
    spaced entries, and the last completed loop is always appended.

    :param c: inputs() object
    :param loops_2_show: ndarray of loops that were saved in run specified by 'c'
    :param nb_strings: number of strings to have in the violin plot  [default: 6];
        only used when ``loops_2_show`` is None
    :return: saves the violin plot
    :raises ValueError: if ``nb_strings`` is smaller than 1
    '''
    print('making violin plots')
    if loops_2_show is None:
        if nb_strings is None:
            nb_strings = 6
        if nb_strings < 1:
            # 0 used to fail with a ZeroDivisionError and negative values
            # silently reversed the selection
            raise ValueError('nb_strings must be a positive integer, got %r' % (nb_strings,))

        loops_done = dm.read_pickle(c=c, file_description=fn.loops_done_fn)
        # stored values are shifted by one before use
        # NOTE(review): presumably 0-based loop indices -> 1-based file numbers; confirm
        completed = sm.convert2numpy(df=loops_done,
                                     field=fn.loops_done_fn) + 1

        # keep every step-th completed loop so about nb_strings violins are drawn
        step = max(1, completed.shape[0] // nb_strings)
        loops_2_show = completed[::step]

        # the striding can skip the final loop; always show it
        last_loop = np.max(completed)
        if last_loop not in loops_2_show:
            loops_2_show = np.hstack((loops_2_show, np.array([last_loop])))

    pm.violin_saved_dataset(c=c, loops_2_show=loops_2_show)
Exemplo n.º 3
0
    def update(self, min_yield):
        '''
        Accept every trial sequence whose 'Developability' is strictly above
        the threshold and hand the merged result to
        ``self.save_testseq_2_original_seq``.

        :param min_yield: current threshold
        :return: fraction of sequences whose trial version was accepted
        '''
        print('updating the sequences based on last minimum yield')
        print('current minimum yield is  %0.2f' % min_yield)

        # numpy views of the trial and the currently kept population
        trial_seqs = sm.convert2numpy(self.test_seq)
        kept_seqs = sm.convert2numpy(self.original_seq)
        trial_dev = sm.convert2numpy(self.test_seq, 'Developability')
        kept_dev = sm.convert2numpy(self.original_seq, 'Developability')

        # True where the trial sequence beats the threshold
        accepted = trial_dev > min_yield
        kept_seqs[accepted, :] = np.copy(trial_seqs[accepted, :])
        kept_dev[accepted] = np.copy(trial_dev[accepted])

        # persists the merged arrays on the instance (mutates self)
        self.save_testseq_2_original_seq(kept_dev, kept_seqs)

        nb_accepted = np.count_nonzero(accepted)
        return nb_accepted / accepted.shape[0]
Exemplo n.º 4
0
    def walk(self,min_yield,nb_mutations):
        '''
        Run ``self.nb_steps`` rounds of mutate -> evaluate -> accept.

        :param min_yield: threshold handed to ``self.update`` at every step
        :param nb_mutations: forwarded to ``self.multiple_mutate``
        :return: tuple of (smallest ``self.yield2optimize`` value in ``self.df``
            after the walk, mean of the per-step values returned by ``self.update``)

        Side effect: ``self.idx`` is set to the position of that smallest value.
        '''
        accepted_fractions = []
        for _ in np.arange(self.nb_steps):
            # propose, score, then keep whatever beats the threshold
            self.multiple_mutate(nb_mutations=nb_mutations)
            self.get_yield()
            accepted_fractions.append(self.update(min_yield))

        # locate the worst sequence of the population after the walk
        lowest_yield = np.min(sm.convert2numpy(df=self.df, field=self.yield2optimize))
        self.idx = np.argmin(sm.convert2numpy(df=self.df, field=self.yield2optimize))

        mean_accepted = np.mean(accepted_fractions)
        return lowest_yield, mean_accepted
Exemplo n.º 5
0
    def mutate(self, random_AA_pos=None):
        '''
        Apply one point mutation to every sequence in ``self.test_seq``.

        :param random_AA_pos: position to change in each sequence; when None it
            is drawn from ``self.g`` as integers in [0, 16)
        :return: None; ``self.test_seq['Ordinal']`` is replaced by the mutated rows
        '''
        # position draw comes first, replacement draw second (same generator)
        if random_AA_pos is None:
            random_AA_pos = self.g.uniform(
                shape=[self.nb_of_sequences],
                minval=0,
                maxval=16,
                dtype=tf.int64,
            ).numpy()

        # replacement residue codes, integers in [0, 21)
        random_AA = self.g.uniform(
            shape=[self.nb_of_sequences],
            minval=0,
            maxval=21,
            dtype=tf.int64,
        ).numpy()

        ordinal_rows = sm.convert2numpy(df=self.test_seq, field='Ordinal')

        # let the helper adjust the drawn residues; its console output is silenced
        with dm.suppress_stdout():
            random_AA = sm.remove_blanks(
                generator=self.g,
                random_AA_pos=random_AA_pos,
                random_AA=random_AA,
                seq=ordinal_rows,
            )

        # write one replacement per row, row by row
        mutated_rows = []
        for row, new_aa, pos in zip(ordinal_rows, random_AA, random_AA_pos):
            row[pos] = new_aa
            mutated_rows.append(row)

        self.test_seq['Ordinal'] = mutated_rows
Exemplo n.º 6
0
def compare(C,field2Show):
    '''
    compare fields across runs
    make sure c.Nb_sequences and c.nb_loops is the same for all runs.

    One curve per run is drawn on pyplot's current figure, which is then saved
    under ./sampling_data/comparisons/ and closed.

    :param C: list of runs
    :param field2Show: what thing to show ?
    :return: None
    :raises SystemError: when a run differs from the first one in
        Nb_sequences or nb_loops
    '''
    # check every run before drawing, so a mismatch cannot leave a
    # half-drawn figure behind on pyplot's current figure
    for c in C:
        if C[0].Nb_sequences != c.Nb_sequences or C[0].nb_loops != c.nb_loops:
            raise SystemError('Your sequences and loops are not the same for comparing models.')

    for c in C:
        # path is passed positionally: the keyword name of read_pickle's first
        # parameter is not the same across pandas versions
        df = pd.read_pickle(dm.make_file_name(c=c, file_description=field2Show))
        stat = sm.convert2numpy(df=df, field=field2Show)

        if c.mutation_type == 'dynamic':
            nm = ''
        else:
            nm = ' ,# mutations: %i' % c.nb_mutations
        plt.plot(np.arange(stat.shape[0]).tolist(), stat, label=c.mutation_type + ' ' + nm)

    plt.title('%s vs nested sample loop :%i' % (field2Show, C[0].Nb_sequences))
    plt.ylabel(field2Show)
    plt.xlabel('loop number')
    plt.legend()

    # create the output directory without a shell; no error if it already exists
    out_dir = './sampling_data/comparisons'
    os.makedirs(out_dir, exist_ok=True)
    out_file = '%s/%s_nb_loops_%i_nb_sequences_%i' % (
        out_dir, field2Show, C[0].nb_loops, C[0].Nb_sequences)
    print('saving ' + out_file)
    plt.savefig(out_file)
    plt.close()
Exemplo n.º 7
0
def comparisons(C, field2Show):
    '''
    Plot one saved per-loop statistic for several runs on a single figure.

    :param C: list of runs
    :param field2Show: what thing to show ?
    :return: None; the figure is saved as '<field2Show>_plot' (png) using the
        file-name settings of the first run and then closed
    '''

    for c in C:
        # path is passed positionally: the keyword name of read_pickle's first
        # parameter is not the same across pandas versions
        df = pd.read_pickle(
            dm.make_file_name(c=c, file_description=field2Show))
        stat = sm.convert2numpy(df=df, field=field2Show)

        if c.mutation_type == 'dynamic':
            nm = ''
        else:
            nm = ' ,# mutations: %i' % c.nb_mutations
        plt.plot(
            np.arange(stat.shape[0]).tolist(),
            stat,
            label=c.mutation_type + ' ' + nm,
        )
    plt.title('%s vs nested sample loop' % field2Show)
    plt.ylabel(field2Show)
    plt.xlabel('loop number')
    plt.legend()
    # the description used to be the literal '%_plot' (format string never
    # interpolated); it now carries the field name
    # NOTE(review): the plot is named after the first run only - confirm that
    # is the intended location for a multi-run comparison
    plt.savefig(
        dm.make_file_name(c=C[0],
                          file_description='%s_plot' % field2Show,
                          fileformat='png'))
    plt.close()
Exemplo n.º 8
0
    def change_lowest_yield_sequence_configuration(self):
        '''
        Overwrite the sequence at ``self.idx`` with a copy of another,
        randomly chosen sequence of the population.

        The donor index is drawn from ``self.g`` in [0, nb_of_sequences) and
        redrawn until it differs from ``self.idx``. Both the 'Ordinal' row and
        the ``self.yield2optimize`` value of the donor are copied.

        :return: None; updates self.df['Ordinal'] and self.df[self.yield2optimize]
        :raises ValueError: when the population holds fewer than two sequences,
            because no donor different from ``self.idx`` can exist
        '''
        if self.nb_of_sequences < 2:
            # the redraw loop below could never terminate in this case
            raise ValueError(
                'need at least two sequences to replace one, got %r' % (self.nb_of_sequences,))

        # rejection-sample a donor index different from the one being replaced
        change_2_seq = self.idx
        while change_2_seq == self.idx:
            change_2_seq = self.g.uniform(shape=[1], minval=0, maxval=self.nb_of_sequences,  # [0,nb_of_sequences)
                                          dtype=tf.int64).numpy()[0]

        orginal_array = sm.convert2numpy(df=self.df)
        orginal_array[self.idx, :] = orginal_array[change_2_seq, :].copy()
        # TODO : optimize in pandas to change one sequence without changing everything
        self.df['Ordinal'] = sm.convert2pandas(orginal_array)

        # keep the stored yield consistent with the copied sequence
        dev = sm.convert2numpy(df=self.df, field=self.yield2optimize)
        dev[self.idx] = dev[change_2_seq]
        self.df[self.yield2optimize] = sm.convert2pandas(dev)
Exemplo n.º 9
0
def twinAxisvsLoops(c,fields2show):
    '''
    Draw two saved per-loop statistics against the loop number on twin y axes.

    :param c: inputs() object
    :param fields2show: list of two field names; element 0 is drawn in blue on
        the right axis, element 1 in red on the left axis (its last value is
        dropped before plotting)
    :return: None; saves the figure as '<field0>_vs_<field1>_plot' (png)
    '''
    right_field = fields2show[0]
    left_field = fields2show[1]

    right_values = sm.convert2numpy(
        df=dm.read_pickle(c=c, file_description=right_field),
        field=right_field)
    left_values = sm.convert2numpy(
        df=dm.read_pickle(c=c, file_description=left_field),
        field=left_field)

    # x axis follows the length of the first field
    loop_range = np.arange(right_values.shape[0]).tolist()
    fig, left_ax = plt.subplots(1, 1, figsize=[5, 3], dpi=300)

    # left axis: second field, in red
    left_color = 'tab:red'
    left_ax.set_xlabel('loop nb')
    left_ax.set_ylabel(left_field, color=left_color)
    left_ax.plot(loop_range, left_values[:-1], color=left_color)
    left_ax.tick_params(axis='y', labelcolor=left_color)

    # right axis shares the x axis: first field, in blue
    right_ax = left_ax.twinx()
    right_color = 'tab:blue'
    right_ax.set_ylabel(right_field, color=right_color)
    right_ax.plot(loop_range, right_values, color=right_color)
    right_ax.tick_params(axis='y', labelcolor=right_color)

    # without this the right y-label is slightly clipped
    fig.tight_layout()
    out_file = dm.make_file_name(
        c=c,
        file_description='%s_vs_%s_plot' % (right_field, left_field),
        fileformat='png')
    plt.savefig(out_file)
    plt.close(fig)
Exemplo n.º 10
0
def make_heat_map(df,c,loop_nb):
    '''
    Build the per-position frequency of each ordinal code and plot it.

    :param df: dataframe with ordinal field
    :param c: inputs() object
    :param loop_nb: the loop number in the run
    :return: None; the frequency matrix (16 positions x 21 codes) is handed to
        heat_map_plot together with c and loop_nb
    '''
    nb_positions = 16
    nb_AA = 21
    ordinals = sm.convert2numpy(df=df, field='Ordinal')

    # counts[p, k] = number of sequences carrying code k at position p
    counts = np.zeros((nb_positions, nb_AA))
    for code in np.arange(nb_AA):
        counts[:, code] = np.sum(ordinals == code, axis=0)

    # normalise each position (row) by its total count
    frequency = counts / np.sum(counts, axis=1)[:, np.newaxis]

    heat_map_plot(frequency=frequency, c=c, loop_nb=loop_nb)
Exemplo n.º 11
0
def showFieldvsLoops(c, field2Show):
    '''
    Plot one saved per-loop statistic of a single run against the loop number.

    :param c: inputs() object
    :param field2Show: what thing to show ?
    :return: None; saves the plot as '<field2Show>_plot' (png) for run c and
        closes the figure
    '''

    # path is passed positionally: the keyword name of read_pickle's first
    # parameter is not the same across pandas versions
    df = pd.read_pickle(dm.make_file_name(c=c, file_description=field2Show))
    stat = sm.convert2numpy(df=df, field=field2Show)

    # the legend entry names the mutation type, plus the mutation count when
    # the type is not 'dynamic'
    if c.mutation_type == 'dynamic':
        nm = ''
    else:
        nm = ' ,# mutations: %i' % c.nb_mutations
    plt.plot(np.arange(stat.shape[0]).tolist(), stat, label=c.mutation_type + ' ' + nm)
    plt.title('%s vs nested sample loop' % field2Show)
    plt.ylabel(field2Show)
    plt.xlabel('loop number')
    plt.legend()
    plt.savefig(dm.make_file_name(c=c, file_description='%s_plot' % field2Show, fileformat='png'))
    plt.close()
Exemplo n.º 12
0
    def init_yield(self):
        '''
        Score the current population and report its lowest yield.

        :return: smallest ``self.yield2optimize`` value in ``self.df`` after
            ``self.df`` has been replaced by a copy of ``self.get_yield(self.df)``
        '''
        scored = self.get_yield(self.df)
        self.df = scored.copy()

        yields = sm.convert2numpy(df=self.df, field=self.yield2optimize)
        return np.min(yields)
Exemplo n.º 13
0
def cys_stuff(c):
    '''
    For every saved 'sequences_loop_*.pkl' of a run, save a two-panel figure:
    a histogram of the number of Cys per sequence and a table ('LeaderBoard')
    of the Cys position patterns ranked by how often they occur.

    :param c: inputs() object
    :return: returns png files of already saved loops and their corresponding cystene properties.
    Note this function can be tweaked in order observe any of the Amino Acids. Just need to add
    a parameter c_position.

    '''
    print('starting cystine for job : ')
    print(c)
    seq_loop = os.listdir(path='./sampling_data/' + dm.make_directory(c=c))

    # one ordinal array and one loop label per saved snapshot
    ordinals = []
    loops = []
    for sl in seq_loop:
        if sl.startswith('sequences_loop') and sl.endswith('pkl'):
            df = dm.read_pickle(c=c, file_description=sl[0:-4])
            ordinals.append(sm.convert2numpy(df=df, field='Ordinal'))
            # text between the 'sequences_loop_' prefix and the '.pkl' suffix
            loops.append(sl[15:-4])

    # ordinal code that is counted as Cys
    c_pos = 1

    # largest per-sequence Cys count over all snapshots, so that every
    # histogram shares the same bins
    c_max = -1
    for n in ordinals:
        largest = np.max(np.sum(n == c_pos, axis=1))
        if largest > c_max:
            c_max = largest

    # one unit-wide bin centred on every integer count 0..c_max; the extra
    # edge at c_max + 0.5 keeps sequences with the maximum count in the plot
    bins = np.arange(0, c_max + 2) - 0.5

    # labels of the sequence positions, in column order of the ordinal array
    TRU_LABELS = np.array([
        '7', '8', '9', '9b', '9c', '10', '11', '12', '34', '35', '36',
        '36b', '36c', '37', '38', '39'
    ])
    # table cell colour, indexed by the number of Cys in the pattern
    COLORS = np.array([
        'w', 'r', 'hotpink', 'c', 'm', 'y', 'b', 'silver', 'lime',
        'lightsalmon', 'pink', 'violet', 'aqua', 'peachpuff', 'lightcyan',
        'moccasin'
    ])
    tabel_row = ['label', '% of total', '# cys in sequence']

    for n, loop in zip(ordinals, loops):
        is_cys = n == c_pos
        c_nb = np.sum(is_cys, axis=1)
        fig, ax = plt.subplots(1, 2, figsize=[10, 4], dpi=300)
        ax[0].hist(x=c_nb,
                   bins=bins,
                   ec='k',
                   rwidth=0.7,
                   color='k')
        ax[0].set_xlabel('Number of Cys in Sequence')
        ax[0].set_ylabel('count')
        if c.mutation_type == 'dynamic':
            title = c.mutation_type + ' Loop %s' % loop
        else:
            title = c.mutation_type + ':%i Loop %s' % (c.nb_mutations, loop)
        ax[0].set_title(title)

        # one table row per distinct Cys position pattern:
        # [positions, share of all sequences in %, number of Cys]
        # TODO: put average yield in here too, plus standard of deviation
        rows = []
        for pattern in np.unique(is_cys, axis=0):
            share = np.count_nonzero(
                (is_cys == pattern).all(axis=1)) / n.shape[0] * 100
            if pattern.any():
                label = ','.join(TRU_LABELS[pattern].tolist())
                nb_cys = str(np.count_nonzero(pattern))
            else:
                label = 'Non-Cys'
                nb_cys = '0'
            rows.append([label, '%0.2f' % share, nb_cys])

        # most frequent pattern first
        rows = sorted(rows, key=lambda x: float(x[1]), reverse=True)

        # only the Cys-count column is coloured
        colors = [['w', 'w', COLORS[int(row[2])]] for row in rows]

        # set_title has to be called; assigning to it only replaced the method
        ax[1].set_title('LeaderBoard')
        ax[1].set_axis_off()

        ax[1].table(cellText=rows,
                    colLabels=tabel_row,
                    cellColours=colors,
                    colColours=["palegreen"] * len(tabel_row),
                    cellLoc='center',
                    loc='upper left')

        fig.savefig(
            dm.make_file_name(c=c,
                              file_description='cys_loop_%s' % loop,
                              fileformat='png'))
        plt.close(fig)
Exemplo n.º 14
0
import ns_nested_sampling as ns
from input_deck import inputs
import numpy as np
import ns_main_sampling as ms
import ns_data_modules as dm
import pandas as pd
import ns_sampling_modules as sm
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.use('Agg')

# Timing script: run the sampler for several population sizes and plot the
# average of the '1th loop' and '2th loop' columns returned by ms.driver.
# NOTE(review): those columns are presumably per-step runtimes in seconds
# (that is how the axis is labelled) - confirm against ms.driver.

# population sizes to benchmark
A = np.array([1000, 5000, 10000, 20000, 30000, 40000, 50000])
t = []   # average of the '1th loop' column, one entry per population size
t2 = []  # average of the '2th loop' column, one entry per population size

for a in A:
    c = inputs(Nb_sequences=a, nb_loops=2, nb_steps=8, nb_snapshots=2)
    df = ms.driver(c=c)
    times = sm.convert2numpy(df=df, field='1th loop')
    times2 = sm.convert2numpy(df=df, field='2th loop')
    t.append(np.average(times))
    t2.append(np.average(times2))

# todo : include standard of deviation.
plt.plot(A.tolist(), t, label='loop 1')
plt.plot(A.tolist(), t2, label='loop 2')
plt.title('number of sequences vs runtime for a single step')
plt.ylabel('runtime for a single step (sec)')
plt.xlabel('number of sequences')
# the two curves are labelled, so show the legend that tells them apart
plt.legend()
plt.savefig('./sampling_data/time_stats_improved_get_yield.png')
plt.close()
Exemplo n.º 15
0
def violin_saved_dataset(c,loops_2_show,y_lim=None):
    '''
    Draw one violin per requested loop from the saved 'sequences_loop_<n>'
    files, mark the stored minimum yield of each loop as a red threshold line,
    and save the figure.

    :param c: inputs() object
    :param loops_2_show: Nx1 ndarray array of loops to show in violin plot
    :param y_lim: 2x1 list . Max and min of y limits  [default: [-1.5, 3]]
    :return: makes violin plot of the loops specified in loops2show
    '''
    if y_lim is None:
        y_lim = [-1.5, 3]

    # path is passed positionally: the keyword name of read_pickle's first
    # parameter is not the same across pandas versions
    df_min_yield = pd.read_pickle(dm.make_file_name(c=c, file_description=fn.min_yield_fn))
    min_yield = sm.convert2numpy(df=df_min_yield, field=fn.min_yield_fn)

    # make the figure
    fig, ax = plt.subplots(1, 1, figsize=[5, 3], dpi=300)

    labels = []
    for k, n in enumerate(loops_2_show):
        # one violin per loop, placed at x = k
        df = dm.read_pickle(c=c, file_description='sequences_loop_' + str(n))
        dev = sm.convert2numpy(df=df, field=c.yield2optimize)
        violin_parts = ax.violinplot([dev], positions=[k], showmedians=False,
                                     showextrema=False, points=100,
                                     widths=.9)
        for pc in violin_parts['bodies']:
            pc.set_color('k')
        #TODO: figure out why min yield and pp are off by 2 idexes ,look to nested_sampling
        labels.append('Loop: %i' % n)

    # nb_strings is the number of violins in the plot
    nb_strings = loops_2_show.shape[0]
    ax.set_xticks(np.arange(nb_strings))
    ax.set_ylim(y_lim)
    ax.set_xticklabels(labels)
    ax.set_ylabel('Yield', fontsize=6)
    ax.tick_params(axis='both', which='major', labelsize=6)
    ax.set_title('Nested Sampling Loops: %i, Random Walk Steps: %i ' % (np.max(loops_2_show), c.nb_steps))

    # threshold of each loop: a short red line spanning its violin, with the
    # value printed just below it
    threshold_lines = []
    for k, n in enumerate(loops_2_show):
        x = [k - 0.5, k + 0.5]
        y = [min_yield[n - 1], min_yield[n - 1]]
        threshold_lines.extend(ax.plot(x, y, '-r', linewidth=0.5, label='Threshold'))
        ax.text(np.average(x), np.average(y) - 0.02, '%0.2f' % min_yield[n - 1], fontsize=6, color='grey',
                horizontalalignment='center',
                verticalalignment='top')

    # hand the legend one threshold line explicitly, so the entry is the red
    # line no matter in which order the axes lists its artists
    if threshold_lines:
        ax.legend(handles=threshold_lines[:1], labels=['Threshold'])
    fig.tight_layout()

    out_file = dm.make_file_name(c=c, file_description='violin_plot_nb_strings_%i' % nb_strings, fileformat='png')
    print('saving ' + out_file)
    fig.savefig(out_file)
    plt.close(fig)
Exemplo n.º 16
0
import pandas as pd
import ns_sampling_modules as sm
import numpy as np

# Print the 'Ordinal' sequence with the highest 'Developability' in one
# saved snapshot.
df = pd.read_pickle(
    './sampling_data/Nb_sequences_1000_Nbsteps_5_Nb_loops_100000_dynamic_10/sequences_loop_60001.pkl'
)

y = sm.convert2numpy(df, 'Developability')

# position of the largest value
# NOTE(review): assumes convert2numpy keeps the row order of df - confirm
idx = np.argmax(y)

# np.argmax returns a position, not an index label, so look the row up by
# position; .loc would pick the wrong row for a non-default index
print(df['Ordinal'].iloc[idx])