def get_bvms(label, msa_file, source, dest, A, num_seqs):
    """Compute the bvms of one MSA and save them as a ``.npy`` file.

    Loads the alignment ``source/msa_file``, keeps at most the first
    ``num_seqs`` sequences, hands them to ``compute_bvms`` and stores the
    result with ``np.save`` in ``dest/bvms_<label>.npy``.

    Args:
        label: Tag used in the output file name and in the progress messages.
        msa_file: File name of the alignment inside ``source``.
        source: Directory that contains ``msa_file``.
        dest: Directory the ``.npy`` file is written to.
        A: Forwarded unchanged to ``compute_bvms``
            (presumably the alphabet size -- confirm against ``compute_bvms``).
        num_seqs: Slice end applied to the loaded sequences.

    Returns:
        The bare output file name ``bvms_<label>.npy`` (without ``dest``).
    """
    print("inside get_bvms_phylo, calling get_bvms on ", label)

    # Source and destination directories are chosen by the caller per label.
    load_name = source + "/" + msa_file
    bvms_file_name = "bvms_" + label + ".npy"
    save_name = dest + "/" + bvms_file_name

    # Announce the import *before* it starts so a slow or failing load is
    # attributable from the log.
    print("\t\t\t\timporting msa for:\t", label, "\t", load_name)
    msa = seqload.loadSeqs(load_name)[0][:num_seqs]
    print("\t\t\t\tfinished msa import for:\t", label)

    print("\t\t\t\tcomputing bvms for:\t", label)
    bvms = compute_bvms(msa, A, '0')
    np.save(save_name, bvms)
    print("\t\t\t\tfinished computing bvms for:\t", label)
    return bvms_file_name
    def plot_hams_loglog(self):
        """Plot the normalised Hamming-distance curve of every enabled model.

        For each entry of ``self.vis_seqs`` that is neither listed in
        ``self.skip`` nor disabled in ``self.which_models``, the sequences are
        loaded from ``self.msa_dir``, truncated to ``self.keep_hams``, and
        their ``histsim`` histogram is reversed with its first entry dropped.
        The curve is normalised through ``self.norm_x`` / ``self.norm_y`` and
        drawn on a shared axis. The series labelled ``"Target"`` is drawn
        dashed and shown in the legend as ``"Nat-Target"`` or
        ``"Synth-Target"`` depending on ``self.synth_nat``.

        The figure is written to
        ``<output_dir>/loglog_ham_<name>_<synth_nat>_<which_size>.pdf``.

        Returns:
            None.
        """
        print("MAKING LOGLOG HAMS")
        _fig, ax = pylab.subplots(figsize=(self.fig_size, self.fig_size))
        xlabel = r"ln($d$/$d_{Mo}$)"
        ylabel = r"ln($f$/$f_{max}$)"

        for label, seqs_file in self.vis_seqs.items():
            if label in self.skip:
                print("skipping ", label)
                continue
            if not self.which_models[label]:
                # Model is switched off in which_models.
                continue
            label = self.label_dict[label]
            seqs_path = self.msa_dir + "/" + seqs_file
            print("computing hams for:\t", label, "\t\t\tin:\t" + seqs_path)
            seqs = loadSeqs(seqs_path, names=self.ALPHA)[0][0:self.keep_hams]

            # Reverse the histogram and drop its first entry, so the remaining
            # bins are paired with the distances 1..len(h) below.
            h = histsim(seqs).astype(float)[::-1][1:].tolist()
            hams = np.arange(1, len(h) + 1, 1)

            d_norm_x = self.norm_x(hams, h)
            d_norm_y = self.norm_y(h)

            shared_style = dict(
                linewidth=self.line_width,
                alpha=self.line_alpha,
                color=self.color_set[label],
                zorder=self.z_order[label],
            )
            if label == "Target":
                if "nat" in self.synth_nat:
                    target_label = "Nat-Target"
                else:
                    target_label = "Synth-Target"
                ax.plot(d_norm_x, d_norm_y, linestyle="dashed", dashes=(1, 1),
                        label=target_label, **shared_style)
            else:
                ax.plot(d_norm_x, d_norm_y, linestyle="solid",
                        label=label, **shared_style)

        pylab.ylim(-8, 1)
        pylab.xlim(-0.5, 0.26)
        pylab.ylabel(ylabel, fontsize=self.label_size-1)
        pylab.xlabel(xlabel, fontsize=self.label_size-1)
        x_tick_range = np.arange(-0.5, 0.5, 0.25)
        y_tick_range = np.arange(-8, 1, 2)
        pylab.xticks(x_tick_range, rotation=45)
        pylab.yticks(y_tick_range)
        pylab.tick_params(direction='in', axis='both', which='major', labelsize=self.tick_size,
                          length=self.tick_length, width=self.tick_width)
        pylab.tight_layout()
        pylab.legend(fontsize=self.tick_size-3, loc="best", frameon=False)

        # No leading "/" in the file name: the separator is added once below.
        file_name = "loglog_ham_" + self.name + "_" + self.synth_nat + "_" + self.which_size + ".pdf"
        save_name = self.output_dir + "/" + file_name
        pylab.savefig(save_name, dpi=self.dpi, format='pdf')
        pylab.close()
示例#3
0
def compute_hams_dist(gen_seqs_file, label, data_home, parent_dir_name, name, keep):
    """Write the pairwise Hamming-distance histogram of an MSA to a CSV file.

    The first ``keep`` sequences are loaded (from
    ``parent_dir_name/<name>_<label>`` when ``label == "randomSeqs"``,
    otherwise from ``data_home/gen_seqs_file``), the Hamming distance of
    every unordered pair is computed, and the number of pairs at each
    distance is saved as ``hamdist_<label>_<name>_<keep>.csv`` in
    ``parent_dir_name`` with the columns ``ham`` and ``freq``.

    Rows are sorted by ``ham``. Distances ``1..L`` (``L`` = number of
    columns of the MSA) are always present, with ``freq`` 0 when unobserved;
    distance ``0`` is listed only when at least one identical pair exists.

    Args:
        gen_seqs_file: Alignment file name inside ``data_home``
            (ignored for ``label == "randomSeqs"``).
        label: Data-set tag; selects the load path and is used in the
            output file name.
        data_home: Directory of ``gen_seqs_file``.
        parent_dir_name: Directory of the random-sequence file and of the
            CSV output.
        name: Run name used in the input and output file names.
        keep: Maximum number of sequences to use (slice end).

    Returns:
        The bare CSV file name (without ``parent_dir_name``).
    """
    print("\t\t\t\tcompute_hams() for:")
    print("\t\t\t\t\t" + label + "\tkeep:\t" + str(keep))

    if label == "randomSeqs":
        load_name = parent_dir_name + "/" + name + "_" + label
        print("\t\t\t\t\t\trandom seqs")
        print("\t\t\t\t\t\tloading seqs file for hams:\t", load_name)
        seqs = loadSeqs(load_name)[0][0:keep]
    else:
        seqs = loadSeqs(data_home + "/" + gen_seqs_file)[0][0:keep]

    N, L = seqs.shape
    Npairs = N * (N - 1) // 2
    hamming = np.empty(Npairs, dtype='i4')

    # Fill the condensed distance vector one row at a time: sequence i is
    # compared against all sequences after it.
    c = 0
    for i in range(N - 1):
        n_after = N - i - 1
        hamming[c:c + n_after] = np.sum(seqs[i, :] != seqs[i + 1:, :], axis=1)
        c += n_after

    # Count every distance 0..L in one vectorised pass; unobserved distances
    # come out as 0 automatically.
    counts = np.bincount(hamming, minlength=L + 1)
    hams = np.arange(counts.size)
    # Distance 0 is reported only when it actually occurs.
    reported = (hams >= 1) | (counts > 0)

    df = pd.DataFrame({'ham': hams[reported], 'freq': counts[reported]})
    hamdist_file_name = "hamdist_" + label + "_" + name + "_" + str(keep) + ".csv"
    df.to_csv(parent_dir_name + "/" + hamdist_file_name, index=False)

    return hamdist_file_name
#!/usr/bin/env python
import numpy as np
from numpy import random
import sys, os
from mi3gpu.utils import seqload
from multiprocessing import Pool, set_start_method, Lock
from highmarg import highmarg
set_start_method('fork')

###############################################################################
# Parse the command line and load every alignment up front.
#
# Positional arguments:
#   1      reps (int)
#   2-9    paths of the target, ref, mi3, random, indep, truncated-ref,
#          truncated-target and deep sequence files
#   10     parent directory name
#   11-12  start / end (int)
#   13     synth_nat tag


def _load_msa(arg_pos):
    """Return item 0 of ``seqload.loadSeqs`` for the path in ``sys.argv[arg_pos]``."""
    return seqload.loadSeqs(sys.argv[arg_pos])[0]


print("starting homs_script")
seq_lim = 10000
reps = int(sys.argv[1])

targetSeqs = _load_msa(2)
refSeqs = _load_msa(3)
mi3Seqs = _load_msa(4)
randomSeqs = _load_msa(5)
indepSeqs = _load_msa(6)
# Original author's note: this has only 5.99M from mi3Seqs.
ref_trunc = _load_msa(7)
# Original author's note: this has only 10K from mi3Seqs.
target_trunc = _load_msa(8)
print("loaded target_trunc")
deepSeqs = _load_msa(9)

# A progenSeqs load from argv[10] is disabled; argv[10] is the parent
# directory name.
parent_dir_name = sys.argv[10]
start, end = int(sys.argv[11]), int(sys.argv[12])
synth_nat = sys.argv[13]
    def plot_hams(self):
        """Plot the Hamming-distance distribution of every enabled model.

        For each entry of ``self.vis_seqs`` that is neither listed in
        ``self.skip`` nor disabled in ``self.which_models``, the sequences are
        loaded from ``self.msa_dir``, truncated to ``self.keep_hams``, and
        their ``histsim`` histogram is normalised to sum to 1 and plotted in
        reversed order. The series labelled ``"Target"`` is drawn dashed and
        shown in the legend as ``"Nat-Target"`` or ``"Synth-Target"``
        depending on ``self.synth_nat``.

        The total variation distance (half the summed absolute difference of
        the normalised histograms) between the target and every other series
        is printed. The target is the last series whose label contains
        ``"arget"``; when there is none the TVD step is skipped.

        The x-axis range and ticks are fixed only for
        ``self.protein == "Kinase"``; for any other protein matplotlib's
        automatic ticks are kept.

        The figure is written to
        ``<output_dir>/ham_<name>_<synth_nat>_<which_size>.pdf``.

        Returns:
            None.
        """
        print("plotting normal hams")
        _fig, ax = pylab.subplots(figsize=(self.fig_size, self.fig_size))

        # axes labels
        xlabel = r"$d$"
        ylabel = "f"

        # Only the Kinase setup has a hand-picked x range; None means "leave
        # the automatic ticks alone" (previously this was a NameError).
        x_tick_range = None
        if self.protein == "Kinase":
            start = 120
            end = 230
            x_tick_range = np.arange(start, end, 20)
            pylab.xlim(start, end)

        all_freqs = dict()

        for label, seqs_file in self.vis_seqs.items():
            if label in self.skip:
                print("skipping ", label)
                continue
            if not self.which_models[label]:
                # Model is switched off in which_models.
                continue
            label = self.label_dict[label]
            print("computing hams for:\t", label)
            seqs = loadSeqs(self.msa_dir + "/" + seqs_file, names=self.ALPHA)[0][0:self.keep_hams]
            h = histsim(seqs).astype(float)
            h = h / np.sum(h)
            all_freqs[label] = h
            rev_h = h[::-1]

            shared_style = dict(
                linewidth=self.line_width,
                alpha=self.line_alpha,
                color=self.color_set[label],
                zorder=self.z_order[label],
            )
            if label == "Target":
                if "nat" in self.synth_nat:
                    target_label = "Nat-Target"
                else:
                    target_label = "Synth-Target"
                ax.plot(rev_h, linestyle="dashed", dashes=(1, 1),
                        label=target_label, **shared_style)
            else:
                ax.plot(rev_h, linestyle="solid", label=label, **shared_style)

        print("all_freqs")
        print(all_freqs.keys())

        # The last label containing "arget" is treated as the target.
        target_key = None
        for data_label in all_freqs:
            if 'arget' in data_label:
                target_key = data_label

        tvds = dict()
        if target_key is None:
            print("no target distribution found; skipping TVD computation")
        else:
            # Re-key the target under the canonical name "Target".
            all_freqs["Target"] = all_freqs.pop(target_key)
            target_freq = all_freqs["Target"]
            for data_label, f in all_freqs.items():
                if data_label != 'Target':
                    tvds[data_label] = round(np.sum(np.abs(target_freq - f)) / 2, 4)

        print(tvds)
        y_tick_range = np.arange(0.0, 0.08, 0.02)
        pylab.ylabel(ylabel, fontsize=self.label_size)
        pylab.xlabel(xlabel, fontsize=self.label_size)
        if x_tick_range is not None:
            pylab.xticks(x_tick_range, rotation=45)
        else:
            # Keep the automatic tick positions but rotate the labels as usual.
            pylab.xticks(rotation=45)
        pylab.yticks(y_tick_range)
        pylab.tick_params(direction='in', axis='both', which='major', labelsize=self.tick_size,
                          length=self.tick_length, width=self.tick_width)
        file_name = "ham_" + self.name + "_" + self.synth_nat + "_" + self.which_size + ".pdf"
        pylab.tight_layout()
        pylab.legend(fontsize=self.tick_size-3, loc="upper left", frameon=False)
        save_name = self.output_dir + "/" + file_name
        print(save_name)
        pylab.savefig(save_name, dpi=self.dpi, format='pdf')
        pylab.close()