def get_bvms(label, msa_file, source, dest, A, num_seqs):
    """Compute the bvms array for one MSA and save it as a ``.npy`` file.

    The first ``num_seqs`` sequences of ``source/msa_file`` are loaded with
    ``seqload.loadSeqs``, handed to ``compute_bvms`` and the result is
    written to ``dest/bvms_<label>.npy`` with ``np.save``.

    Args:
        label: Tag identifying the data set; used in the output file name
            and in the progress messages.
        msa_file: File name of the MSA inside ``source``.
        source: Directory the MSA is read from.
        dest: Directory the ``.npy`` file is written to.
        A: Forwarded unchanged to ``compute_bvms`` (presumably the alphabet
            size -- TODO confirm against ``compute_bvms``).
        num_seqs: Upper bound on the number of sequences that are used.

    Returns:
        The bare output file name (``bvms_<label>.npy``), without ``dest``.
    """
    print("inside get_bvms_phylo, calling get_bvms on ", label)

    msa_path = "/".join((source, msa_file))
    bvms_file_name = "bvms_" + label + ".npy"
    out_path = "/".join((dest, bvms_file_name))

    # Only the first element of what loadSeqs returns is used, truncated to
    # at most num_seqs entries.
    alignment = seqload.loadSeqs(msa_path)[0][:num_seqs]

    # NOTE(review): the "importing" / "finished msa import" messages are
    # emitted after the load above has already completed.
    print("\t\t\t\timporting msa for:\t", label, "\t", msa_path)
    print("\t\t\t\tfinished msa import for:\t", label)

    print("\t\t\t\tcomputing bvms for:\t", label)
    result = compute_bvms(alignment, A, '0')
    np.save(out_path, result)
    print("\t\t\t\tfinished computing bvms for:\t", label)

    return bvms_file_name
def plot_hams_loglog(self):
    """Plot the normalised distance histograms of all enabled data sets.

    For every entry of ``self.vis_seqs`` that is neither listed in
    ``self.skip`` nor disabled in ``self.which_models``, the sequences are
    loaded, ``histsim`` is evaluated on them and the curve produced by
    ``self.norm_x`` / ``self.norm_y`` is drawn. The "Target" data set is
    drawn with a dashed line and labelled "Nat-Target" or "Synth-Target"
    depending on ``self.synth_nat``; every other data set is drawn solid.

    The figure is written to
    ``<output_dir>/loglog_ham_<name>_<synth_nat>_<which_size>.pdf``.

    Returns:
        None. The figure is saved to disk and closed.
    """
    print("MAKING LOGLOG HAMS")
    _, ax = pylab.subplots(figsize=(self.fig_size, self.fig_size))
    xlabel = r"ln($d$/$d_{Mo}$)"
    ylabel = r"ln($f$/$f_{max}$)"

    for label, seqs_file in self.vis_seqs.items():
        if label in self.skip:
            print("skipping ", label)
            continue
        if not self.which_models[label]:
            # model is switched off in which_models
            continue

        # From here on `label` is the display label, which is also the key
        # into color_set / z_order.
        label = self.label_dict[label]
        seqs_path = self.msa_dir + "/" + seqs_file
        print("computing hams for:\t", label, "\t\t\tin:\t" + seqs_path)
        seqs = loadSeqs(seqs_path, names=self.ALPHA)[0][0:self.keep_hams]

        # Reverse the histsim output and drop its first entry.
        # NOTE(review): presumably histsim is indexed by pairwise similarity,
        # so this re-indexes by distance and discards d = 0 -- confirm.
        h = histsim(seqs).astype(float)[::-1][1:].tolist()
        hams = np.arange(1, len(h) + 1, 1)
        d_norm_x = self.norm_x(hams, h)
        d_norm_y = self.norm_y(h)

        shared_style = dict(
            linewidth=self.line_width,
            alpha=self.line_alpha,
            color=self.color_set[label],
            zorder=self.z_order[label],
        )
        if label == "Target":
            # The histogram is NOT recomputed here: the previous second
            # histsim() call produced values that were never used.
            if "nat" in self.synth_nat:
                target_label = "Nat-Target"
            else:
                target_label = "Synth-Target"
            ax.plot(d_norm_x, d_norm_y, linestyle="dashed", dashes=(1, 1),
                    label=target_label, **shared_style)
        else:
            ax.plot(d_norm_x, d_norm_y, linestyle="solid", label=label,
                    **shared_style)

    pylab.ylim(-8, 1)
    pylab.xlim(-0.5, 0.26)
    pylab.ylabel(ylabel, fontsize=self.label_size - 1)
    pylab.xlabel(xlabel, fontsize=self.label_size - 1)
    x_tick_range = np.arange(-0.5, 0.5, 0.25)
    y_tick_range = np.arange(-8, 1, 2)
    pylab.xticks(x_tick_range, rotation=45)
    pylab.yticks(y_tick_range)
    pylab.tick_params(direction='in', axis='both', which='major',
                      labelsize=self.tick_size, length=self.tick_length,
                      width=self.tick_width)

    # No leading "/" on the file name: the separator is added once below,
    # so the path no longer contains a doubled slash.
    file_name = ("loglog_ham_" + self.name + "_" + self.synth_nat + "_"
                 + self.which_size + ".pdf")
    pylab.tight_layout()
    pylab.legend(fontsize=self.tick_size - 3, loc="best", frameon=False)
    save_name = self.output_dir + "/" + file_name
    pylab.savefig(save_name, dpi=self.dpi, format='pdf')
    pylab.close()
def compute_hams_dist(gen_seqs_file, label, data_home, parent_dir_name, name, keep):
    """Tabulate all pairwise Hamming distances of a sequence set into a CSV.

    At most ``keep`` sequences are loaded. For every unordered pair the
    number of differing positions is counted, the occurrences of each
    distance are tallied, and distances in ``1..L`` (``L`` = second
    dimension of the loaded array) that never occur are added with a count
    of 0. The table is written to
    ``parent_dir_name/hamdist_<label>_<name>_<keep>.csv`` with the columns
    ``ham`` and ``freq``.

    Args:
        gen_seqs_file: File name inside ``data_home``; ignored when
            ``label`` is "randomSeqs".
        label: Data-set tag. "randomSeqs" switches the input to
            ``parent_dir_name/<name>_randomSeqs``.
        data_home: Directory holding ``gen_seqs_file``.
        parent_dir_name: Output directory (and input directory for
            "randomSeqs").
        name: Run name used in the file names.
        keep: Maximum number of sequences to use.

    Returns:
        The bare name of the CSV file that was written.
    """
    print("\t\t\t\tcompute_hams() for:")
    print("\t\t\t\t\t" + label + "\tkeep:\t" + str(keep))

    if label == "randomSeqs":
        load_name = parent_dir_name + "/" + name + "_" + label
        print("\t\t\t\t\t\trandom seqs")
        print("\t\t\t\t\t\tloading seqs file for hams:\t", load_name)
        seqs = loadSeqs(load_name)[0][0:keep]
    else:
        seqs = loadSeqs(data_home + "/" + gen_seqs_file)[0][0:keep]

    n_seqs, seq_len = seqs.shape

    # Flat buffer with one slot per unordered pair (i < j).
    hamming = np.empty(n_seqs * (n_seqs - 1) // 2, dtype='i4')
    offset = 0
    for row in range(n_seqs - 1):
        n_partners = n_seqs - row - 1
        # Compare sequence `row` against every later sequence in one shot.
        mismatches = seqs[row, :] != seqs[row + 1:, :]
        hamming[offset:offset + n_partners] = np.sum(mismatches, axis=1)
        offset += n_partners

    # Tally distances; keys keep the order in which they first appear.
    ham_counts = dict()
    for dist in hamming:
        ham_counts[dist] = ham_counts.get(dist, 0) + 1

    # impute 0's for missing hams
    for dist in range(1, seq_len + 1):
        ham_counts.setdefault(dist, 0)

    table = pd.DataFrame(ham_counts.items())
    table.columns = ['ham', 'freq']

    hamdist_file_name = "hamdist_" + label + "_" + name + "_" + str(keep) + ".csv"
    table.to_csv(parent_dir_name + "/" + hamdist_file_name, index=False)
    return hamdist_file_name
#!/usr/bin/env python
import numpy as np
from numpy import random
import sys
import os
from mi3gpu.utils import seqload
from multiprocessing import Pool, set_start_method, Lock
from highmarg import highmarg

# Select the 'fork' start method before anything else is done.
set_start_method('fork')


def _load_msa(path):
    """Return the first element of what ``seqload.loadSeqs(path)`` yields."""
    return seqload.loadSeqs(path)[0]


###############################################################################
# load data in
#
# Command-line layout:
#   argv[1]      reps (int)
#   argv[2..9]   MSA files: target, ref, mi3, random, indep, ref_trunc,
#                target_trunc, deep
#   argv[10]     parent_dir_name
#   argv[11..12] start, end (int)
#   argv[13]     synth_nat
print("starting homs_script")
seq_lim = 10000
reps = int(sys.argv[1])

targetSeqs = _load_msa(sys.argv[2])
refSeqs = _load_msa(sys.argv[3])
mi3Seqs = _load_msa(sys.argv[4])
randomSeqs = _load_msa(sys.argv[5])
indepSeqs = _load_msa(sys.argv[6])
# this has only 5.99M from mi3Seqs
ref_trunc = _load_msa(sys.argv[7])
# this has only 10K from mi3Seqs
target_trunc = _load_msa(sys.argv[8])
print("loaded target_trunc")
deepSeqs = _load_msa(sys.argv[9])
# progenSeqs is currently disabled; argv[10] is the parent directory instead.

parent_dir_name = sys.argv[10]
start = int(sys.argv[11])
end = int(sys.argv[12])
synth_nat = sys.argv[13]
def plot_hams(self):
    """Plot the normalised histsim histograms and report TVDs to the target.

    For every entry of ``self.vis_seqs`` that is neither listed in
    ``self.skip`` nor disabled in ``self.which_models``, the sequences are
    loaded, the ``histsim`` output is normalised to sum to 1, reversed and
    plotted. The "Target" data set is drawn dashed and labelled
    "Nat-Target" or "Synth-Target" depending on ``self.synth_nat``; all
    other data sets are drawn solid.

    After plotting, half the summed absolute difference between each
    frequency vector and the target's (the total variation distance,
    rounded to 4 decimals) is printed. If no plotted label contains
    "arget", the TVD step is skipped instead of raising ``KeyError``.

    The fixed x-range 120-230 with ticks every 20 is applied only when
    ``self.protein == "Kinase"``. For any other protein the x-axis is left
    to matplotlib's defaults (previously this case crashed because the
    tick range was never defined).

    The figure is written to
    ``<output_dir>/ham_<name>_<synth_nat>_<which_size>.pdf``.

    Returns:
        None. The figure is saved to disk and closed.
    """
    print("plotting normal hams")
    _, ax = pylab.subplots(figsize=(self.fig_size, self.fig_size))

    # axes labels
    xlabel = r"$d$"
    ylabel = "f"

    # Protein-specific x-axis window; stays None for proteins without one.
    x_tick_range = None
    if self.protein == "Kinase":
        start = 120
        end = 230
        x_tick_range = np.arange(start, end, 20)
        pylab.xlim(start, end)

    all_freqs = dict()
    for label, seqs_file in self.vis_seqs.items():
        if label in self.skip:
            print("skipping ", label)
            continue
        if not self.which_models[label]:
            # model is switched off in which_models
            continue

        # From here on `label` is the display label, which is also the key
        # into color_set / z_order.
        label = self.label_dict[label]
        print("computing hams for:\t", label)
        seqs = loadSeqs(self.msa_dir + "/" + seqs_file,
                        names=self.ALPHA)[0][0:self.keep_hams]
        h = histsim(seqs).astype(float)
        h = h / np.sum(h)
        all_freqs[label] = h
        # NOTE(review): presumably histsim is indexed by similarity, so the
        # reversed vector is indexed by distance -- confirm.
        rev_h = h[::-1]

        shared_style = dict(
            linewidth=self.line_width,
            alpha=self.line_alpha,
            color=self.color_set[label],
            zorder=self.z_order[label],
        )
        if label == "Target":
            if "nat" in self.synth_nat:
                target_label = "Nat-Target"
            else:
                target_label = "Synth-Target"
            ax.plot(rev_h, linestyle="dashed", dashes=(1, 1),
                    label=target_label, **shared_style)
        else:
            ax.plot(rev_h, linestyle="solid", label=label, **shared_style)

    print("all_freqs")
    print(all_freqs.keys())

    # Locate the target entry; as before, the last matching label wins.
    target_key = None
    for data_label in all_freqs:
        if 'arget' in data_label:
            target_key = data_label

    tvds = dict()
    if target_key is None:
        print("no target found in all_freqs, skipping TVDs")
    else:
        # Re-file the target under the canonical "Target" key.
        target_freq = all_freqs.pop(target_key)
        all_freqs["Target"] = target_freq
        for data_label, f in all_freqs.items():
            if data_label != 'Target':
                tvds[data_label] = round(np.sum(np.abs(target_freq - f)) / 2, 4)
    print(tvds)

    y_tick_range = np.arange(0.0, 0.08, 0.02)
    pylab.ylabel(ylabel, fontsize=self.label_size)
    pylab.xlabel(xlabel, fontsize=self.label_size)
    if x_tick_range is not None:
        pylab.xticks(x_tick_range, rotation=45)
    else:
        # Keep the automatic tick locations, only rotate their labels.
        pylab.xticks(rotation=45)
    pylab.yticks(y_tick_range)
    pylab.tick_params(direction='in', axis='both', which='major',
                      labelsize=self.tick_size, length=self.tick_length,
                      width=self.tick_width)

    file_name = ("ham_" + self.name + "_" + self.synth_nat + "_"
                 + self.which_size + ".pdf")
    pylab.tight_layout()
    pylab.legend(fontsize=self.tick_size - 3, loc="upper left", frameon=False)
    save_name = self.output_dir + "/" + file_name
    print(save_name)
    pylab.savefig(save_name, dpi=self.dpi, format='pdf')
    pylab.close()