def leven_search_pam(pam_obj_file, target, threshold, out_file):
    target = target.upper()
    targ_len = len(target)
    d = 0
    hit = []
    with open(pam_obj_file, 'rb') as pam_handle:
        with open(out_file, 'wb') as csvfile:
            pam_obj = pickle.load(pam_handle)
            print '####################'
            print 'vmb memory:' + str(vmb.memory())
            print 'vmb resident: ' + str(vmb.resident())
            print '####################'
            ct = 0  # progress counter over pam_obj entries
            for el in pam_obj:
                d = levenshtein(el[0].upper(), target)
                if d <= threshold:
                    hit.append(el + [d])
                ct += 1
                if (ct % 1000) == 0:
                    print '####################'
                    print 'vmb memory:' + str(vmb.memory())
                    print 'vmb resident: ' + str(vmb.resident())
                    print '####################'
                    print str(float(ct) / float(len(pam_obj)))
            # write all hits once the scan is complete
            mywriter = csv.writer(csvfile, delimiter=',')
            for el in hit:
                mywriter.writerow(el)
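# The searches above and below call a levenshtein() helper that is not defined
# in this file. For reference, a minimal sketch of a standard dynamic-programming
# edit distance is given here; the project's own implementation may differ, so
# the name is kept distinct to avoid shadowing it.
def levenshtein_sketch(a, b):
    """Illustrative edit distance (insertions, deletions, substitutions)."""
    if len(a) < len(b):
        a, b = b, a
    prev = range(len(b) + 1)
    for i, ca in enumerate(a):
        cur = [i + 1]
        for j, cb in enumerate(b):
            cur.append(min(prev[j + 1] + 1,        # deletion
                           cur[j] + 1,             # insertion
                           prev[j] + (ca != cb)))  # substitution
        prev = cur
    return prev[len(b)]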
def nt_query_gg_all(gg_obj_file, out_file):
    with open(gg_obj_file, 'rb') as gg_handle:
        gg_obj = pickle.load(gg_handle)
    rf_split = gg_obj_file.split('_')
    chrid = rf_split[len(rf_split) - 2]
    offset = int(rf_split[len(rf_split) - 1].replace('.gg', ''))
    print offset
    with open(out_file, 'wb') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        print '####################'
        print 'vmb memory:' + str(vmb.memory())
        print 'vmb resident: ' + str(vmb.resident())
        print '####################'
        #skip overlap if offset > 0
        if offset == 0:
            ref_lb = 0
        else:
            ref_lb = 1000
        ref_ub = len(gg_obj.ref)
        out = gg_obj.get_ntvar_counts(ref_lb, ref_ub)
        mywriter.writerow([chrid, offset] + out['A'] + out['C'] + out['G'] + out['T'])
def nt_query_gg_all(gg_obj_file, gff3_file, out_file):
    # NOTE: this redefines nt_query_gg_all above; this variant restricts the
    # nucleotide-variant counts to exon records from the supplied GFF3 file
    with open(gg_obj_file, 'rb') as gg_handle:
        gg_obj = pickle.load(gg_handle)
    with open(out_file, 'wb') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        print '####################'
        print 'vmb memory:' + str(vmb.memory())
        print 'vmb resident: ' + str(vmb.resident())
        print '####################'
        gff3_file_list = gff3_file.replace('.gff3', '').split('_')
        offset = int(gff3_file_list[len(gff3_file_list) - 1])
        with open(gff3_file, 'rb') as fin:
            for line in fin:
                if line[0] == '#':
                    continue
                line_els = line.strip().split()
                if line_els[2] == 'exon':
                    ref_lb = int(line_els[3]) - offset - 1
                    ref_ub = int(line_els[4]) - offset - 1
                    if ref_lb < 0:
                        ref_lb = 0
                    if ref_ub > len(gg_obj.ref):
                        ref_ub = len(gg_obj.ref)
                    out = gg_obj.get_ntvar_counts(ref_lb, ref_ub)
                    mywriter.writerow(line_els + out['A'] + out['T'] + out['C'] +
                                      out['G'] + out['Cnot'] + out['Gnot'])
def __init__(self, geno_genome):
    # constructor of the Geno_slice class (the class statement is defined
    # elsewhere in this module); copies the reference and variant track from
    # an existing geno_genome object
    super(Geno_slice, self).__init__()
    self.ref = geno_genome.ref
    self.var_track = geno_genome.var_track
    print '####################'
    print 'vmb memory:' + str(vmb.memory())
    print 'vmb resident: ' + str(vmb.resident())
    print '####################'
    self.ind = 0
    self.len = 0
    self.name_dict = {}
    self.geno_dict = {}
def split_chrom_ref_vcf_gff3(ref_filename, vcf_filename, gff3_filename):
    # only the reference FASTA is split per chromosome here; vcf_filename and
    # gff3_filename are accepted but not used in this function
    with open(ref_filename, 'r') as fin:
        for record in SeqIO.parse(fin, "fasta"):
            print record.id
            print '####################'
            print 'vmb memory:' + str(vmb.memory())
            print 'vmb resident: ' + str(vmb.resident())
            print '####################'
            with open(ref_filename.replace('.fa', '_' + str(record.id) + ".fa"), 'wb') as fout:
                SeqIO.write(record, fout, "fasta")
def leven_search_pam(pam_obj_file, target_file, threshold, out_file):
    # NOTE: this redefines leven_search_pam above; this variant reads targets
    # from a FASTA file and scores them with linear() instead of levenshtein()
    pof_base = os.path.basename(pam_obj_file)
    with open(pam_obj_file, 'rb') as pam_handle:
        pam_obj = pickle.load(pam_handle)
    with open(out_file, 'wb') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        with open(target_file, 'rb') as targfile:
            for record in SeqIO.parse(targfile, "fasta"):
                target = str(record.seq).upper()
                name_list = str(record.name).split('|')
                print '####################'
                print 'vmb memory:' + str(vmb.memory())
                print 'vmb resident: ' + str(vmb.resident())
                print '####################'
                ct = 0
                d = 0
                hit = []
                for el in pam_obj:
                    # d = levenshtein(el[0].upper(), target)
                    d = linear(el[0].upper(), target)
                    if d <= threshold:
                        hit.append(name_list + [target, pof_base, d] + el)
                    ct += 1
                    if (ct % 100000) == 0:
                        print '####################'
                        print 'vmb memory:' + str(vmb.memory())
                        print 'vmb resident: ' + str(vmb.resident())
                        print '####################'
                        print str(float(ct) / float(len(pam_obj)))
                for el in hit:
                    mywriter.writerow(el)
def target_query_gg_all(gg_obj_file, gff3_file, PAM_orient, PAM_list, target_len, out_file):
    PAM_len = len(PAM_list[0])
    PAM_dict = {}
    for pam in PAM_list:
        PAM_dict[str(pam)] = pam
    PAM_dict_rc = {}
    for pam in PAM_list:
        PAM_dict_rc[revcomp(str(pam))] = pam
    gff3_pct = {}
    gff3_file_list = gff3_file.replace('.gff3', '').split('_')
    offset = int(gff3_file_list[len(gff3_file_list) - 1])
    with open(gff3_file, 'rb') as fin:
        for line in fin:
            if line[0] == '#':
                continue
            row_obj = gff3.GFF3_row(line)
            #protein coding transcript (PCT) and exon or UTR (EU)
            if (row_obj.PCT and row_obj.EU):
                if row_obj.INFO['transcript_id'] not in gff3_pct:
                    gff3_pct[row_obj.INFO['transcript_id']] = gff3.PCT(row_obj)
                else:
                    gff3_pct[row_obj.INFO['transcript_id']].add_element(row_obj)
    gff3_lines = []
    for pct_id in gff3_pct:
        gff3_lines += gff3_pct[pct_id].get_coding()
    with open(gg_obj_file, 'rb') as gg_handle:
        gcr_obj = gg.Geno_CRISPR(pickle.load(gg_handle))
    with open(out_file, 'wb') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        print '####################'
        print 'vmb memory:' + str(vmb.memory())
        print 'vmb resident: ' + str(vmb.resident())
        print '####################'
        # gff3_lines entries are already split into GFF3 fields
        for line_els in gff3_lines:
            if line_els[2] == 'exon':
                ref_lb = int(line_els[3]) - offset - 1
                ref_ub = int(line_els[4]) - offset - 1
                if ref_lb < 0:
                    ref_lb = 0
                if ref_ub > gcr_obj.len:
                    ref_ub = gcr_obj.len
                [target_inds, target_vars] = gcr_obj.get_var_targets_del_pams(
                    PAM_orient, PAM_list, target_len, ref_lb, ref_ub)
                n_targ = len(target_inds)
                n_var = len(target_vars)
                for loc in target_inds:
                    af = 0
                    hetf = 0
                    max_var = 0
                    max_var_els = []
                    #change 161006 taking max only
                    if str(loc) in target_vars:
                        for el in target_vars[str(loc)]:
                            if el[1].get_af_adj() > max_var:
                                max_var = el[1].get_af_adj()
                                max_var_els = [el]
                            elif el[1].get_af_adj() == max_var:
                                max_var_els += [el]
                        #pam start is index 0 at this point
                        #for equal af, take closest to PAM
                        max_var_els.sort(key=lambda x: x[0])
                        ind = 0
                        el = max_var_els[0]
                        if PAM_orient == 'R':
                            #orient PAM at right hand side
                            ind = (target_len + PAM_len) - (el[0] + 1)
                        elif PAM_orient == 'L':
                            ind = el[0]
                        af = el[1].get_af_adj()
                        hetf = el[1].get_hetf()
                    #platinum: af < 0.0001
                    if af < 0.0001:
                        out_list = [loc[0], loc[0] + len(loc[2]), loc[1], PAM_orient]
                        if PAM_orient == 'R':
                            #orient PAM at right hand side
                            out_list += [loc[2][0:target_len]]
                            out_list += [loc[2][target_len:len(loc[2])]]
                        elif PAM_orient == 'L':
                            out_list += [loc[2][PAM_len:len(loc[2])]]
                            out_list += [loc[2][0:PAM_len]]
                        out_list += [af]
                        mywriter.writerow(line_els + out_list)
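# The PAM-handling functions in this file rely on a revcomp() helper that is
# defined elsewhere in the project. A minimal reverse-complement sketch
# (assuming an upper-case A/C/G/T/N alphabet, and kept under a distinct name so
# it does not shadow the project's helper) would look like this:
def revcomp_sketch(seq):
    """Illustrative reverse complement of a DNA string."""
    comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return ''.join(comp[base] for base in reversed(seq))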
def PAM_query_gg_all(gg_obj_file, gff3_file, PAM_list, out_file):
    PAM_len = len(PAM_list[0])
    PAM_dict = {}
    for pam in PAM_list:
        PAM_dict[str(pam)] = pam
    PAM_dict_rc = {}
    for pam in PAM_list:
        PAM_dict_rc[revcomp(str(pam))] = pam
    gff3_pct = {}
    gff3_file_list = gff3_file.replace('.gff3', '').split('_')
    offset = int(gff3_file_list[len(gff3_file_list) - 1])
    with open(gff3_file, 'rb') as fin:
        for line in fin:
            if line[0] == '#':
                continue
            row_obj = gff3.GFF3_row(line)
            #protein coding transcript (PCT) and exon or UTR (EU)
            if (row_obj.PCT and row_obj.EU):
                if row_obj.INFO['transcript_id'] not in gff3_pct:
                    gff3_pct[row_obj.INFO['transcript_id']] = gff3.PCT(row_obj)
                else:
                    gff3_pct[row_obj.INFO['transcript_id']].add_element(row_obj)
    gff3_lines = []
    for pct_id in gff3_pct:
        gff3_lines += gff3_pct[pct_id].get_coding()
    with open(gg_obj_file, 'rb') as gg_handle:
        gcr_obj = gg.Geno_CRISPR(pickle.load(gg_handle))
    with open(out_file, 'wb') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        print '####################'
        print 'vmb memory:' + str(vmb.memory())
        print 'vmb resident: ' + str(vmb.resident())
        print '####################'
        for line_els in gff3_lines:
            if line_els[2] == 'exon':
                ref_lb = int(line_els[3]) - offset - 1
                ref_ub = int(line_els[4]) - offset - 1
                if ref_lb < 0:
                    ref_lb = 0
                if ref_ub > gcr_obj.len:
                    ref_ub = gcr_obj.len
                # PAMs destroyed by variants ('del')
                af_bins = [[0.00001, 0], [0.0001, 0], [0.001, 0], [0.01, 0], [0.1, 0], [1, 0]]
                hetf_bins = [[0.00001, 0], [0.0001, 0], [0.001, 0], [0.01, 0], [0.1, 0], [1, 0]]
                [pam_inds, del_vars] = gcr_obj.get_del_PAMS(PAM_list, ref_lb, ref_ub)
                n_pam = len(pam_inds)
                n_del = len(del_vars)
                for loc in del_vars:
                    af = 0
                    hetf = 0
                    for var in del_vars[loc]:
                        #change 161006 taking max only
                        if var.get_af_adj() > af:
                            af = var.get_af_adj()
                            hetf = var.get_hetf()
                    #change 161006 taking max only
                    varct = 1
                    # bin by max adjusted allele frequency; only the top bin is inclusive
                    for b in range(len(af_bins)):
                        last = (b == len(af_bins) - 1)
                        if af < af_bins[b][0] or (last and af <= af_bins[b][0]):
                            af_bins[b][1] += 1
                            hetf_bins[b][1] += hetf / varct
                            break
                mywriter.writerow(line_els + [el[1] for el in af_bins] +
                                  [el[1] for el in hetf_bins] + [n_del, n_pam, 'del'])
                # PAMs created by variants ('add')
                af_bins = [[0.00001, 0], [0.0001, 0], [0.001, 0], [0.01, 0], [0.1, 0], [1, 0]]
                hetf_bins = [[0.00001, 0], [0.0001, 0], [0.001, 0], [0.01, 0], [0.1, 0], [1, 0]]
                [no_pam_inds, add_vars] = gcr_obj.get_add_PAMS(PAM_list, ref_lb, ref_ub)
                n_no_pam = len(no_pam_inds)
                n_add = len(add_vars)
                for loc in add_vars:
                    af = 0
                    hetf = 0
                    for var in add_vars[loc]:
                        #change 161006 taking max only
                        if var.get_af_adj() > af:
                            af = var.get_af_adj()
                            hetf = var.get_hetf()
                    #change 161006 taking max only
                    varct = 1
                    # bin by max adjusted allele frequency; only the top bin is inclusive
                    for b in range(len(af_bins)):
                        last = (b == len(af_bins) - 1)
                        if af < af_bins[b][0] or (last and af <= af_bins[b][0]):
                            af_bins[b][1] += 1
                            hetf_bins[b][1] += hetf / varct
                            break
                mywriter.writerow(line_els + [el[1] for el in af_bins] +
                                  [el[1] for el in hetf_bins] + [n_add, n_no_pam, 'add'])
# standard-library and third-party imports used by the functions in this file
import csv
import os
import pickle

import numpy
from Bio import SeqIO

import vmb
# NOTE: the project-local helpers used here (gg, gff3, revcomp, levenshtein,
# linear) are assumed to be imported or defined elsewhere in this module
print '####################'
print 'vmb memory:' + str(vmb.memory())
print 'vmb resident: ' + str(vmb.resident())
print '####################'
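# vmb is a small project-local memory-reporting module; only vmb.memory() and
# vmb.resident() are used in this file. For reference, a comparable Linux-only
# helper (an illustrative sketch, not vmb's actual source) could read
# /proc/self/status:
def _proc_status_field_sketch(key):
    """Return a '<value> kB' string for one field of /proc/self/status."""
    with open('/proc/self/status') as f:
        for line in f:
            if line.startswith(key):
                return ' '.join(line.split()[1:3])
    return 'unknown'


def memory_sketch():
    # total virtual memory size (VmSize)
    return _proc_status_field_sketch('VmSize:')


def resident_sketch():
    # resident set size (VmRSS)
    return _proc_status_field_sketch('VmRSS:')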
def PAM_compile_gg(gg_obj_file, targ_len, PAM_orient, PAM_list, out_file):
    PAM_len = len(PAM_list[0])
    PAM_dict = {}
    for pam in PAM_list:
        PAM_dict[str(pam)] = pam
    PAM_dict_rc = {}
    for pam in PAM_list:
        PAM_dict_rc[revcomp(str(pam))] = pam
    with open(gg_obj_file, 'rb') as gg_handle:
        with open(out_file, 'wb') as fout:
            ggs_obj = gg.Geno_slice(pickle.load(gg_handle))
            print '####################'
            print 'vmb memory:' + str(vmb.memory())
            print 'vmb resident: ' + str(vmb.resident())
            print '####################'
            print ggs_obj.ref
            # print ggs_obj.var_track
            len_slice = targ_len + (2 * PAM_len)
            ggs_obj.slice(0, len_slice)
            print '####################'
            print 'vmb memory:' + str(vmb.memory())
            print 'vmb resident: ' + str(vmb.resident())
            print '####################'
            ct = 0
            ct2 = 0
            search_seqs = []
            for i in range(0, (len(ggs_obj.ref) - len_slice)):
                gg_seqs = ggs_obj.compile_geno_seqs()
                gg_seqs += [gg.Geno_seq(ggs_obj.ref[i:(i + len_slice)],
                                        ggs_obj.ind - ggs_obj.len, -1, ['0'])]
                for gg_seq in gg_seqs:
                    if PAM_orient == 'R':
                        if gg_seq.seq[0:PAM_len] in PAM_dict_rc:
                            # bottom strand (BS): reverse-complemented PAM at the left edge
                            search_seqs.append([revcomp(gg_seq.seq[PAM_len:(PAM_len + targ_len)]),
                                                gg_seq.seq[0:PAM_len], PAM_orient, 'BS',
                                                gg_seq.ind, gg_seq.allele, gg_seq.names])
                            ct2 += 1
                        if gg_seq.seq[(len_slice - PAM_len):len_slice] in PAM_dict:
                            # top strand (TS): PAM at the right edge
                            search_seqs.append([gg_seq.seq[PAM_len:(PAM_len + targ_len)],
                                                gg_seq.seq[(len_slice - PAM_len):len_slice],
                                                PAM_orient, 'TS',
                                                gg_seq.ind, gg_seq.allele, gg_seq.names])
                            ct2 += 1
                    elif PAM_orient == 'L':
                        if gg_seq.seq[0:PAM_len] in PAM_dict:
                            # top strand (TS): PAM at the left edge
                            search_seqs.append([gg_seq.seq[PAM_len:(PAM_len + targ_len)],
                                                gg_seq.seq[0:PAM_len], PAM_orient, 'TS',
                                                gg_seq.ind, gg_seq.allele, gg_seq.names])
                            ct2 += 1
                        if gg_seq.seq[(len_slice - PAM_len):len_slice] in PAM_dict_rc:
                            # bottom strand (BS): reverse-complemented PAM at the right edge
                            search_seqs.append([revcomp(gg_seq.seq[PAM_len:(PAM_len + targ_len)]),
                                                gg_seq.seq[(len_slice - PAM_len):len_slice],
                                                PAM_orient, 'BS',
                                                gg_seq.ind, gg_seq.allele, gg_seq.names])
                            ct2 += 1
                ct += 1
                ggs_obj.ref_base_shift()
                if (i % 100000) == 0:
                    print '####################'
                    print 'vmb memory:' + str(vmb.memory())
                    print 'vmb resident: ' + str(vmb.resident())
                    print '####################'
                    print str(float(i) / (float(len(ggs_obj.ref)) - float(len_slice)))
                    print ct2
            pickle.dump(search_seqs, fout)
def gene_query_gg_all(gg_obj_file, gff3_file, genes_file, PAM_orient, PAM_list, target_len, out_file):
    PAM_len = len(PAM_list[0])
    PAM_dict = {}
    for pam in PAM_list:
        PAM_dict[str(pam)] = pam
    PAM_dict_rc = {}
    for pam in PAM_list:
        PAM_dict_rc[revcomp(str(pam))] = pam
    genes = []
    with open(genes_file, 'rb') as csvgene:
        greader = csv.reader(csvgene, delimiter=',')
        genes = [row for row in greader]
    print genes
    gff3_pct = {}
    gff3_file_list = gff3_file.replace('.gff3', '').split('_')
    offset = int(gff3_file_list[len(gff3_file_list) - 1])
    with open(gff3_file, 'rb') as fin:
        for line in fin:
            if line[0] == '#':
                continue
            row_obj = gff3.GFF3_row(line)
            #protein coding transcript (PCT) and exon or UTR (EU)
            if (row_obj.PCT and row_obj.EU):
                if row_obj.INFO['transcript_id'] not in gff3_pct:
                    gff3_pct[row_obj.INFO['transcript_id']] = gff3.PCT(row_obj)
                else:
                    gff3_pct[row_obj.INFO['transcript_id']].add_element(row_obj)
    gff3_lines = []
    for pct_id in gff3_pct:
        gff3_lines += gff3_pct[pct_id].get_coding()
    with open(gg_obj_file, 'rb') as gg_handle:
        gcr_obj = gg.Geno_CRISPR(pickle.load(gg_handle))
    with open(out_file, 'wb') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        print '####################'
        print 'vmb memory:' + str(vmb.memory())
        print 'vmb resident: ' + str(vmb.resident())
        print '####################'
        for gene in genes:
            gene_id = gene[0]
            print gene_id
            for line_els in gff3_lines:
                if ((line_els[2] == 'exon') and (gene_id in line_els[1])):
                    ref_lb = int(line_els[3]) - offset - 1
                    ref_ub = int(line_els[4]) - offset - 1
                    if ref_lb < 0:
                        ref_lb = 0
                    if ref_ub > gcr_obj.len:
                        ref_ub = gcr_obj.len
                    [target_inds, target_vars] = gcr_obj.get_var_targets_del_pams(
                        PAM_orient, PAM_list, target_len, ref_lb, ref_ub)
                    n_targ = len(target_inds)
                    n_var = len(target_vars)
                    for loc in target_inds:
                        all_freq = numpy.zeros(target_len + PAM_len).astype(float)
                        het_freq = numpy.zeros(target_len + PAM_len).astype(float)
                        hom_freq = numpy.zeros(target_len + PAM_len).astype(float)
                        ind_dict = {}
                        if str(loc) in target_vars:
                            for el in target_vars[str(loc)]:
                                if PAM_orient == 'R':
                                    #orient PAM at right hand side
                                    ind = (target_len + PAM_len) - (el[0] + 1)
                                elif PAM_orient == 'L':
                                    ind = el[0]
                                #take max af allele for each position
                                #then max of all_freq is max af of target
                                if ind not in ind_dict:
                                    ind_dict[ind] = el
                                else:
                                    if el[1].get_af_adj() > ind_dict[ind][1].get_af_adj():
                                        ind_dict[ind] = el
                            for ind in ind_dict:
                                all_freq[ind] = ind_dict[ind][1].get_af_adj()
                                het_freq[ind] = ind_dict[ind][1].get_af_het_adj()
                                hom_freq[ind] = ind_dict[ind][1].get_af_hom_adj()
                        loc[0] = loc[0] + offset
                        mywriter.writerow(line_els + list(all_freq) + loc + [n_var, n_targ, 'ALL'] + gene)
                        mywriter.writerow(line_els + list(het_freq) + loc + [n_var, n_targ, 'HET'] + gene)
                        mywriter.writerow(line_els + list(hom_freq) + loc + [n_var, n_targ, 'HOM'] + gene)