def leven_search_pam(pam_obj_file, target, threshold, out_file):
    """Write the pickled entries within `threshold` of `target` to a CSV file.

    The object pickled in `pam_obj_file` is iterated; the first item of each
    entry is upper-cased and scored against the upper-cased `target` with
    levenshtein().  Entries scoring <= `threshold` are collected and, once
    the scan is finished, written to `out_file` as comma-delimited rows with
    the score appended as the final column.  The values returned by
    vmb.memory() and vmb.resident() are printed after the pickle is loaded
    and again after every 1000 entries, followed there by the fraction of
    entries processed so far.

    NOTE(review): pickle.load is applied to `pam_obj_file` as given -- only
    pass files from a trusted source.
    """

    def report_memory():
        # Print the vmb figures between two marker lines.
        print('####################')
        print('vmb memory:' + str(vmb.memory()))
        print('vmb resident: ' + str(vmb.resident()))
        print('####################')

    query = str.upper(target)
    matches = []
    # The output file is opened (and truncated) before the pickle is read.
    with open(pam_obj_file, 'rb') as pam_handle, \
            open(out_file, 'wb') as csvfile:
        pam_obj = pickle.load(pam_handle)
        report_memory()

        for seen, entry in enumerate(pam_obj, 1):
            score = levenshtein(entry[0].upper(), query)
            if score <= threshold:
                matches.append(entry + [score])

            if seen % 1000 == 0:
                report_memory()
                print(str(float(seen) / float(len(pam_obj))))

        csv.writer(csvfile, delimiter=',').writerows(matches)
def nt_query_gg_all(gg_obj_file, out_file):
    """Write one CSV row of nucleotide variant counts for a pickled object.

    The chromosome id and the offset are the last two '_'-separated pieces of
    `gg_obj_file`; every '.gg' is removed from the last piece before it is
    converted to int.  gg_obj.get_ntvar_counts() is called from position 0
    (offset == 0) or 1000 (any other offset) up to len(gg_obj.ref), and the
    row written to `out_file` is [chrid, offset] followed by the result's
    'A', 'C', 'G' and 'T' lists.  The offset and the vmb memory figures are
    printed along the way.

    NOTE(review): pickle.load is applied to `gg_obj_file` as given -- only
    pass files from a trusted source.
    """
    with open(gg_obj_file, 'rb') as gg_handle:
        gg_obj = pickle.load(gg_handle)

        name_parts = gg_obj_file.split('_')
        # Kept as len() - 2 (not [-2]) so a name without '_' behaves as before.
        chrid = name_parts[len(name_parts) - 2]
        offset = int(name_parts[-1].replace('.gg', ''))
        print(offset)

        with open(out_file, 'wb') as csvfile:
            row_writer = csv.writer(csvfile, delimiter=',')

            print('####################')
            print('vmb memory:' + str(vmb.memory()))
            print('vmb resident: ' + str(vmb.resident()))
            print('####################')

            # skip overlap if offset > 0
            ref_lb = 0 if offset == 0 else 1000
            ref_ub = len(gg_obj.ref)

            counts = gg_obj.get_ntvar_counts(ref_lb, ref_ub)

            row = [chrid, offset]
            for base in ('A', 'C', 'G', 'T'):
                row = row + counts[base]
            row_writer.writerow(row)
# Example no. 3
# 0
def nt_query_gg_all(gg_obj_file,gff3_file,out_file):
	"""Write nucleotide variant counts for every exon row of a GFF3 file.

	Loads the pickled object at `gg_obj_file` and reads the offset from the
	last '_'-separated piece of `gff3_file` (after removing '.gff3').  Each
	line of `gff3_file` is split on whitespace; for rows whose third field
	is 'exon', fields four and five are shifted by the offset and by one,
	the start is floored at 0, the end is capped at len(gg_obj.ref), and
	gg_obj.get_ntvar_counts() is called on that range.  The row written to
	`out_file` is the GFF3 fields followed by the result's 'A', 'T', 'C',
	'G', 'Cnot' and 'Gnot' lists.

	Lines starting with '#' and blank (whitespace-only) lines are skipped.
	A non-blank row with fewer than five fields still raises IndexError.

	NOTE(review): pickle.load is applied to `gg_obj_file` as given -- only
	pass files from a trusted source.
	"""
	with open(gg_obj_file,'rb') as gg_handle:
		gg_obj = pickle.load(gg_handle)
		with open(out_file,'wb') as csvfile:
			mywriter = csv.writer(csvfile, delimiter=',')

			print('####################')
			print('vmb memory:' + str(vmb.memory()))
			print('vmb resident: ' + str(vmb.resident()))
			print('####################')

			gff3_file_list = gff3_file.replace('.gff3','').split('_')
			offset = int(gff3_file_list[-1])
			with open(gff3_file,'rb') as fin:
				for line in fin:
					if line[0] == '#':
						continue

					line_els = line.strip().split()
					if not line_els:
						# Blank line: there is no third field to test, so
						# skip it rather than fail on line_els[2].
						continue
					if line_els[2] != 'exon':
						continue

					ref_lb = max(int(line_els[3])-offset-1, 0)
					ref_ub = min(int(line_els[4])-offset-1, len(gg_obj.ref))

					out = gg_obj.get_ntvar_counts(ref_lb,ref_ub)

					mywriter.writerow(line_els+out['A']+out['T']+out['C']+out['G']+out['Cnot']+out['Gnot'])
	def __init__(self, geno_genome):
		"""Set up the instance around geno_genome's `ref` and `var_track`.

		Both attributes are stored by reference (no copy is made here).
		`ind` and `len` start at 0 and `name_dict` / `geno_dict` start as
		empty dicts.  The values returned by vmb.memory() and
		vmb.resident() are printed once the two references are stored.
		"""
		super(Geno_slice, self).__init__()
		self.ref, self.var_track = geno_genome.ref, geno_genome.var_track

		print('####################')
		print('vmb memory:' + str(vmb.memory()))
		print('vmb resident: ' + str(vmb.resident()))
		print('####################')

		self.ind = self.len = 0
		self.name_dict, self.geno_dict = {}, {}
def split_chrom_ref_vcf_gff3(ref_filename,vcf_filename,gff3_filename):
	"""Write every record of a FASTA file to its own FASTA file.

	Records are parsed from `ref_filename` with SeqIO.  Each one is written
	to the path obtained by replacing every '.fa' in `ref_filename` with
	'_<record id>.fa'.  The record id and the vmb memory figures are
	printed before each record is written.

	`vcf_filename` and `gff3_filename` are accepted but not read by this
	function.
	"""
	with open(ref_filename,'r') as fin:
		for record in SeqIO.parse(fin, "fasta"):
			print(record.id)

			print('####################')
			print('vmb memory:' + str(vmb.memory()))
			print('vmb resident: ' + str(vmb.resident()))
			print('####################')

			record_suffix = '_'+str(record.id)+".fa"
			out_name = ref_filename.replace('.fa', record_suffix)
			with open(out_name,'wb') as fout:
				SeqIO.write(record, fout, "fasta")
def leven_search_pam(pam_obj_file,target_file,threshold,out_file):
	"""Score every pickled entry against each FASTA target and write the hits.

	The object pickled in `pam_obj_file` is loaded once.  For each record
	parsed from `target_file`, the upper-cased record sequence is compared
	with the upper-cased first item of every entry using linear().  Entries
	scoring <= `threshold` are written to `out_file` as comma-delimited
	rows laid out as: the record name split on '|', the target sequence,
	the base name of `pam_obj_file`, the score, then the entry's own items.
	Hits for a record are written after that record's scan completes.

	vmb memory figures are printed at the start of each record and after
	every 100000 entries, followed there by the fraction of entries done.

	NOTE(review): pickle.load is applied to `pam_obj_file` as given -- only
	pass files from a trusted source.
	"""
	def report_memory():
		# Print the vmb figures between two marker lines.
		print('####################')
		print('vmb memory:' + str(vmb.memory()))
		print('vmb resident: ' + str(vmb.resident()))
		print('####################')

	pof_base = os.path.basename(pam_obj_file)
	with open(pam_obj_file,'rb') as pam_handle:
		pam_obj = pickle.load(pam_handle)
		with open(out_file,'wb') as csvfile:
			mywriter = csv.writer(csvfile, delimiter=',')
			with open(target_file,'rb') as targfile:
				for record in SeqIO.parse(targfile, "fasta"):
					target = str(record.seq).upper()
					name_list = str(record.name).split('|')

					report_memory()

					hits = []
					for seen, entry in enumerate(pam_obj, 1):
						# Scored with linear(); the levenshtein() call that
						# used to sit here was already disabled.
						score = linear(entry[0].upper(),target)
						if score <= threshold:
							hits.append(name_list+[target,pof_base,score]+entry)

						if seen % 100000 == 0:
							report_memory()
							print(str(float(seen)/float(len(pam_obj))))

					mywriter.writerows(hits)
# Example no. 7
# 0
def target_query_gg_all(gg_obj_file, gff3_file, PAM_orient, PAM_list,
                        target_len, out_file):
    """Write CSV rows for exon targets whose largest adjusted AF is < 0.0001.

    What the code does, in order:

    * The offset is the int in the last '_'-separated piece of `gff3_file`
      (after removing '.gff3').  Lines of `gff3_file` not starting with '#'
      are wrapped in gff3.GFF3_row; rows with both `PCT` and `EU` set are
      grouped by INFO['transcript_id'] into gff3.PCT objects, and the
      get_coding() output of every group forms the rows to scan.
    * For each such row whose third field is 'exon', fields four and five
      are shifted by the offset and by one, floored at 0 / capped at
      gcr_obj.len, and passed to gcr_obj.get_var_targets_del_pams().
    * For each returned target `loc`, the variants stored under str(loc)
      are reduced to those with the largest get_af_adj(); ties are ordered
      by their first item and the first one supplies `af` (0 when the
      target has no variants).
    * When af < 0.0001 the row written is the GFF3 fields followed by
      loc[0], loc[0] + len(loc[2]), loc[1], PAM_orient, the two pieces of
      loc[2] ('R': first `target_len` items then the rest; 'L': everything
      after the first PAM_len items, then those first items) and af.

    NOTE(review): pickle.load is applied to `gg_obj_file` as given -- only
    pass files from a trusted source.
    """
    PAM_len = len(PAM_list[0])

    # Forward and reverse-complement PAM lookups; neither is read again in
    # this function.
    PAM_dict = {str(pam): pam for pam in PAM_list}
    PAM_dict_rc = {revcomp(str(pam)): pam for pam in PAM_list}

    offset = int(gff3_file.replace('.gff3', '').split('_')[-1])

    gff3_pct = {}
    with open(gff3_file, 'rb') as fin:
        for line in fin:
            if line[0] == '#':
                continue
            row_obj = gff3.GFF3_row(line)
            # protein coding transcript (PCT) and exon or UTR (EU)
            if not (row_obj.PCT and row_obj.EU):
                continue
            transcript_id = row_obj.INFO['transcript_id']
            if transcript_id in gff3_pct:
                gff3_pct[transcript_id].add_element(row_obj)
            else:
                gff3_pct[transcript_id] = gff3.PCT(row_obj)

    gff3_lines = []
    for pct_id in gff3_pct:
        gff3_lines += gff3_pct[pct_id].get_coding()

    with open(gg_obj_file, 'rb') as gg_handle:
        gcr_obj = gg.Geno_CRISPR(pickle.load(gg_handle))
        with open(out_file, 'wb') as csvfile:
            mywriter = csv.writer(csvfile, delimiter=',')

            print('####################')
            print('vmb memory:' + str(vmb.memory()))
            print('vmb resident: ' + str(vmb.resident()))
            print('####################')

            for line_els in gff3_lines:
                if line_els[2] != 'exon':
                    continue

                ref_lb = max(int(line_els[3]) - offset - 1, 0)
                ref_ub = min(int(line_els[4]) - offset - 1, gcr_obj.len)

                target_inds, target_vars = gcr_obj.get_var_targets_del_pams(
                    PAM_orient, PAM_list, target_len, ref_lb, ref_ub)

                for loc in target_inds:
                    af = 0
                    loc_key = str(loc)

                    # change 161006 taking max only
                    if loc_key in target_vars:
                        top_af = 0
                        top_vars = []
                        for cand in target_vars[loc_key]:
                            cand_af = cand[1].get_af_adj()
                            if cand_af > top_af:
                                top_af, top_vars = cand_af, [cand]
                            elif cand_af == top_af:
                                top_vars.append(cand)

                        # pam start is index 0 at this point
                        # for equal af, take closest to PAM
                        top_vars.sort(key=lambda item: item[0])
                        chosen = top_vars[0]

                        af = chosen[1].get_af_adj()
                        # NOTE(review): fetched as before but never written
                        # to the output row.
                        hetf = chosen[1].get_hetf()

                    # platinum: af < 0.0001
                    if not (af < 0.0001):
                        continue

                    seq = loc[2]
                    out_list = [loc[0], loc[0] + len(seq), loc[1], PAM_orient]
                    if PAM_orient == 'R':
                        # orient PAM at right hand side
                        out_list += [seq[0:target_len], seq[target_len:]]
                    elif PAM_orient == 'L':
                        out_list += [seq[PAM_len:], seq[0:PAM_len]]
                    out_list.append(af)

                    mywriter.writerow(line_els + out_list)
def PAM_query_gg_all(gg_obj_file, gff3_file, PAM_list, out_file):
    """Write, per exon row, allele-frequency bin counts for 'del' and 'add'.

    The rows to scan are built as follows: the offset is the int in the last
    '_'-separated piece of `gff3_file` (after removing '.gff3'); lines not
    starting with '#' are wrapped in gff3.GFF3_row; rows with both `PCT` and
    `EU` set are grouped by INFO['transcript_id'] into gff3.PCT objects; the
    get_coding() output of every group is concatenated.

    For each such row whose third field is 'exon', fields four and five are
    shifted by the offset and by one, floored at 0 / capped at gcr_obj.len.
    gcr_obj.get_del_PAMS() and then gcr_obj.get_add_PAMS() are called on
    that range and one CSV row is written for each call:

        GFF3 fields, 6 bin counts, 6 per-bin get_hetf() sums,
        len(variants), len(indices), 'del' or 'add'

    Each location contributes the largest get_af_adj() among its variants
    (0 when none exceeds 0) and that variant's get_hetf().  The bins are
    af < 0.00001, < 0.0001, < 0.001, < 0.01, < 0.1 and <= 1; a value above
    1 is not counted.

    NOTE(review): pickle.load is applied to `gg_obj_file` as given -- only
    pass files from a trusted source.
    """
    PAM_len = len(PAM_list[0])

    # Forward and reverse-complement PAM lookups; neither is read again in
    # this function.
    PAM_dict = {str(pam): pam for pam in PAM_list}
    PAM_dict_rc = {revcomp(str(pam)): pam for pam in PAM_list}

    bin_bounds = (0.00001, 0.0001, 0.001, 0.01, 0.1, 1)
    last_bin = len(bin_bounds) - 1

    def tally(var_map):
        # Return (per-bin counts, per-bin hetf sums) for one variant mapping.
        af_counts = [0] * len(bin_bounds)
        hetf_sums = [0] * len(bin_bounds)
        for loc in var_map:
            af = 0
            hetf = 0
            # change 161006 taking max only
            for var in var_map[loc]:
                if var.get_af_adj() > af:
                    af = var.get_af_adj()
                    hetf = var.get_hetf()

            varct = 1
            for idx, bound in enumerate(bin_bounds):
                # Only the last bin includes its upper bound.
                if idx == last_bin:
                    in_bin = af <= bound
                else:
                    in_bin = af < bound
                if in_bin:
                    af_counts[idx] += 1
                    hetf_sums[idx] += hetf / varct
                    break
        return af_counts, hetf_sums

    offset = int(gff3_file.replace('.gff3', '').split('_')[-1])

    gff3_pct = {}
    with open(gff3_file, 'rb') as fin:
        for line in fin:
            if line[0] == '#':
                continue
            row_obj = gff3.GFF3_row(line)
            # protein coding transcript (PCT) and exon or UTR (EU)
            if not (row_obj.PCT and row_obj.EU):
                continue
            transcript_id = row_obj.INFO['transcript_id']
            if transcript_id in gff3_pct:
                gff3_pct[transcript_id].add_element(row_obj)
            else:
                gff3_pct[transcript_id] = gff3.PCT(row_obj)

    gff3_lines = []
    for pct_id in gff3_pct:
        gff3_lines += gff3_pct[pct_id].get_coding()

    with open(gg_obj_file, 'rb') as gg_handle:
        gcr_obj = gg.Geno_CRISPR(pickle.load(gg_handle))
        with open(out_file, 'wb') as csvfile:
            mywriter = csv.writer(csvfile, delimiter=',')

            print('####################')
            print('vmb memory:' + str(vmb.memory()))
            print('vmb resident: ' + str(vmb.resident()))
            print('####################')

            # Queried in this order for every exon row: 'del' first, 'add'
            # second.
            queries = (('del', gcr_obj.get_del_PAMS),
                       ('add', gcr_obj.get_add_PAMS))

            for line_els in gff3_lines:
                if line_els[2] != 'exon':
                    continue

                ref_lb = max(int(line_els[3]) - offset - 1, 0)
                ref_ub = min(int(line_els[4]) - offset - 1, gcr_obj.len)

                for tag, query in queries:
                    site_inds, site_vars = query(PAM_list, ref_lb, ref_ub)
                    af_counts, hetf_sums = tally(site_vars)
                    mywriter.writerow(
                        line_els + af_counts + hetf_sums +
                        [len(site_vars), len(site_inds), tag])
import vmb

# Print the values returned by vmb.memory() and vmb.resident() between two
# marker lines when this module is run or imported.
print('####################')
print('vmb memory:' + str(vmb.memory()))
print('vmb resident: ' + str(vmb.resident()))
print('####################')
# Example no. 10
# 0
def PAM_compile_gg(gg_obj_file, targ_len, PAM_orient, PAM_list, out_file):
    """Pickle every PAM-flanked sequence found while sliding over a slice.

    The object pickled in `gg_obj_file` is wrapped in gg.Geno_slice and
    sliced to a window of targ_len + 2 * PAM_len items.  For each window
    position, the sequences from compile_geno_seqs() plus one gg.Geno_seq
    built from the reference window are examined, then ref_base_shift()
    advances the slice.

    A sequence is recorded when its first PAM_len items or its last PAM_len
    items match a PAM:

    * PAM_orient 'R': leading items against the reverse-complemented PAMs
      (tagged 'BS'), trailing items against the PAMs as given ('TS').
    * PAM_orient 'L': leading items against the PAMs as given ('TS'),
      trailing items against the reverse-complemented PAMs ('BS').

    Each record is [middle targ_len items (reverse-complemented for 'BS'),
    matched PAM items, PAM_orient, tag, seq.ind, seq.allele, seq.names].
    Any other PAM_orient records nothing.  The list of records is pickled to
    `out_file`; the reference, vmb memory figures, progress fractions and
    the record count are printed along the way.

    NOTE(review): pickle.load is applied to `gg_obj_file` as given -- only
    pass files from a trusted source.
    """

    def report_memory():
        # Print the vmb figures between two marker lines.
        print('####################')
        print('vmb memory:' + str(vmb.memory()))
        print('vmb resident: ' + str(vmb.resident()))
        print('####################')

    PAM_len = len(PAM_list[0])
    PAM_dict = {str(pam): pam for pam in PAM_list}
    PAM_dict_rc = {revcomp(str(pam)): pam for pam in PAM_list}

    # The output file is opened (and truncated) before the pickle is read.
    with open(gg_obj_file, 'rb') as gg_handle, open(out_file, 'wb') as fout:
        ggs_obj = gg.Geno_slice(pickle.load(gg_handle))
        report_memory()

        print(ggs_obj.ref)

        len_slice = targ_len + (2 * PAM_len)
        ggs_obj.slice(0, len_slice)
        report_memory()

        # (span, lookup, tag) checks applied to every sequence, leading
        # span first so records keep their original order.
        head = (0, PAM_len)
        tail = (len_slice - PAM_len, len_slice)
        if PAM_orient == 'R':
            pam_checks = [(head, PAM_dict_rc, 'BS'), (tail, PAM_dict, 'TS')]
        elif PAM_orient == 'L':
            pam_checks = [(head, PAM_dict, 'TS'), (tail, PAM_dict_rc, 'BS')]
        else:
            pam_checks = []

        ct2 = 0
        search_seqs = []
        for i in range(len(ggs_obj.ref) - len_slice):
            gg_seqs = ggs_obj.compile_geno_seqs()
            gg_seqs += [
                gg.Geno_seq(ggs_obj.ref[i:(i + len_slice)],
                            ggs_obj.ind - ggs_obj.len, -1, ['0'])
            ]

            for gg_seq in gg_seqs:
                for (start, stop), lookup, strand in pam_checks:
                    pam_seq = gg_seq.seq[start:stop]
                    if pam_seq not in lookup:
                        continue

                    target_seq = gg_seq.seq[PAM_len:(PAM_len + targ_len)]
                    if strand == 'BS':
                        target_seq = revcomp(target_seq)

                    search_seqs.append([
                        target_seq, pam_seq, PAM_orient, strand,
                        gg_seq.ind, gg_seq.allele, gg_seq.names
                    ])
                    ct2 += 1

            ggs_obj.ref_base_shift()

            if i % 100000 == 0:
                report_memory()
                print(str(
                    float(i) /
                    (float(len(ggs_obj.ref)) - float(len_slice))))

        print(ct2)

        pickle.dump(search_seqs, fout)
def gene_query_gg_all(gg_obj_file, gff3_file, genes_file, PAM_orient, PAM_list,
                      target_len, out_file):
    """Write per-position allele frequencies for targets in the listed genes.

    `genes_file` is read as comma-delimited rows; the first field of each
    row is the gene id.  The rows to scan come from `gff3_file`: the offset
    is the int in the last '_'-separated piece of the file name (after
    removing '.gff3'); lines not starting with '#' are wrapped in
    gff3.GFF3_row; rows with both `PCT` and `EU` set are grouped by
    INFO['transcript_id'] into gff3.PCT objects; the get_coding() output of
    every group is concatenated.

    For every gene, each scanned row whose third field is 'exon' and whose
    second field contains the gene id is queried with
    gcr_obj.get_var_targets_del_pams() on fields four and five shifted by
    the offset and by one (start floored at 0, end capped at gcr_obj.len).
    For every returned target `loc`, three arrays of length
    target_len + PAM_len are filled from the variants stored under
    str(loc): per position, the variant with the largest get_af_adj()
    supplies get_af_adj(), get_af_het_adj() and get_af_hom_adj().  With
    PAM_orient 'R' a variant's position is
    (target_len + PAM_len) - (el[0] + 1); with 'L' it is el[0].

    Three rows are written per target, tagged 'ALL', 'HET' and 'HOM':
    GFF3 fields, the array, `loc` (with the offset added to loc[0] in
    place), len(target_vars), len(target_inds), the tag, and the gene row.

    NOTE(review): pickle.load is applied to `gg_obj_file` as given -- only
    pass files from a trusted source.
    """
    PAM_len = len(PAM_list[0])

    # Forward and reverse-complement PAM lookups; neither is read again in
    # this function.
    PAM_dict = {str(pam): pam for pam in PAM_list}
    PAM_dict_rc = {revcomp(str(pam)): pam for pam in PAM_list}

    with open(genes_file, 'rb') as csvgene:
        genes = list(csv.reader(csvgene, delimiter=','))

    print(genes)

    offset = int(gff3_file.replace('.gff3', '').split('_')[-1])

    gff3_pct = {}
    with open(gff3_file, 'rb') as fin:
        for line in fin:
            if line[0] == '#':
                continue
            row_obj = gff3.GFF3_row(line)
            # protein coding transcript (PCT) and exon or UTR (EU)
            if not (row_obj.PCT and row_obj.EU):
                continue
            transcript_id = row_obj.INFO['transcript_id']
            if transcript_id in gff3_pct:
                gff3_pct[transcript_id].add_element(row_obj)
            else:
                gff3_pct[transcript_id] = gff3.PCT(row_obj)

    gff3_lines = []
    for pct_id in gff3_pct:
        gff3_lines += gff3_pct[pct_id].get_coding()

    with open(gg_obj_file, 'rb') as gg_handle:
        gcr_obj = gg.Geno_CRISPR(pickle.load(gg_handle))
        with open(out_file, 'wb') as csvfile:
            mywriter = csv.writer(csvfile, delimiter=',')

            print('####################')
            print('vmb memory:' + str(vmb.memory()))
            print('vmb resident: ' + str(vmb.resident()))
            print('####################')

            for gene in genes:
                gene_id = gene[0]

                print(gene_id)

                for line_els in gff3_lines:
                    if line_els[2] != 'exon' or gene_id not in line_els[1]:
                        continue

                    ref_lb = max(int(line_els[3]) - offset - 1, 0)
                    ref_ub = min(int(line_els[4]) - offset - 1, gcr_obj.len)

                    target_inds, target_vars = (
                        gcr_obj.get_var_targets_del_pams(
                            PAM_orient, PAM_list, target_len, ref_lb, ref_ub))
                    n_targ = len(target_inds)
                    n_var = len(target_vars)

                    for loc in target_inds:
                        width = target_len + PAM_len
                        all_freq = numpy.zeros(width, dtype=float)
                        het_freq = numpy.zeros(width, dtype=float)
                        hom_freq = numpy.zeros(width, dtype=float)

                        ind_dict = {}
                        loc_key = str(loc)
                        if loc_key in target_vars:
                            for el in target_vars[loc_key]:
                                if PAM_orient == 'R':
                                    # orient PAM at right hand side
                                    ind = width - (el[0] + 1)
                                elif PAM_orient == 'L':
                                    ind = el[0]

                                # take max af allele for each position
                                # then max of all_freq is max af of target
                                if (ind not in ind_dict
                                        or el[1].get_af_adj() >
                                        ind_dict[ind][1].get_af_adj()):
                                    ind_dict[ind] = el

                            for pos in ind_dict:
                                best_var = ind_dict[pos][1]
                                all_freq[pos] = best_var.get_af_adj()
                                het_freq[pos] = best_var.get_af_het_adj()
                                hom_freq[pos] = best_var.get_af_hom_adj()

                        # The offset is added to the target's first item in
                        # place, before the rows are written.
                        loc[0] = loc[0] + offset
                        for tag, freqs in (('ALL', all_freq),
                                           ('HET', het_freq),
                                           ('HOM', hom_freq)):
                            mywriter.writerow(line_els + list(freqs) + loc +
                                              [n_var, n_targ, tag] + gene)