def mainERRBSalign(argv):
    """Command-line driver of ERRBSalign.

    ``argv`` is laid out like ``sys.argv`` (program name first). After the
    options are parsed, the steps run in this order: argument check,
    adapter filtering, reference-genome check, alignment, sorting of the
    alignment output.

    The function never returns normally; it always ends in ``sys.exit``:
    status 2 after the usage message (no argument given or an option that
    ``getopt`` rejects), status 0 after ``--version`` or a completed run,
    and the default status after ``-h``.
    """
    __version__ = "0.0.1"  # version of ERRBSalign

    # Called without any argument: print the usage message and leave.
    if len(argv) == 1:
        usageERRBSalign()
        sys.exit(2)

    # Every option the program understands.
    short_flags = "h1:2:g:o:"
    long_flags = ["paired", "single=", "genome_ref=", "outputdir=", "version"]
    try:
        parsed, _leftover = getopt.getopt(argv[1:], short_flags, long_flags)
    except getopt.GetoptError:
        usageERRBSalign()
        sys.exit(2)

    # Default values of all parameters.
    inputfileR1 = str()  # R1 PE fastq file (or the single-end fastq file)
    inputfileR2 = str()  # R2 PE fastq file
    genomeref = str()    # Reference genome directory
    outputdir = str()    # Output directory
    paired = ''
    single = ''

    for flag, value in parsed:
        if flag == '-h':
            # usage message
            usageERRBSalign()
            sys.exit()
        elif flag == '--version':
            # software version
            print("ERRBSalign." + __version__)
            sys.exit(0)
        elif flag == '--paired':
            # paired-end reads
            paired = 'True'
        elif flag in ('-1', '--single'):
            # '-1' is the R1 fastq file; '--single' additionally switches
            # to single-end mode and stores its file in the R1 slot.
            if flag == '--single':
                single = 'True'
            checkFile(value)
            inputfileR1 = value
        elif flag == '-2':
            # R2 fastq file
            checkFile(value)
            inputfileR2 = value
        elif flag in ("-g", "--genome_ref"):
            genomeref = value
        elif flag in ("-o", "--outputdir"):
            outputdir = value

    # Check that all necessary arguments are given.
    checkargsAlign(paired, inputfileR1, inputfileR2, genomeref, outputdir)

    # Adapter filtering.
    paired, single, R1, R2, outputdir = filtering(
        paired, single, inputfileR1, inputfileR2, outputdir)

    # Check that the genome path exists and whether it is already indexed.
    checkGenome(genomeref)

    # Alignment against the reference genome.
    align(paired, single, R1, R2, outputdir, genomeref)

    # Sort the BAM output and create a SAM file.
    sortAlignFile(inputfileR1, outputdir, paired, single)

    sys.exit(0)
def mainfunc(f, special=0):
    """Select face ``f`` and block until the matching condition check passes.

    Resets the global ``node`` to 0 and stores ``f`` in the global ``face``.
    For ``special`` 0 or 2 the function busy-waits until ``conditioncheck()``
    returns something other than 0; for ``special == 2`` it then calls
    ``align(face)`` and sleeps for two seconds. Any other ``special`` value
    busy-waits on ``specialconditioncheck()`` instead.
    """
    global node, face
    node = 0
    face = f

    if special == 0 or special == 2:
        # Modes 0 and 2 share the same wait loop.
        while conditioncheck() == 0:
            pass
        if special == 2:
            align(face)
            #throw()
            time.sleep(2)
    else:
        while specialconditioncheck() == 0:
            pass
def main(argv):
    """Parse the command line, align queries to references and save the result.

    Args:
        argv: option list without the program name (``sys.argv[1:]`` style).
            ``-r/--ref_file`` and ``-q/--query_file`` are required;
            ``-o/--output_file``, ``-a/--AG_score`` (default -0.5) and
            ``-c/--CT_score`` (default -0.75) are optional.

    The reference file is read with ``parse_fasta``, the query file with
    ``parse_query_file``; the object returned by ``align`` is written with
    ``to_csv(sep='\\t', index=False)``.

    Exits with status 2 (after printing the usage) when an option is
    unknown or a required file option is missing; ``-h`` prints the usage
    and exits with the default status.
    """
    usage = ('python src.py -r <reference_fasta_file> -q <query_fasta_file> '
             '[-o <output_csv_file> -a <score for AG mismatch> '
             '-c <score for CT mismatch>]')

    referencefile = ''
    queryfile = ''
    outputfile = ''
    AG_score = -0.5
    CT_score = -0.75

    try:
        opts, _args = getopt.getopt(argv, "hr:q:o:a:c:", [
            "ref_file=", "query_file=", "output_file=", "AG_score=",
            "CT_score="
        ])
    except getopt.GetoptError:
        print('Error! Correct usage:\n' + usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-r", "--ref_file"):
            referencefile = arg
        elif opt in ("-q", "--query_file"):
            queryfile = arg
        elif opt in ("-o", "--output_file"):
            outputfile = arg
        elif opt in ("-a", "--AG_score"):
            AG_score = float(arg)
        elif opt in ("-c", "--CT_score"):
            CT_score = float(arg)

    # Both input files are mandatory; without them the parsers below would
    # be handed an empty path.
    if not referencefile or not queryfile:
        print('Error! Correct usage:\n' + usage)
        sys.exit(2)

    print('Reference file is ', referencefile)
    print('Query file is ', queryfile)
    print('AG score: ', AG_score)
    print('CT score: ', CT_score)

    all_ref_seqs = parse_fasta(referencefile)
    all_seqs = parse_query_file(queryfile)
    res_df = align(all_ref_seqs, all_seqs, AG=AG_score, CT=CT_score)

    if len(outputfile) == 0:
        # Default name: <query up to the first "."> + "_" + <second "/"
        # component of the reference path> + "_results.csv". A reference
        # path without any "/" used to raise IndexError here; it now falls
        # back to the path itself.
        ref_parts = referencefile.split("/")
        ref_label = ref_parts[1] if len(ref_parts) > 1 else ref_parts[0]
        output_file = (queryfile.split(".")[0] + "_" + ref_label +
                       "_results.csv")
    else:
        output_file = outputfile

    print('Output file is ', output_file)
    res_df.to_csv(output_file, sep='\t', index=False)
def realign_filter(rec, inslib):
    """Realign the consensus sequences of a record to its insertion reference.

    Args:
        rec: mapping with the keys 'Superfamily', 'Subfamily' and the four
            consensus columns ('Genomic_Consensus_5p', 'Genomic_Consensus_3p',
            'Insert_Consensus_5p', 'Insert_Consensus_3p').
        inslib: mapping from '<Superfamily>:<Subfamily>' to the reference
            sequence handed to ``align``.

    Returns:
        list: one ``[column_name] + alignment`` entry for every consensus
        column that is not 'NA' and for which ``align`` returned a truthy
        result. The list is empty when nothing aligned or when the family
        is not present in ``inslib``. The latter case used to return
        ``False``, which made callers that take ``len()`` of the result
        fail; the value is still falsy.
    """
    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return []

    seq_headers = ['Genomic_Consensus_5p', 'Genomic_Consensus_3p',
                   'Insert_Consensus_5p', 'Insert_Consensus_3p']

    matches = []
    for seqtype in seq_headers:
        if rec[seqtype] == 'NA':
            continue
        alignment = align(rec[seqtype], inslib[seqn], rec['Subfamily'])
        if alignment:
            matches.append([seqtype] + alignment)

    return matches
def _estimate_names(path):
    """Return the ``<name>`` part of every ``est_<name>.txt`` file in *path*."""
    return [
        x[4:-4] for x in os.listdir(path)
        if x[:4] == 'est_' and x[-4:] == '.txt'
    ]


def compare_traj(namelist, dirlist, gt_dir, save_dir=None, plot=False):
    """Plot several estimated trajectories against the ground truth.

    Args:
        namelist: legend label for each entry of ``dirlist``.
        dirlist: directories holding ``est_<dataset>.txt`` trajectory files.
            At least two are required and the length must match ``namelist``;
            otherwise the function returns without doing anything.
        gt_dir: directory holding the ``<dataset>.txt`` ground-truth files.
        save_dir: when given, created if missing and one ``<dataset>.png``
            is saved per dataset.
        plot: when true, ``plt.show()`` is called for every dataset.

    A directory that does not exist or contains no ``est_*.txt`` file is
    reported as invalid and skipped (the directory check now runs before
    the listing, which used to raise on a missing path). The dataset names
    are taken from the first valid directory. One figure is drawn per
    dataset: the ground truth in the first colour, then every trajectory
    that has at least two timestamp matches and a maximum translational
    error below 100 after ``align``.
    """
    if len(dirlist) < 2 or len(dirlist) != len(namelist):
        return
    if save_dir is not None:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    # Keep only usable (label, directory) pairs; test isdir first so that a
    # bad path is reported instead of raising inside os.listdir.
    valid_runs = []
    for name, path in zip(namelist, dirlist):
        if not os.path.isdir(path) or len(_estimate_names(path)) == 0:
            print('Invalid path:%s' % path)
            continue
        valid_runs.append((name, path))
    if not valid_runs:
        return

    colors = [
        'red', 'blue', 'green', 'purple', 'pink', 'sienna', 'gray', 'yellow',
        'black', 'gold', 'darkcyan'
    ]
    datanames = sorted(_estimate_names(valid_runs[0][1]))

    for dataname in datanames:
        plt.figure()
        plt.title(dataname)

        gt_file = os.path.join(gt_dir, '%s.txt' % dataname)
        gt_list = associate.read_file_list(gt_file)
        # sorted() works for both a key list and a Python 3 key view.
        gt_stamps = sorted(gt_list.keys())
        gt_traj = np.matrix([[float(value) for value in gt_list[b][0:3]]
                             for b in gt_stamps]).transpose()
        plt.plot(gt_traj[0, :].T,
                 gt_traj[1, :].T,
                 colors[0],
                 label='ground truth')
        # Mark the first ground-truth position.
        plt.plot(gt_traj[0, 0], gt_traj[1, 0], 'yp', markersize=8)

        cidx = 1
        for name, path in valid_runs:
            traj_file = os.path.join(path, 'est_%s.txt' % dataname)
            if not os.path.exists(traj_file):
                continue
            traj_list = associate.read_file_list(traj_file)
            matches = associate.associate(gt_list, traj_list, 0, 0.02)
            if len(matches) < 2:
                continue

            gt_xyz = np.matrix([[float(value) for value in gt_list[a][0:3]]
                                for a, b in matches]).transpose()
            traj_xyz = np.matrix(
                [[float(value) for value in traj_list[b][0:3]]
                 for a, b in matches]).transpose()

            rot, trans, trans_error = align(traj_xyz, gt_xyz)
            aligned_traj = rot * traj_xyz + trans

            # Trajectories whose error reaches 100 are left out of the plot.
            if np.max(trans_error) < 100:
                plt.plot(aligned_traj[0, :].T,
                         aligned_traj[1, :].T,
                         colors[cidx % len(colors)],
                         label=name)
                cidx += 1

        plt.legend()
        if save_dir is not None:
            plt.savefig(os.path.join(save_dir, '%s.png' % dataname))
        if plot:
            plt.show()
# Driver script: runs EM_IBM1, then EM_IBM2, then align on shared containers.
# NOTE(review): the names suggest IBM Model 1/2 word-alignment training --
# confirm against the imported modules.
from __future__ import division
__author__ = 'hanz'
#!/usr/bin/env python
# coding=utf-8
# NOTE(review): the two comment lines above follow code, so they are plain
# comments here; they would only act as shebang / encoding declaration at
# the very top of the file.
import sys
# Star imports: EM_IBM1, EM_IBM2 and align are expected to come from these
# modules (not visible in this file).
from initialize import *
from EM_IBM1 import *
from ibm2_initialize import *
from EM_IBM2 import *
from align import *

# Containers handed to EM_IBM1; all start empty, so EM_IBM1 is presumably
# the one that fills them -- verify in EM_IBM1.
t={}
total={}
count={}
box_f=[]
box_e=[]
EM_IBM1(t,count,total,box_f,box_e)

# Additional containers for the second stage, which also receives the
# first-stage containers.
a={}
total_a={}
EM_IBM2(a,t,count,total,total_a,box_f,box_e)

# Final step: align using a, t and the two box lists.
align(a,t,box_f,box_e)
def sts_alignment(sentence1, sentence2,
                  parse_results=None,
                  sentence_for_demoting=None):
    """Score two sentences by the proportion of aligned content words.

    Args:
        sentence1, sentence2: the sentences; passed to ``parseText`` (when no
            cached parse is supplied) and to ``align``.
        parse_results: optional list of cached parse results, sentence 1
            first, sentence 2 second and, optionally, the demoting sentence
            third. When a demoting sentence is given and the list has exactly
            two entries, its parse is appended to this list.
        sentence_for_demoting: optional sentence whose content lemmas are
            treated like stop words in both sentences.

    Returns:
        tuple: ``(sim_score, coverage, parse_results)``. ``sim_score`` is the
        number of aligned content words of both sentences divided by the
        number of content lemmas of both sentences; ``coverage`` is the same
        ratio for sentence 1 only. Both are 0 when either sentence has no
        content lemma.
    """
    if parse_results is None:
        sentence1_parse_result = parseText(sentence1)
        sentence2_parse_result = parseText(sentence2)
        parse_results = [sentence1_parse_result, sentence2_parse_result]
    else:
        sentence1_parse_result = parse_results[0]
        sentence2_parse_result = parse_results[1]

    sentence1_lemmatized = lemmatize(sentence1_parse_result)
    sentence2_lemmatized = lemmatize(sentence2_parse_result)

    # Built once instead of once per tested lemma.
    function_words = stop_words + punctuations

    lemmas_to_be_demoted = []
    if sentence_for_demoting is not None:
        if len(parse_results) == 2:
            sentence_for_demoting_parse_result = \
                parseText(sentence_for_demoting)
            parse_results.append(sentence_for_demoting_parse_result)
        else:
            sentence_for_demoting_parse_result = parse_results[2]

        sentence_for_demoting_lemmatized = \
            lemmatize(sentence_for_demoting_parse_result)
        # Field 3 of every lemmatized item is the lemma.
        sentence_for_demoting_lemmas = \
            [item[3] for item in sentence_for_demoting_lemmatized]
        lemmas_to_be_demoted = \
            [item.lower() for item in sentence_for_demoting_lemmas
             if item.lower() not in function_words]

    alignments = align(sentence1, sentence2,
                       sentence1_parse_result, sentence2_parse_result)[0]

    sentence1_lemmas = [item[3] for item in sentence1_lemmatized]
    sentence2_lemmas = [item[3] for item in sentence2_lemmatized]

    # Everything that does not count as a content word.
    excluded = function_words + lemmas_to_be_demoted

    sentence1_content_lemmas = \
        [item for item in sentence1_lemmas if item.lower() not in excluded]
    sentence2_content_lemmas = \
        [item for item in sentence2_lemmas if item.lower() not in excluded]

    if sentence1_content_lemmas == [] or sentence2_content_lemmas == []:
        return (0, 0, parse_results)

    # Alignment pairs hold 1-based word indexes, hence the "- 1".
    sentence1_aligned_content_word_indexes = \
        [item[0] for item in alignments
         if sentence1_lemmas[item[0] - 1].lower() not in excluded]
    sentence2_aligned_content_word_indexes = \
        [item[1] for item in alignments
         if sentence2_lemmas[item[1] - 1].lower() not in excluded]

    # float() keeps the ratios fractional even where "/" on two ints
    # truncates (Python 2 without the division future import).
    sim_score = float(len(sentence1_aligned_content_word_indexes) +
                      len(sentence2_aligned_content_word_indexes)) / \
        (len(sentence1_content_lemmas) + len(sentence2_content_lemmas))

    coverage = float(len(sentence1_aligned_content_word_indexes)) / \
        len(sentence1_content_lemmas)

    return (sim_score, coverage, parse_results)
def main(args):
    """Append chimera / insertion-site homology columns to a table.

    Reads ``args.tabfile`` (tab-delimited, first line is the header) and
    prints it to standard output with five extra columns:
    'ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef',
    'InsSiteHomology' and 'PossibleRefEltChimera'.

    For every record the genomic consensus (3' when available, else 5') is
    aligned to the insertion reference of the record's family and to the
    reference genome around the insertion (1000 bases beyond both extremes).
    When both alignments exist, the overlap of their coordinate pairs gives
    the homology length and sequence, which is aligned to both references
    again to obtain the two match values. Records whose superfamily or
    subfamily is 'NA' are dropped from the output.

    Uses ``args.insref``, ``args.refgenome`` and ``args.tabfile``. Exits
    through ``sys.exit`` when a mask file or its ``.tbi`` index is missing,
    or when a record needs the insertion library and none was loaded.
    """
    l1_ref = tebreak_dir + '/../lib/mask.L1.hg19.bed.gz'
    alu_ref = tebreak_dir + '/../lib/mask.Alu.hg19.bed.gz'
    sva_ref = tebreak_dir + '/../lib/mask.SVA.hg19.bed.gz'

    inslib = None
    if args.insref:
        inslib = load_falib(args.insref)

    for fn in (l1_ref, alu_ref, sva_ref):
        if not os.path.exists(fn):
            sys.exit('reference %s not found' % fn)
        if not os.path.exists(fn + '.tbi'):
            sys.exit('index for reference %s not found' % fn)

    tbx = {}
    tbx['L1'] = pysam.Tabixfile(l1_ref)
    tbx['ALU'] = pysam.Tabixfile(alu_ref)
    tbx['SVA'] = pysam.Tabixfile(sva_ref)

    # Reference genome handle: opened on the first record and then reused,
    # instead of being reopened for every line of the table.
    ref = None

    header = []
    with open(args.tabfile, 'r') as tab:
        for i, line in enumerate(tab):
            if i == 0:  # header
                header = line.strip().split('\t')
                header += [
                    'ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef',
                    'InsSiteHomology', 'PossibleRefEltChimera'
                ]
                print('\t'.join(header))
                continue

            rec = {}
            for n, field in enumerate(line.strip().split('\t')):
                rec[header[n]] = field

            ins_site_homlen = 0     # insertion site homology length
            ins_site_homseq = 'NA'  # sequence of overlapped region
            ins_pct_match = 0.0
            ref_pct_match = 0.0

            if ref is None:
                ref = pysam.Fastafile(args.refgenome)

            left = int(rec['Left_Extreme']) - 1000
            right = int(rec['Right_Extreme']) + 1000
            if left < 0:
                left = 0
            ref_seq = ref.fetch(rec['Chromosome'], left, right)

            seqn = rec['Superfamily'] + ':' + rec['Subfamily']
            if 'NA' in (rec['Superfamily'], rec['Subfamily']):
                continue

            # Without a library the lookup below failed with a TypeError.
            if inslib is None:
                sys.exit('insertion reference library (args.insref) is required')
            ins_seq = inslib[seqn]

            # Prefer the 3' genomic consensus, fall back to the 5' one.
            if rec['Genomic_Consensus_3p'] != 'NA':
                alignside = 'Genomic_Consensus_3p'
            else:
                alignside = 'Genomic_Consensus_5p'
            ins_align = align(rec[alignside], ins_seq, rec['Subfamily'])
            gen_align = align(rec[alignside], ref_seq, 'Genomic')

            # Elements 2 and 3 of an alignment are the coordinate pair that
            # is passed to overlap().
            ins_subcoords = None
            if ins_align:
                ins_subcoords = [int(v) for v in ins_align[2:4]]

            gen_subcoords = None
            if gen_align:
                gen_subcoords = [int(v) for v in gen_align[2:4]]

            ol = None
            if gen_subcoords is not None and ins_subcoords is not None:
                ol = overlap(ins_subcoords, gen_subcoords)

            if ol is not None:
                ins_site_homlen = ol[1] - ol[0]
                ins_site_homseq = rec[alignside][ol[0]:ol[1]]

                ch_align_ins = align(ins_site_homseq, ins_seq, 'Ins')
                ch_align_ref = align(ins_site_homseq, ref_seq, 'Ref')

                # The last element of an alignment is reported as the match
                # value.
                if ch_align_ins:
                    ins_pct_match = ch_align_ins[-1]
                if ch_align_ref:
                    ref_pct_match = ch_align_ref[-1]

            # chimera with adjacent ref element check
            ch_ref_present = ref_filter(rec['Chromosome'],
                                        rec['Left_Extreme'],
                                        rec['Right_Extreme'],
                                        rec['Superfamily'],
                                        tbx,
                                        extend=10000)

            # output
            # NOTE(review): the line is split on any whitespace here but on
            # tabs when building rec -- confirm no field contains spaces.
            fields = line.strip().split()
            fields.append(str(ins_site_homlen))
            fields.append(str(ins_pct_match))
            fields.append(str(ref_pct_match))
            fields.append(ins_site_homseq)
            fields.append(str(ch_ref_present))
            print('\t'.join(fields))
#coding=utf-8 import numpy as np import cv2 import net from align import * FSIZE = 100 LINE_H = 30 filename = "/home/hal/Downloads/Final+Project/5.jpg" im = align(filename) edges = cv2.Canny(im, 50,300, apertureSize = 3) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5)) edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel) #kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5)) #edges = cv2.morphologyEx(edges, cv2.MORPH_OPEN, kernel) # 去噪声 cv2.imshow("canny", R(edges)) cv2.waitKey(0) b = edges > 0 b = b.astype(np.uint8) * 255 valve = 170 bf = ((im[:,:,0] < valve) & (im[:,:,1] < valve) & (im[:,:,2] < valve)).astype(np.uint8) * 255 kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5)) b = cv2.morphologyEx(b, cv2.MORPH_CLOSE, kernel) #寻找联通分量 b, contours, hierarchy = cv2.findContours(b, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) bc = im.copy()#np.dstack([bf,bf,bf]) delete_ratio = [] cs = []
# Script: loads "./<imid>.jpg" through align() and builds a binary edge mask
# with Canny parameters chosen per picture.
import sys
# align() is expected to come from this star import; cv2 and np are not
# imported here, so they presumably arrive the same way -- verify.
from align import *

FSIZE = 100
LINE_H = 100

# Picture number: 5 by default, or the first command-line argument.
imid = 5
if len(sys.argv) > 1:
    imid = int(sys.argv[1])
filename = "./%d.jpg" % imid


def is_pic(num):
    """Return True when the selected input image is exactly ``<num>.jpg``."""
    # Compare the complete file name: a substring test would also accept
    # "./14.jpg" as picture 4.
    return filename.rsplit("/", 1)[-1] == "%d.jpg" % num


im, source = align(filename)

# Per-picture edge detection; pictures other than 5 additionally get a
# morphological close with a 5x5 rectangle.
if is_pic(4):
    edges = cv2.Canny(im, 50, 500, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
elif is_pic(5):
    edges = cv2.Canny(im, 50, 300, apertureSize=3)
else:
    edges = cv2.Canny(im, 100, 200, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
#kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
#edges = cv2.morphologyEx(edges, cv2.MORPH_OPEN, kernel)
# denoise

# Binary mask (0 / 255) of all edge pixels.
b = edges > 0
b = b.astype(np.uint8) * 255
def sts_alignment(sentence1, sentence2,
                  parse_results=None,
                  sentence_for_demoting=None):
    """Score two sentences by the proportion of aligned content words.

    Args:
        sentence1, sentence2: the sentences; passed to ``parseText`` (when no
            cached parse is supplied) and to ``align``. When either one is
            empty (falsy) nothing is computed and ``None`` is returned.
        parse_results: optional list of cached parse results, sentence 1
            first, sentence 2 second and, optionally, the demoting sentence
            third. When a demoting sentence is given and the list has exactly
            two entries, its parse is appended to this list.
        sentence_for_demoting: optional sentence whose content lemmas are
            treated like stop words in both sentences.

    Returns:
        tuple or None: ``(sim_score, coverage, parse_results)``.
        ``sim_score`` is the number of aligned content words of both
        sentences divided by the number of content lemmas of both sentences;
        ``coverage`` is the same ratio for sentence 1 only. Both are 0 when
        either sentence has no content lemma.
    """
    # Nothing to compare when a sentence is missing.
    if not (sentence1 and sentence2):
        return None

    if parse_results is None:
        sentence1_parse_result = parseText(sentence1)
        sentence2_parse_result = parseText(sentence2)
        parse_results = [sentence1_parse_result, sentence2_parse_result]
    else:
        sentence1_parse_result = parse_results[0]
        sentence2_parse_result = parse_results[1]

    sentence1_lemmatized = lemmatize(sentence1_parse_result)
    sentence2_lemmatized = lemmatize(sentence2_parse_result)

    # Built once instead of once per tested lemma.
    function_words = stop_words + punctuations

    lemmas_to_be_demoted = []
    if sentence_for_demoting is not None:
        if len(parse_results) == 2:
            sentence_for_demoting_parse_result = \
                parseText(sentence_for_demoting)
            parse_results.append(sentence_for_demoting_parse_result)
        else:
            sentence_for_demoting_parse_result = parse_results[2]

        sentence_for_demoting_lemmatized = \
            lemmatize(sentence_for_demoting_parse_result)
        # Field 3 of every lemmatized item is the lemma.
        sentence_for_demoting_lemmas = \
            [item[3] for item in sentence_for_demoting_lemmatized]
        lemmas_to_be_demoted = \
            [item.lower() for item in sentence_for_demoting_lemmas
             if item.lower() not in function_words]

    alignments = align(sentence1, sentence2,
                       sentence1_parse_result, sentence2_parse_result)[0]

    sentence1_lemmas = [item[3] for item in sentence1_lemmatized]
    sentence2_lemmas = [item[3] for item in sentence2_lemmatized]

    # Everything that does not count as a content word.
    excluded = function_words + lemmas_to_be_demoted

    sentence1_content_lemmas = \
        [item for item in sentence1_lemmas if item.lower() not in excluded]
    sentence2_content_lemmas = \
        [item for item in sentence2_lemmas if item.lower() not in excluded]

    if sentence1_content_lemmas == [] or sentence2_content_lemmas == []:
        return (0, 0, parse_results)

    # Alignment pairs hold 1-based word indexes, hence the "- 1".
    sentence1_aligned_content_word_indexes = \
        [item[0] for item in alignments
         if sentence1_lemmas[item[0] - 1].lower() not in excluded]
    sentence2_aligned_content_word_indexes = \
        [item[1] for item in alignments
         if sentence2_lemmas[item[1] - 1].lower() not in excluded]

    # float() keeps the ratios fractional even where "/" on two ints
    # truncates (Python 2 without the division future import).
    sim_score = float(len(sentence1_aligned_content_word_indexes) +
                      len(sentence2_aligned_content_word_indexes)) / \
        (len(sentence1_content_lemmas) + len(sentence2_content_lemmas))

    coverage = float(len(sentence1_aligned_content_word_indexes)) / \
        len(sentence1_content_lemmas)

    return (sim_score, coverage, parse_results)
def ate(first_file,
        second_file,
        offset=0,
        max_diff=0.02,
        scale=1.0,
        plot=1,
        save_png=None):
    """Compute translational error statistics between two trajectory files.

    Args:
        first_file: trajectory file plotted as "ground truth".
        second_file: trajectory file plotted as "estimated".
        offset: passed to ``associate.associate`` as a float.
        max_diff: passed to ``associate.associate`` as a float.
        scale: factor applied to the first three values of every entry of
            the second trajectory.
        plot: when true, the figure is shown with ``plt.show()``.
        save_png: when not None, the figure is saved to this path.

    Returns:
        tuple: ``(number of compared pairs, rmse, mean, median, std, min,
        max)`` of the error values returned by ``align``.

    Exits through ``sys.exit`` when fewer than two timestamp pairs match.
    """
    first_list = associate.read_file_list(first_file)
    second_list = associate.read_file_list(second_file)

    matches = associate.associate(first_list, second_list, float(offset),
                                  float(max_diff))
    if len(matches) < 2:
        sys.exit(
            "Couldn't find matching timestamp pairs between groundtruth and estimated trajectory! Did you choose the correct sequence?"
        )

    # 3xN matrices of the matched positions.
    first_xyz = np.matrix([[float(value) for value in first_list[a][0:3]]
                           for a, b in matches]).transpose()
    second_xyz = np.matrix(
        [[float(value) * float(scale) for value in second_list[b][0:3]]
         for a, b in matches]).transpose()
    rot, trans, trans_error = align(second_xyz, first_xyz)

    # Full trajectories in timestamp order. sorted() replaces the
    # keys()/sort() pair, which fails on Python 3 key views.
    first_stamps = sorted(first_list.keys())
    first_xyz_full = np.matrix([[float(value)
                                 for value in first_list[b][0:3]]
                                for b in first_stamps]).transpose()

    second_stamps = sorted(second_list.keys())
    second_xyz_full = np.matrix(
        [[float(value) * float(scale) for value in second_list[b][0:3]]
         for b in second_stamps]).transpose()
    # Apply the transform found on the matched pairs to the whole estimate.
    second_xyz_full_aligned = rot * second_xyz_full + trans

    rmse = np.sqrt(np.dot(trans_error, trans_error) / len(trans_error))
    emean = np.mean(trans_error)
    emedian = np.median(trans_error)
    estd = np.std(trans_error)
    emin = np.min(trans_error)
    emax = np.max(trans_error)

    # Figure title: file name without its last four characters.
    name = os.path.basename(first_file)[:-4]

    if save_png is not None or plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plot_traj(ax, first_stamps,
                  first_xyz_full.transpose().A, '-', "red", "ground truth")
        plot_traj(ax, second_stamps,
                  second_xyz_full_aligned.transpose().A, '-', "blue",
                  "estimated")
        ax.legend()
        ax.set_xlabel('x [m]')
        ax.set_ylabel('y [m]')
        plt.title(name)
    if save_png is not None:
        plt.savefig(save_png)
    if plot:
        plt.show()

    return len(trans_error), rmse, emean, emedian, estd, emin, emax
def main(args):
    """Append chimera / insertion-site homology columns to a table.

    Reads ``args.tabfile`` (tab-delimited, first line is the header) and
    prints it to standard output with five extra columns:
    'ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef',
    'InsSiteHomology' and 'PossibleRefEltChimera'.

    For every record the genomic consensus (3' when available, else 5') is
    aligned to the insertion reference of the record's family and to the
    reference genome around the insertion (1000 bases beyond both extremes).
    When both alignments exist, the overlap of their coordinate pairs gives
    the homology length and sequence, which is aligned to both references
    again to obtain the two match values. Records whose superfamily or
    subfamily is 'NA' are dropped from the output.

    Uses ``args.insref``, ``args.refgenome`` and ``args.tabfile``. Exits
    through ``sys.exit`` when a mask file or its ``.tbi`` index is missing,
    or when a record needs the insertion library and none was loaded.
    """
    l1_ref = tebreak_dir + '/../lib/mask.L1.hg19.bed.gz'
    alu_ref = tebreak_dir + '/../lib/mask.Alu.hg19.bed.gz'
    sva_ref = tebreak_dir + '/../lib/mask.SVA.hg19.bed.gz'

    inslib = None
    if args.insref:
        inslib = load_falib(args.insref)

    for fn in (l1_ref, alu_ref, sva_ref):
        if not os.path.exists(fn):
            sys.exit('reference %s not found' % fn)
        if not os.path.exists(fn + '.tbi'):
            sys.exit('index for reference %s not found' % fn)

    tbx = {}
    tbx['L1'] = pysam.Tabixfile(l1_ref)
    tbx['ALU'] = pysam.Tabixfile(alu_ref)
    tbx['SVA'] = pysam.Tabixfile(sva_ref)

    # Reference genome handle: opened on the first record and then reused,
    # instead of being reopened for every line of the table.
    ref = None

    header = []
    with open(args.tabfile, 'r') as tab:
        for i, line in enumerate(tab):
            if i == 0:  # header
                header = line.strip().split('\t')
                header += [
                    'ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef',
                    'InsSiteHomology', 'PossibleRefEltChimera'
                ]
                print('\t'.join(header))
                continue

            rec = {}
            for n, field in enumerate(line.strip().split('\t')):
                rec[header[n]] = field

            ins_site_homlen = 0     # insertion site homology length
            ins_site_homseq = 'NA'  # sequence of overlapped region
            ins_pct_match = 0.0
            ref_pct_match = 0.0

            if ref is None:
                ref = pysam.Fastafile(args.refgenome)

            left = int(rec['Left_Extreme']) - 1000
            right = int(rec['Right_Extreme']) + 1000
            if left < 0:
                left = 0
            ref_seq = ref.fetch(rec['Chromosome'], left, right)

            seqn = rec['Superfamily'] + ':' + rec['Subfamily']
            if 'NA' in (rec['Superfamily'], rec['Subfamily']):
                continue

            # Without a library the lookup below failed with a TypeError.
            if inslib is None:
                sys.exit('insertion reference library (args.insref) is required')
            ins_seq = inslib[seqn]

            # Prefer the 3' genomic consensus, fall back to the 5' one.
            if rec['Genomic_Consensus_3p'] != 'NA':
                alignside = 'Genomic_Consensus_3p'
            else:
                alignside = 'Genomic_Consensus_5p'
            ins_align = align(rec[alignside], ins_seq, rec['Subfamily'])
            gen_align = align(rec[alignside], ref_seq, 'Genomic')

            # Elements 2 and 3 of an alignment are the coordinate pair that
            # is passed to overlap().
            ins_subcoords = None
            if ins_align:
                ins_subcoords = [int(v) for v in ins_align[2:4]]

            gen_subcoords = None
            if gen_align:
                gen_subcoords = [int(v) for v in gen_align[2:4]]

            ol = None
            if gen_subcoords is not None and ins_subcoords is not None:
                ol = overlap(ins_subcoords, gen_subcoords)

            if ol is not None:
                ins_site_homlen = ol[1] - ol[0]
                ins_site_homseq = rec[alignside][ol[0]:ol[1]]

                ch_align_ins = align(ins_site_homseq, ins_seq, 'Ins')
                ch_align_ref = align(ins_site_homseq, ref_seq, 'Ref')

                # The last element of an alignment is reported as the match
                # value.
                if ch_align_ins:
                    ins_pct_match = ch_align_ins[-1]
                if ch_align_ref:
                    ref_pct_match = ch_align_ref[-1]

            # chimera with adjacent ref element check
            ch_ref_present = ref_filter(rec['Chromosome'],
                                        rec['Left_Extreme'],
                                        rec['Right_Extreme'],
                                        rec['Superfamily'],
                                        tbx,
                                        extend=10000)

            # output
            # NOTE(review): the line is split on any whitespace here but on
            # tabs when building rec -- confirm no field contains spaces.
            fields = line.strip().split()
            fields.append(str(ins_site_homlen))
            fields.append(str(ins_pct_match))
            fields.append(str(ref_pct_match))
            fields.append(ins_site_homseq)
            fields.append(str(ch_ref_present))
            print('\t'.join(fields))
def main(args):
    """Filter a table of insertion calls (mm10) and print the survivors.

    Reads ``args.tabfile`` (tab-delimited, first line is the header) and
    prints the header plus every record that passes all filters. Depending
    on the options, extra columns are appended: 'ExonerateRealign' (with
    ``args.realign`` and ``args.insref``) and the five chimera columns
    (with ``args.chimera``).

    A record is rejected when any of the following holds (each rejection is
    reported through ``logger.debug``): both consensus lengths below 120,
    'NA' TE alignment coordinates, a reference element of the same
    superfamily nearby (unless ``args.ignore_ref_filter``), best element
    match below 0.90, best genome match below 0.98, mappability below 0.1,
    fewer than 4 remapped discordant reads, discordant fraction below 0.25,
    no insertion consensus, the optional length filter, or no realignment
    with at least 95 percent identity over at least 100 target bases.

    Exits through ``sys.exit`` when a reference track or its ``.tbi`` index
    is missing, or when ``args.chimera`` is used without ``args.refgenome``
    or ``args.insref``.
    """
    l1_ref = tebreak_dir + '/../lib/mask.L1.mm10.bed.gz'
    ltr_ref = tebreak_dir + '/../lib/mask.LTR.mm10.bed.gz'
    sine_ref = tebreak_dir + '/../lib/mask.SINE.mm10.bed.gz'
    map_ref = tebreak_dir + '/../lib/mm10.map50bp.bed.gz'

    inslib = None
    if args.insref:
        inslib = load_falib(args.insref)

    # The mappability track is opened with Tabixfile below as well, so it is
    # checked together with the three mask files.
    for fn in (l1_ref, ltr_ref, sine_ref, map_ref):
        if not os.path.exists(fn):
            sys.exit('reference %s not found' % fn)
        if not os.path.exists(fn + '.tbi'):
            sys.exit('index for reference %s not found' % fn)

    tbx = {}
    tbx['L1'] = pysam.Tabixfile(l1_ref)
    tbx['LTR'] = pysam.Tabixfile(ltr_ref)
    tbx['SINE'] = pysam.Tabixfile(sine_ref)

    map_tbx = pysam.Tabixfile(map_ref)

    # Reference genome handle: opened for the first record that needs it
    # and then reused, instead of being reopened for every record.
    ref = None

    header = []
    with open(args.tabfile, 'r') as tab:
        for i, line in enumerate(tab):
            if i == 0:  # header
                header = line.strip().split('\t')
                if args.realign and args.insref:
                    header += ['ExonerateRealign']
                if args.chimera:
                    header += [
                        'ChimeraBaseCount', 'ChimeraMatchIns',
                        'ChimeraMatchRef', 'InsSiteHomology',
                        'PossibleRefEltChimera'
                    ]
                print('\t'.join(header))
                continue

            rec = {}
            out = True  # cleared by any filter that rejects the record
            for n, field in enumerate(line.strip().split('\t')):
                rec[header[n]] = field

            #logger.debug(rec['UUID'])

            if int(rec['3p_Cons_Len']) < 120 and int(rec['5p_Cons_Len']) < 120:
                logger.debug('Filtered %s: consensus length < %d' % (rec['UUID'], 120))
                out = False

            if 'NA' in (rec['TE_Align_Start'], rec['TE_Align_End']):
                logger.debug('Filtered %s: TE_Align_Start or TE_Align_End is "NA"' % rec['UUID'])
                out = False

            ref_present = False
            if args.wideref:
                ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx, extend=10000)
            else:
                ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx)

            if ref_present and not args.ignore_ref_filter:
                logger.debug('Filtered %s: proximity to reference TE of same superfamily' % rec['UUID'])
                out = False

            if max(float(rec['5p_Elt_Match']), float(rec['3p_Elt_Match'])) < 0.90:
                logger.debug('Filtered %s: max(5p_Elt_Match, 3p_Elt_Match) < 0.90' % rec['UUID'])
                out = False

            if max(float(rec['5p_Genome_Match']), float(rec['3p_Genome_Match'])) < 0.98:
                logger.debug('Filtered %s: max(5p_Genome_Match, 3p_Genome_Match) < 0.98' % rec['UUID'])
                out = False

            mapscore = avgmap(map_tbx, rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'])# * (max(int(rec['3p_Cons_Len']), int(rec['5p_Cons_Len']))/100.)
            if mapscore < 0.1:
                logger.debug('Filtered %s: mappability of %f < 0.1' % (rec['UUID'], mapscore))
                out = False

            if float(rec['Remapped_Discordant']) < 4:
                logger.debug('Filtered %s: low discordant evidence (< 4 reads)' % rec['UUID'])
                out = False

            if float(rec['Remap_Disc_Fraction']) < 0.25:
                logger.debug('Filtered %s: low discordant evidence (%s < 25pct supporting)' % (rec['UUID'], rec['Remap_Disc_Fraction']))
                out = False

            if rec['Insert_Consensus_5p'] == rec['Insert_Consensus_3p'] == 'NA':
                logger.debug('Filtered %s: no insertion consensus mapped to insertion reference' % rec['UUID'])
                out = False

            if args.lenfilter and out and len_filter(rec):
                logger.debug('Filtered %s: TE length filter' % rec['UUID'])
                out = False

            align_info = 'NA'

            if out and args.realign and args.insref:
                # Tolerate a falsy non-list result (e.g. False): taking
                # len() of it or iterating over it would raise.
                align_info = realign_filter(rec, inslib) or []

                if not align_info:
                    out = False

                # At least one realignment needs >= 95 percent identity
                # over >= 100 target bases.
                well_aligned = False
                for alignment in align_info:
                    seqtype, _, score, qstart, qend, tstart, tend, pi = alignment
                    tstart = int(tstart)
                    tend = int(tend)
                    pi = float(pi)

                    if pi >= 95.0 and abs(tend - tstart) >= 100:
                        well_aligned = True

                if not well_aligned:
                    out = False

            ins_site_homlen = 0     # insertion site homology length
            ins_site_homseq = 'NA'  # sequence of overlapped region
            ch_ref_present = False
            ins_pct_match = 0.0
            ref_pct_match = 0.0

            if out and args.chimera:
                if not args.refgenome:
                    sys.exit('--refgenome required in conjunction with --chimera')
                if not args.insref:
                    sys.exit('--insref required in conjunction with --chimera')

                if ref is None:
                    ref = pysam.Fastafile(args.refgenome)

                left = int(rec['Left_Extreme']) - 1000
                right = int(rec['Right_Extreme']) + 1000
                if left < 0:
                    left = 0
                ref_seq = ref.fetch(rec['Chromosome'], left, right)

                seqn = rec['Superfamily'] + ':' + rec['Subfamily']
                ins_seq = inslib[seqn]

                # Prefer the 3' genomic consensus, fall back to the 5' one.
                if rec['Genomic_Consensus_3p'] != 'NA':
                    alignside = 'Genomic_Consensus_3p'
                else:
                    alignside = 'Genomic_Consensus_5p'
                ins_align = align(rec[alignside], ins_seq, rec['Subfamily'])
                gen_align = align(rec[alignside], ref_seq, 'Genomic')

                # Elements 2 and 3 of an alignment are the coordinate pair
                # that is passed to overlap().
                ins_subcoords = None
                if ins_align:
                    ins_subcoords = [int(v) for v in ins_align[2:4]]

                gen_subcoords = None
                if gen_align:
                    gen_subcoords = [int(v) for v in gen_align[2:4]]
                else:
                    # No genomic alignment: the record is rejected.
                    out = False

                ol = None
                if gen_subcoords is not None and ins_subcoords is not None:
                    ol = overlap(ins_subcoords, gen_subcoords)

                if ol is not None:
                    ins_site_homlen = ol[1] - ol[0]
                    ins_site_homseq = rec[alignside][ol[0]:ol[1]]

                    ch_align_ins = align(ins_site_homseq, ins_seq, 'Ins')
                    ch_align_ref = align(ins_site_homseq, ref_seq, 'Ref')

                    # The last element of an alignment is reported as the
                    # match value.
                    if ch_align_ins:
                        ins_pct_match = ch_align_ins[-1]
                    if ch_align_ref:
                        ref_pct_match = ch_align_ref[-1]

                # chimera with adjacent ref element check
                ch_ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx, extend=10000)

            if out:
                # NOTE(review): the line is split on any whitespace here but
                # on tabs when building rec -- confirm no field has spaces.
                fields = line.strip().split()

                if args.insref and args.realign:
                    fields.append(','.join([';'.join(alignment) for alignment in align_info]))

                if args.chimera:
                    fields.append(str(ins_site_homlen))
                    fields.append(str(ins_pct_match))
                    fields.append(str(ref_pct_match))
                    fields.append(ins_site_homseq)
                    fields.append(str(ch_ref_present))

                print('\t'.join(fields))