def mainERRBSalign(argv):
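	# Parse command-line options for ERRBSalign, then run adapter filtering,
	# alignment against the reference genome, and sorting of the resulting BAM file.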
	__version__ = "0.0.1"  # version of ERRBSalign

	# Initialize all parameters to default values
	inputfileR1 = str() # R1 PE fastq file
	inputfileR2 = str() # R2 PE fastq file
	genomeref = str() # Reference genome directory
	outputdir = str() # Output directory
	paired= ''
	single=''

	# If no arguments are given, print the usage message and exit the program
	if len(argv) == 1:
		usageERRBSalign()
		sys.exit(2)

	# List of all options possible
	try:
		opts, args = getopt.getopt(argv[1:],"h1:2:g:o:",["paired","single=","genome_ref=","outputdir=","version"])
	except getopt.GetoptError:
		usageERRBSalign()
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h': # print usage message
			usageERRBSalign()
			sys.exit()
		elif opt == '--paired': # paired-end read
			paired = 'True'
		elif opt == '-1': # R1 PE fastq file
			checkFile(arg)
			inputfileR1 = arg
		elif opt == '-2': # R2 PE fastq file
			checkFile(arg)
			inputfileR2 = arg
		elif opt == '--single': # single-end read
			single = 'True'
			checkFile(arg)
			inputfileR1 = arg
		elif opt in ("-g","--genome_ref"): # Reference genome directory
			genomeref = arg
		elif opt in ("-o", "--outputdir"): # Output directory
			outputdir = arg
		elif opt == '--version': # print software version
			print "ERRBSalign."+__version__
			sys.exit(0)

	# Check that all necessary arguments are given
	checkargsAlign(paired,inputfileR1,inputfileR2,genomeref,outputdir)
	# Adapter filtering
	paired, single, R1, R2,outputdir = filtering(paired, single, inputfileR1, inputfileR2, outputdir)
	# Check that the given genome path exists and whether the genome has already been indexed
	checkGenome(genomeref)
	# alignment against the reference genome
	align(paired, single, R1, R2 , outputdir, genomeref)
	# Sort the output BAM file and create a SAM file
	sortAlignFile(inputfileR1, outputdir, paired, single)
	sys.exit(0) # Exiting
def mainfunc(f, special=0):
    global node, face
    node = 0
    face = f
    if special == 0:
        while conditioncheck() == 0:
            pass
    elif special == 2:
        while conditioncheck() == 0:
            pass
        align(face)
        #throw()
        time.sleep(2)
    else:
        while specialconditioncheck() == 0:
            pass
Example #3
def main(argv):
    referencefile = ''
    queryfile = ''
    outputfile = ''
    AG_score = -0.5
    CT_score = -0.75
    try:
        opts, args = getopt.getopt(argv, "hr:q:o:a:c:", [
            "ref_file=", "query_file=", "output_file=", "AG_score=",
            "CT_score="
        ])
    except getopt.GetoptError:
        print(
            'Error! Correct usage:\npython src.py -r <reference_fasta_file> -q <query_fasta_file> [-o <output_csv_file> -a <score for AG mismatch> -c <score for CT mismatch>]'
        )
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(
                'python src.py -r <reference_fasta_file> -q <query_fasta_file> [-o <output_csv_file> -a <score for AG mismatch> -c <score for CT mismatch>]'
            )
            sys.exit()
        elif opt in ("-r", "--ref_file"):
            referencefile = arg
        elif opt in ("-q", "--query_file"):
            queryfile = arg
        elif opt in ("-o", "--output_file"):
            outputfile = arg
        elif opt in ("-a", "--AG_score"):
            AG_score = float(arg)
        elif opt in ("-c", "--CT_score"):
            CT_score = float(arg)
    print('Reference file is ', referencefile)
    print('Query file is ', queryfile)

    print('AG score: ', AG_score)
    print('CT score: ', CT_score)

    all_ref_seqs = parse_fasta(referencefile)

    all_seqs = parse_query_file(queryfile)

    res_df = align(all_ref_seqs, all_seqs, AG=AG_score, CT=CT_score)

    if len(outputfile) == 0:
        output_file = queryfile.split(".")[0] + "_" + referencefile.split(
            "/")[1] + "_results.csv"
    else:
        output_file = outputfile
    print('Output file is ', output_file)
    res_df.to_csv(output_file, sep='\t', index=False)
Example #4
def realign_filter(rec, inslib):
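    # Re-align each available consensus sequence for this record against the
    # insertion-library entry for its Superfamily:Subfamily and collect the hits.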
    seqn = rec['Superfamily'] + ':' + rec['Subfamily']
    if seqn not in inslib:
        return False

    seq_headers = ['Genomic_Consensus_5p', 'Genomic_Consensus_3p', 'Insert_Consensus_5p', 'Insert_Consensus_3p']

    matches = []

    for seqtype in seq_headers:
        if rec[seqtype] == 'NA':
            continue

        #print seqtype, rec[seqtype]

        alignment = align(rec[seqtype], inslib[seqn], rec['Subfamily'])

        if alignment:
            matches.append([seqtype] + alignment)

    return matches
Example #5
def compare_traj(namelist, dirlist, gt_dir, save_dir=None, plot=False):
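    # For each dataset, overlay the ground-truth XY trajectory with the estimated
    # trajectories from every result directory, rigidly aligned to ground truth
    # via align(), and optionally save/show the plots.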
    if len(dirlist) < 2 or len(dirlist) != len(namelist):
        return

    if save_dir is not None:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    valid_path = []
    for path in dirlist:
        files = [
            x for x in os.listdir(path) if x[:4] == 'est_' and x[-4:] == '.txt'
        ]
        if not os.path.isdir(path) or len(files) == 0:
            print('Invalid path:%s' % path)
            continue
        valid_path.append(path)

    colors = [
        'red', 'blue', 'green', 'purple', 'pink', 'sienna', 'gray', 'yellow',
        'black', 'gold', 'darkcyan'
    ]
    datanames = sorted([
        x[4:-4] for x in os.listdir(dirlist[0])
        if x[:4] == 'est_' and x[-4:] == '.txt'
    ])
    for dataname in datanames:
        plt.figure()
        plt.title(dataname)
        gt_file = os.path.join(gt_dir, '%s.txt' % dataname)
        gt_list = associate.read_file_list(gt_file)
        gt_stamps = sorted(gt_list.keys())
        gt_traj = np.matrix([[float(value) for value in gt_list[b][0:3]]
                             for b in gt_stamps]).transpose()
        plt.plot(gt_traj[0, :].T,
                 gt_traj[1, :].T,
                 colors[0],
                 label='ground truth')
        plt.plot(gt_traj[0, 0], gt_traj[1, 0], 'yp', markersize=8)

        cidx = 1
        for name, path in zip(namelist, dirlist):
            traj_file = os.path.join(path, 'est_%s.txt' % dataname)
            if not os.path.exists(traj_file):
                continue
            traj_list = associate.read_file_list(traj_file)
            matches = associate.associate(gt_list, traj_list, 0, 0.02)
            if len(matches) < 2:
                continue
            gt_xyz = np.matrix([[float(value) for value in gt_list[a][0:3]]
                                for a, b in matches]).transpose()
            traj_xyz = np.matrix(
                [[float(value) for value in traj_list[b][0:3]]
                 for a, b in matches]).transpose()
            rot, trans, trans_error = align(traj_xyz, gt_xyz)
            aligned_traj = rot * traj_xyz + trans
            if np.max(trans_error) < 100:
                plt.plot(aligned_traj[0, :].T,
                         aligned_traj[1, :].T,
                         colors[cidx % len(colors)],
                         label=name)
            cidx += 1
        plt.legend()
        if save_dir is not None:
            plt.savefig(os.path.join(save_dir, '%s.png' % dataname))
    if plot:
        plt.show()
Example #6
#!/usr/bin/env python
# coding=utf-8
from __future__ import division
__author__ = 'hanz'
import sys
from initialize import *
from EM_IBM1 import *
from ibm2_initialize import *
from EM_IBM2 import *
from align import *
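
# Word-alignment pipeline: EM_IBM1 estimates IBM Model 1 translation probabilities (t),
# EM_IBM2 then adds IBM Model 2 alignment probabilities (a) on top of them, and
# align() produces the final alignments for the parallel sentences in box_f / box_e.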

t={}
total={}
count={}
box_f=[]
box_e=[]
EM_IBM1(t,count,total,box_f,box_e)
a={}
total_a={}
EM_IBM2(a,t,count,total,total_a,box_f,box_e)
align(a,t,box_f,box_e)
Example #7
def sts_alignment(sentence1, sentence2,
                  parse_results=None,
                  sentence_for_demoting=None):
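    # Compute an alignment-based similarity between two sentences: align them,
    # keep only content-word alignments (ignoring stop words, punctuation, and the
    # lemmas of sentence_for_demoting, if given), and return
    # (sim_score, coverage, parse_results).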
                      
    if parse_results is None:
        sentence1_parse_result = parseText(sentence1)
        sentence2_parse_result = parseText(sentence2)
        parse_results = []
        parse_results.append(sentence1_parse_result)
        parse_results.append(sentence2_parse_result)
    else:
        sentence1_parse_result = parse_results[0]
        sentence2_parse_result = parse_results[1]
        

    sentence1_lemmatized = lemmatize(sentence1_parse_result)
    sentence2_lemmatized = lemmatize(sentence2_parse_result)

    lemmas_to_be_demoted = []
    if sentence_for_demoting is not None:
        if len(parse_results) == 2:
            sentence_for_demoting_parse_result = \
                                parseText(sentence_for_demoting)
            parse_results.append(sentence_for_demoting_parse_result)
        else:
            sentence_for_demoting_parse_result = parse_results[2]


        sentence_for_demoting_lemmatized = \
                            lemmatize(sentence_for_demoting_parse_result)
    
        sentence_for_demoting_lemmas = \
                        [item[3] for item in sentence_for_demoting_lemmatized]
    
        lemmas_to_be_demoted = \
            [item.lower() for item in sentence_for_demoting_lemmas \
             if item.lower() not in stop_words+punctuations]
    
    alignments = align(sentence1, sentence2, 
                       sentence1_parse_result, sentence2_parse_result)[0]
    
    sentence1_lemmas = [item[3] for item in sentence1_lemmatized]
    sentence2_lemmas = [item[3] for item in sentence2_lemmatized]

    sentence1_content_lemmas = \
        [item for item in sentence1_lemmas \
         if item.lower() not in stop_words+punctuations+lemmas_to_be_demoted]

    sentence2_content_lemmas = \
        [item for item in sentence2_lemmas \
         if item.lower() not in stop_words+punctuations+lemmas_to_be_demoted]

    if sentence1_content_lemmas == [] or sentence2_content_lemmas == []:
        return (0, 0, parse_results)
    
    sentence1_aligned_content_word_indexes = \
        [item[0] for item in alignments if \
         sentence1_lemmas[item[0]-1].lower() not in \
         stop_words+punctuations+lemmas_to_be_demoted]

    sentence2_aligned_content_word_indexes = \
        [item[1] for item in alignments if \
         sentence2_lemmas[item[1]-1].lower() not in \
         stop_words+punctuations+lemmas_to_be_demoted]

    sim_score = (len(sentence1_aligned_content_word_indexes) + \
                 len(sentence2_aligned_content_word_indexes)) / \
                (len(sentence1_content_lemmas) + \
                 len(sentence2_content_lemmas))

    coverage = len(sentence1_aligned_content_word_indexes) / \
               len(sentence1_content_lemmas)

    return (sim_score, coverage, parse_results)
Example #8
def main(args):
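    # Annotate each row of a tebreak output table with insertion-site homology and
    # chimera metrics: re-align the genomic consensus against both the insertion
    # library sequence and the local reference, measure their overlap, and check
    # for nearby reference elements of the same superfamily.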

    l1_ref = tebreak_dir + '/../lib/mask.L1.hg19.bed.gz'
    alu_ref = tebreak_dir + '/../lib/mask.Alu.hg19.bed.gz'
    sva_ref = tebreak_dir + '/../lib/mask.SVA.hg19.bed.gz'

    inslib = None

    if args.insref:
        inslib = load_falib(args.insref)

    for fn in (l1_ref, alu_ref, sva_ref):
        if not os.path.exists(fn): sys.exit('reference %s not found' % fn)
        if not os.path.exists(fn + '.tbi'):
            sys.exit('index for reference %s not found' % fn)

    tbx = {}
    tbx['L1'] = pysam.Tabixfile(l1_ref)
    tbx['ALU'] = pysam.Tabixfile(alu_ref)
    tbx['SVA'] = pysam.Tabixfile(sva_ref)

    header = []

    with open(args.tabfile, 'r') as tab:
        for i, line in enumerate(tab):

            if i == 0:  # header
                header = line.strip().split('\t')
                header += [
                    'ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef',
                    'InsSiteHomology', 'PossibleRefEltChimera'
                ]

                print '\t'.join(header)

            else:
                rec = {}
                for n, field in enumerate(line.strip().split('\t')):
                    rec[header[n]] = field

                ins_site_homlen = 0  # insertion site homology length
                ins_site_homseq = 'NA'  # sequence of overlapped region
                ch_ref_present = False
                ins_pct_match = 0.0
                ref_pct_match = 0.0

                ref = pysam.Fastafile(args.refgenome)

                left = int(rec['Left_Extreme']) - 1000
                right = int(rec['Right_Extreme']) + 1000

                if left < 0: left = 0

                ref_seq = ref.fetch(rec['Chromosome'], left, right)

                seqn = rec['Superfamily'] + ':' + rec['Subfamily']

                if 'NA' in (rec['Superfamily'], rec['Subfamily']):
                    continue

                ins_seq = inslib[seqn]

                alignside = ''

                ins_align = []
                gen_align = []

                if rec['Genomic_Consensus_3p'] != 'NA':
                    ins_align = align(rec['Genomic_Consensus_3p'], ins_seq,
                                      rec['Subfamily'])
                    gen_align = align(rec['Genomic_Consensus_3p'], ref_seq,
                                      'Genomic')
                    alignside = 'Genomic_Consensus_3p'

                else:
                    ins_align = align(rec['Genomic_Consensus_5p'], ins_seq,
                                      rec['Subfamily'])
                    gen_align = align(rec['Genomic_Consensus_5p'], ref_seq,
                                      'Genomic')
                    alignside = 'Genomic_Consensus_5p'

                ins_subcoords = None

                if ins_align:
                    ins_subcoords = map(int, ins_align[2:4])

                gen_subcoords = None

                if gen_align:
                    gen_subcoords = map(int, gen_align[2:4])
                else:
                    out = False

                ol = None

                if gen_subcoords is not None and ins_subcoords is not None:
                    ol = overlap(ins_subcoords, gen_subcoords)

                if ol is not None:
                    ins_site_homlen = ol[1] - ol[0]
                    ins_site_homseq = rec[alignside][ol[0]:ol[1]]

                    ch_align_ins = align(ins_site_homseq, ins_seq, 'Ins')
                    ch_align_ref = align(ins_site_homseq, ref_seq, 'Ref')

                    if ch_align_ins:
                        ins_pct_match = ch_align_ins[-1]
                    if ch_align_ref:
                        ref_pct_match = ch_align_ref[-1]

                # chimera with adjacent ref element check
                ch_ref_present = ref_filter(rec['Chromosome'],
                                            rec['Left_Extreme'],
                                            rec['Right_Extreme'],
                                            rec['Superfamily'],
                                            tbx,
                                            extend=10000)

                # output
                fields = line.strip().split()
                fields.append(str(ins_site_homlen))
                fields.append(str(ins_pct_match))
                fields.append(str(ref_pct_match))
                fields.append(ins_site_homseq)
                fields.append(str(ch_ref_present))

                print '\t'.join(fields)
Example #9
#coding=utf-8
import numpy as np
import cv2
import net
from align import *
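
# Align (deskew) the input photo with align(), extract edges with Canny plus a
# morphological close, threshold dark pixels, and then look for external contours.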

FSIZE = 100
LINE_H = 30
filename = "/home/hal/Downloads/Final+Project/5.jpg"
im = align(filename) 
edges = cv2.Canny(im, 50,300, apertureSize = 3)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
#kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
#edges = cv2.morphologyEx(edges, cv2.MORPH_OPEN, kernel) # remove noise
cv2.imshow("canny", R(edges))
cv2.waitKey(0)

b = edges > 0
b = b.astype(np.uint8) * 255
valve = 170
bf = ((im[:,:,0] < valve) & (im[:,:,1] < valve) & (im[:,:,2] < valve)).astype(np.uint8) * 255

kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
b = cv2.morphologyEx(b, cv2.MORPH_CLOSE, kernel)
# find connected components
b, contours, hierarchy = cv2.findContours(b, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

bc = im.copy()#np.dstack([bf,bf,bf])
delete_ratio = []
cs = []
Example #10
File: test.py  Project: mgno32/a4
import sys
import cv2  # cv2 and numpy are used directly below
import numpy as np
from align import *

FSIZE = 100
LINE_H = 100
imid = 5
if len(sys.argv) > 1:
    imid = int(sys.argv[1])
filename = "./%d.jpg" % imid


def is_pic(num):
    return ("%d.jpg" % num) in filename


im, source = align(filename)
if is_pic(4):
    edges = cv2.Canny(im, 50, 500, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
elif is_pic(5):
    edges = cv2.Canny(im, 50, 300, apertureSize=3)
else:
    edges = cv2.Canny(im, 100, 200, apertureSize=3)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
#kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
#edges = cv2.morphologyEx(edges, cv2.MORPH_OPEN, kernel) # remove noise

b = edges > 0
b = b.astype(np.uint8) * 255
Example #11
def sts_alignment(sentence1, sentence2,
                  parse_results=None,
                  sentence_for_demoting=None):

    if sentence1 and sentence2:

        if parse_results is None:
            sentence1_parse_result = parseText(sentence1)
            sentence2_parse_result = parseText(sentence2)
            parse_results = []
            parse_results.append(sentence1_parse_result)
            parse_results.append(sentence2_parse_result)
        else:
            sentence1_parse_result = parse_results[0]
            sentence2_parse_result = parse_results[1]


        sentence1_lemmatized = lemmatize(sentence1_parse_result)
        sentence2_lemmatized = lemmatize(sentence2_parse_result)

        lemmas_to_be_demoted = []
        if sentence_for_demoting is not None:
            if len(parse_results) == 2:
                sentence_for_demoting_parse_result = \
                                    parseText(sentence_for_demoting)
                parse_results.append(sentence_for_demoting_parse_result)
            else:
                sentence_for_demoting_parse_result = parse_results[2]


            sentence_for_demoting_lemmatized = \
                                lemmatize(sentence_for_demoting_parse_result)

            sentence_for_demoting_lemmas = \
                            [item[3] for item in sentence_for_demoting_lemmatized]

            lemmas_to_be_demoted = \
                    [item.lower() for item in sentence_for_demoting_lemmas \
                                if item.lower() not in stop_words+punctuations]

        alignments = align(sentence1, sentence2,
                           sentence1_parse_result, sentence2_parse_result)[0]

        sentence1_lemmas = [item[3] for item in sentence1_lemmatized]
        sentence2_lemmas = [item[3] for item in sentence2_lemmatized]

        sentence1_content_lemmas = \
                [item for item in sentence1_lemmas \
                          if item.lower() not in \
                                stop_words+punctuations+lemmas_to_be_demoted]

        sentence2_content_lemmas = \
                [item for item in sentence2_lemmas \
                        if item.lower() not in \
                                 stop_words+punctuations+lemmas_to_be_demoted]

        if sentence1_content_lemmas == [] or sentence2_content_lemmas == []:
            return (0, 0, parse_results)

        sentence1_aligned_content_word_indexes = \
            [item[0] for item in alignments if \
                    sentence1_lemmas[item[0]-1].lower() not in \
                                    stop_words+punctuations+lemmas_to_be_demoted]

        sentence2_aligned_content_word_indexes = \
            [item[1] for item in alignments if \
                    sentence2_lemmas[item[1]-1].lower() not in \
                                    stop_words+punctuations+lemmas_to_be_demoted]

        sim_score = (len(sentence1_aligned_content_word_indexes) + \
                     len(sentence2_aligned_content_word_indexes)) / \
                                            (len(sentence1_content_lemmas) + \
                                              len(sentence2_content_lemmas))

        coverage = len(sentence1_aligned_content_word_indexes) / \
                                               len(sentence1_content_lemmas)

        return (sim_score, coverage, parse_results)
Example #12
def ate(first_file,
        second_file,
        offset=0,
        max_diff=0.02,
        scale=1.0,
        plot=1,
        save_png=None):
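    # Absolute trajectory error (ATE): associate timestamps between the two files,
    # rigidly align the estimated trajectory onto ground truth with align(), and
    # return the number of matches plus RMSE/mean/median/std/min/max of the error.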
    first_list = associate.read_file_list(first_file)
    second_list = associate.read_file_list(second_file)

    matches = associate.associate(first_list, second_list, float(offset),
                                  float(max_diff))
    if len(matches) < 2:
        sys.exit(
            "Couldn't find matching timestamp pairs between groundtruth and estimated trajectory! Did you choose the correct sequence?"
        )

    first_xyz = np.matrix([[float(value) for value in first_list[a][0:3]]
                           for a, b in matches]).transpose()
    second_xyz = np.matrix(
        [[float(value) * float(scale) for value in second_list[b][0:3]]
         for a, b in matches]).transpose()
    rot, trans, trans_error = align(second_xyz, first_xyz)

    first_stamps = sorted(first_list.keys())
    first_xyz_full = np.matrix([[float(value) for value in first_list[b][0:3]]
                                for b in first_stamps]).transpose()

    second_stamps = sorted(second_list.keys())
    second_xyz_full = np.matrix(
        [[float(value) * float(scale) for value in second_list[b][0:3]]
         for b in second_stamps]).transpose()
    second_xyz_full_aligned = rot * second_xyz_full + trans

    rmse = np.sqrt(np.dot(trans_error, trans_error) / len(trans_error))
    emean = np.mean(trans_error)
    emedian = np.median(trans_error)
    estd = np.std(trans_error)
    emin = np.min(trans_error)
    emax = np.max(trans_error)

    name = os.path.basename(first_file)[:-4]

    if save_png is not None or plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plot_traj(ax, first_stamps,
                  first_xyz_full.transpose().A, '-', "red", "ground truth")
        plot_traj(ax, second_stamps,
                  second_xyz_full_aligned.transpose().A, '-', "blue",
                  "estimated")
        ax.legend()
        ax.set_xlabel('x [m]')
        ax.set_ylabel('y [m]')
        plt.title(name)
    if save_png is not None:
        plt.savefig(save_png)
    if plot:
        plt.show()
    return len(trans_error), rmse, emean, emedian, estd, emin, emax
Example #13
def main(args):

    l1_ref  = tebreak_dir + '/../lib/mask.L1.hg19.bed.gz'
    alu_ref = tebreak_dir + '/../lib/mask.Alu.hg19.bed.gz'
    sva_ref = tebreak_dir + '/../lib/mask.SVA.hg19.bed.gz'

    inslib = None

    if args.insref:
        inslib = load_falib(args.insref)


    for fn in (l1_ref, alu_ref, sva_ref):
        if not os.path.exists(fn): sys.exit('reference %s not found' % fn)
        if not os.path.exists(fn + '.tbi'): sys.exit('index for reference %s not found' %fn)

    tbx = {}
    tbx['L1']  = pysam.Tabixfile(l1_ref)
    tbx['ALU'] = pysam.Tabixfile(alu_ref)
    tbx['SVA'] = pysam.Tabixfile(sva_ref)


    header = []

    with open(args.tabfile, 'r') as tab:
        for i, line in enumerate(tab):

            if i == 0: # header
                header = line.strip().split('\t')
                header += ['ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef', 'InsSiteHomology', 'PossibleRefEltChimera']

                print '\t'.join(header)

            else:
                rec = {}
                for n, field in enumerate(line.strip().split('\t')):
                    rec[header[n]] = field

                ins_site_homlen = 0 # insertion site homology length
                ins_site_homseq = 'NA' # sequence of overlapped region
                ch_ref_present = False
                ins_pct_match = 0.0
                ref_pct_match = 0.0


                ref = pysam.Fastafile(args.refgenome)

                left  = int(rec['Left_Extreme']) - 1000
                right = int(rec['Right_Extreme']) + 1000

                if left < 0: left = 0

                ref_seq = ref.fetch(rec['Chromosome'], left, right)

                seqn = rec['Superfamily'] + ':' + rec['Subfamily']

                if 'NA' in (rec['Superfamily'], rec['Subfamily']):
                    continue

                ins_seq = inslib[seqn]

                alignside = ''

                ins_align = []
                gen_align = []

                if rec['Genomic_Consensus_3p'] != 'NA':
                    ins_align = align(rec['Genomic_Consensus_3p'], ins_seq, rec['Subfamily'])
                    gen_align = align(rec['Genomic_Consensus_3p'], ref_seq, 'Genomic')
                    alignside = 'Genomic_Consensus_3p'

                else:
                    ins_align = align(rec['Genomic_Consensus_5p'], ins_seq, rec['Subfamily'])
                    gen_align = align(rec['Genomic_Consensus_5p'], ref_seq, 'Genomic')
                    alignside = 'Genomic_Consensus_5p'

                ins_subcoords = None

                if ins_align:
                    ins_subcoords = map(int, ins_align[2:4])

                gen_subcoords = None

                if gen_align:
                    gen_subcoords = map(int, gen_align[2:4])
                else:
                    out = False

                ol = None

                if gen_subcoords is not None and ins_subcoords is not None:
                    ol = overlap(ins_subcoords, gen_subcoords)

                if ol is not None:
                    ins_site_homlen = ol[1]-ol[0]
                    ins_site_homseq = rec[alignside][ol[0]:ol[1]]

                    ch_align_ins = align(ins_site_homseq, ins_seq, 'Ins')
                    ch_align_ref = align(ins_site_homseq, ref_seq, 'Ref')

                    if ch_align_ins:
                        ins_pct_match = ch_align_ins[-1]
                    if ch_align_ref:
                        ref_pct_match = ch_align_ref[-1]

                # chimera with adjacent ref element check
                ch_ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx, extend=10000)

                # output
                fields = line.strip().split()
                fields.append(str(ins_site_homlen))
                fields.append(str(ins_pct_match))
                fields.append(str(ref_pct_match))
                fields.append(ins_site_homseq)
                fields.append(str(ch_ref_present))
                
                print '\t'.join(fields)
Example #14
def main(args):
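    # Filter tebreak calls against the mm10 references: drop records with short
    # consensus sequences, missing TE alignment coordinates, proximity to a reference
    # element of the same superfamily, weak element/genome matches, low mappability,
    # or thin discordant-read support; optionally re-align against the insertion
    # library and add chimera metrics before printing the surviving rows.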

    l1_ref  = tebreak_dir + '/../lib/mask.L1.mm10.bed.gz'
    ltr_ref = tebreak_dir + '/../lib/mask.LTR.mm10.bed.gz'
    sine_ref = tebreak_dir + '/../lib/mask.SINE.mm10.bed.gz'
    map_ref = tebreak_dir + '/../lib/mm10.map50bp.bed.gz'

    inslib = None

    if args.insref:
        inslib = load_falib(args.insref)


    for fn in (l1_ref, ltr_ref, sine_ref):
        if not os.path.exists(fn): sys.exit('reference %s not found' % fn)
        if not os.path.exists(fn + '.tbi'): sys.exit('index for reference %s not found' %fn)

    tbx = {}
    tbx['L1']  = pysam.Tabixfile(l1_ref)
    tbx['LTR'] = pysam.Tabixfile(ltr_ref)
    tbx['SINE'] = pysam.Tabixfile(sine_ref)

    map_tbx = pysam.Tabixfile(map_ref)

    header = []
    with open(args.tabfile, 'r') as tab:
        for i, line in enumerate(tab):

            if i == 0: # header
                header = line.strip().split('\t')

                if args.realign and args.insref:
                    header += ['ExonerateRealign']

                if args.chimera:
                    header += ['ChimeraBaseCount', 'ChimeraMatchIns', 'ChimeraMatchRef', 'InsSiteHomology', 'PossibleRefEltChimera']

                print '\t'.join(header)

            else:
                rec = {}
                out = True
                for n, field in enumerate(line.strip().split('\t')):
                    rec[header[n]] = field

                #logger.debug(rec['UUID'])

                if int(rec['3p_Cons_Len']) < 120 and int(rec['5p_Cons_Len']) < 120:
                    logger.debug('Filtered %s: consensus length < %d' % (rec['UUID'], 120))
                    out = False

                if 'NA' in (rec['TE_Align_Start'], rec['TE_Align_End']):
                    logger.debug('Filtered %s: TE_Align_Start or TE_Align_End is "NA"' % rec['UUID'])
                    out = False

                ref_present = False

                if args.wideref:
                    ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx, extend=10000)
                else:
                    ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx)

                if ref_present and not args.ignore_ref_filter:
                    logger.debug('Filtered %s: proximity to reference TE of same superfamily' % rec['UUID']) 
                    out = False

                if max(float(rec['5p_Elt_Match']), float(rec['3p_Elt_Match'])) < 0.90:
                    logger.debug('Filtered %s: max(5p_Elt_Match, 3p_Elt_Match) < 0.90' % rec['UUID'])
                    out = False

                if max(float(rec['5p_Genome_Match']), float(rec['3p_Genome_Match'])) < 0.98:
                    logger.debug('Filtered %s: max(5p_Genome_Match, 3p_Genome_Match) < 0.98' % rec['UUID'])
                    out = False

                mapscore = avgmap(map_tbx, rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'])# * (max(int(rec['3p_Cons_Len']), int(rec['5p_Cons_Len']))/100.)
                if mapscore < 0.1:
                    logger.debug('Filtered %s: mappability of %f < 0.1' % (rec['UUID'], mapscore))
                    out = False

                if float(rec['Remapped_Discordant']) < 4:
                    logger.debug('Filtered %s: low discordant evidence (< 4 reads)' % rec['UUID'])
                    out = False

                if float(rec['Remap_Disc_Fraction']) < 0.25:
                    logger.debug('Filtered %s: low discordant evidence (%s < 25pct supporting)' % (rec['UUID'], rec['Remap_Disc_Fraction']))
                    out = False

                if rec['Insert_Consensus_5p'] == rec['Insert_Consensus_3p'] == 'NA':
                    logger.debug('Filtered %s: no insertion consensus mapped to insertion reference' % rec['UUID'])
                    out = False

                if args.lenfilter and out and len_filter(rec):
                    logger.debug('Filtered %s: TE length filter' % rec['UUID'])
                    out = False

                align_info = 'NA'

                if out and args.realign and args.insref:
                    align_info = realign_filter(rec, inslib)

                    if len(align_info) == 0:
                        out = False

                    well_aligned = False
                    for alignment in align_info:
                        seqtype, _, score, qstart, qend, tstart, tend, pi = alignment
                        tstart = int(tstart)
                        tend   = int(tend)
                        pi     = float(pi)

                        if pi >= 95.0 and abs(tend-tstart) >= 100:
                            well_aligned = True

                    if not well_aligned: out = False


                ins_site_homlen = 0 # insertion site homology length
                ins_site_homseq = 'NA' # sequence of overlapped region
                ch_ref_present = False
                ins_pct_match = 0.0
                ref_pct_match = 0.0

                if out and args.chimera:
                    if not args.refgenome:
                        sys.exit('--refgenome required in conjunction with --chimera')

                    if not args.insref:
                        sys.exit('--insref required in conjunction with --chimera')

                    ref = pysam.Fastafile(args.refgenome)

                    left  = int(rec['Left_Extreme']) - 1000
                    right = int(rec['Right_Extreme']) + 1000

                    if left < 0: left = 0

                    ref_seq = ref.fetch(rec['Chromosome'], left, right)

                    seqn = rec['Superfamily'] + ':' + rec['Subfamily']

                    ins_seq = inslib[seqn]

                    alignside = ''

                    ins_align = []
                    gen_align = []

                    if rec['Genomic_Consensus_3p'] != 'NA':
                        ins_align = align(rec['Genomic_Consensus_3p'], ins_seq, rec['Subfamily'])
                        gen_align = align(rec['Genomic_Consensus_3p'], ref_seq, 'Genomic')
                        alignside = 'Genomic_Consensus_3p'

                    else:
                        ins_align = align(rec['Genomic_Consensus_5p'], ins_seq, rec['Subfamily'])
                        gen_align = align(rec['Genomic_Consensus_5p'], ref_seq, 'Genomic')
                        alignside = 'Genomic_Consensus_5p'

                    ins_subcoords = None

                    if ins_align:
                        ins_subcoords = map(int, ins_align[2:4])

                    gen_subcoords = None

                    if gen_align:
                        gen_subcoords = map(int, gen_align[2:4])
                    else:
                        out = False

                    ol = None

                    if gen_subcoords is not None and ins_subcoords is not None:
                        ol = overlap(ins_subcoords, gen_subcoords)

                    if ol is not None:
                        ins_site_homlen = ol[1]-ol[0]
                        ins_site_homseq = rec[alignside][ol[0]:ol[1]]

                        ch_align_ins = align(ins_site_homseq, ins_seq, 'Ins')
                        ch_align_ref = align(ins_site_homseq, ref_seq, 'Ref')

                        if ch_align_ins:
                            ins_pct_match = ch_align_ins[-1]
                        if ch_align_ref:
                            ref_pct_match = ch_align_ref[-1]

                    # chimera with adjacent ref element check
                    ch_ref_present = ref_filter(rec['Chromosome'], rec['Left_Extreme'], rec['Right_Extreme'], rec['Superfamily'], tbx, extend=10000)

                if out:
                    fields = line.strip().split()

                    if args.insref and args.realign:
                        fields.append(','.join([';'.join(alignment) for alignment in align_info]))

                    if args.chimera:
                        fields.append(str(ins_site_homlen))
                        fields.append(str(ins_pct_match))
                        fields.append(str(ref_pct_match))
                        fields.append(ins_site_homseq)
                        fields.append(str(ch_ref_present))
                    
                    print '\t'.join(fields)