Exemplo n.º 1
0
# Load the feature-coordinate lookup only when a path was supplied. It is
# consulted below to find the exons on either side of each feature.
if feature_coords_fn:
    # Close the handle as soon as the object has been loaded instead of
    # leaving it open until garbage collection.
    with open(feature_coords_fn) as coords_file:
        feature_coords = load(coords_file)

print >> stderr, "Note: using FASTA file at %s" % fastafn

# NOTE(review): the end of this loop is outside the visible excerpt, so the
# handle is left as a plain open(); it should be closed once the loop is done.
f = open(gtffn)
junctions = set()
for row in f:  # for each row in the input GTF
    stripped = row.strip()
    # Blank lines and '#' header/comment lines have no feature columns;
    # indexing into them would raise IndexError, so skip them up front.
    if not stripped or stripped.startswith('#'):
        continue
    # Split once and reuse the columns (feature type is column 3, strand
    # is column 7 of the tab-separated row).
    fields = stripped.split('\t')
    # start_codon, stop_codon and CDS rows are not processed
    if fields[2] in ('start_codon', 'stop_codon', 'CDS'):
        continue
    if feature_coords_fn:  # if we have the resource to check whether this is a terminal exon or not
        strand = fields[6]
        feature, blank = parse_lines([stripped], strand, get_transcripts)
        is_terminal_feature, up_exon, down_exon = terminal_exon(
            list(feature)[0], feature_coords)
        if is_terminal_feature:  # if this is terminal ignore it
            continue
        # now we know that this is not a terminal exon so there must be neighbouring exons
        up_exon_gtf = region_to_GTF(up_exon, feature_coords[up_exon],
                                    get_transcripts)
        down_exon_gtf = region_to_GTF(down_exon, feature_coords[down_exon],
                                      get_transcripts)

        if get_introns:
            up_exon_five, up_exon_three = GTFrow_to_5p3pcoords(
                up_exon_gtf, offset, use_chromnames)
            # The unstripped row is passed here, exactly as before.
            five, three = GTFrow_to_5p3pcoords(row, offset, use_chromnames)
#!/home/paulk/software/bin/python
from __future__ import division
from sys import argv,exit,stderr
from subprocess import Popen,PIPE
from key_functions import parse_lines
import pysam 

f = open(argv[1])
t = pysam.Tabixfile("resources/Homo_sapiens.GRCh37.66.gtf.gz")
for row in f:
	if row[0] == 'u': continue
	l = row.strip().split('\t')
	regions = [l[0],l[1],l[5],l[6]]
	the_exons = ""
	for region in regions:
		result = t.fetch(region[3:-2])
		strand = region[-1]
		exons,no_lines = parse_lines(result,strand,get_transcripts=True)
		the_exons += ",".join(exons) + "\t"
	print row.strip()+"\t"+the_exons
f.close()
	
Exemplo n.º 3
0
get_introns = args.introns

# Load the feature-coordinate lookup only when a path was supplied. It is
# consulted below to find the exons on either side of each feature.
if feature_coords_fn:
	# Close the handle as soon as the object has been loaded instead of
	# leaving it open until garbage collection.
	with open(feature_coords_fn) as coords_file:
		feature_coords = load(coords_file)

print >> stderr,"Note: using FASTA file at %s" % fastafn

# NOTE(review): the end of this loop is outside the visible excerpt, so the
# handle is left as a plain open(); it should be closed once the loop is done.
f = open(gtffn)
junctions = set()
for row in f: # for each row in the input GTF
	stripped = row.strip()
	# Blank lines and '#' header/comment lines have no feature columns;
	# indexing into them would raise IndexError, so skip them up front.
	if not stripped or stripped.startswith('#'):
		continue
	# Split once and reuse the columns (feature type is column 3, strand
	# is column 7 of the tab-separated row).
	fields = stripped.split('\t')
	# start_codon, stop_codon and CDS rows are not processed
	if fields[2] in ('start_codon','stop_codon','CDS'):
		continue
	if feature_coords_fn: # if we have the resource to check whether this is a terminal exon or not
		strand = fields[6]
		feature,blank = parse_lines([stripped],strand,get_transcripts)
		is_terminal_feature,up_exon,down_exon = terminal_exon(list(feature)[0],feature_coords)
		if is_terminal_feature:	# if this is terminal ignore it
			continue
		# now we know that this is not a terminal exon so there must be neighbouring exons
		up_exon_gtf = region_to_GTF(up_exon,feature_coords[up_exon],get_transcripts)
		down_exon_gtf = region_to_GTF(down_exon,feature_coords[down_exon],get_transcripts)

		# This branch is indented with tabs only, like the rest of the loop
		# (it previously mixed a leading space with tabs).
		if get_introns:
			up_exon_five,up_exon_three = GTFrow_to_5p3pcoords(up_exon_gtf,offset,use_chromnames)
			five,three = GTFrow_to_5p3pcoords(row,offset,use_chromnames)
			down_exon_five,down_exon_three = GTFrow_to_5p3pcoords(down_exon_gtf,offset,use_chromnames)
			# One entry per feature: upstream exon 5' coordinate, this
			# feature's 3' and 5' coordinates, downstream exon 3' coordinate.
			junctions.add((up_exon_five,three,five,down_exon_three))
		else:
			five,three = GTFrow_to_5p3pcoords(row,offset,use_chromnames)
Exemplo n.º 4
0
	# Body of a per-row loop (the loop header is outside this excerpt): look up
	# the exons overlapping chrom:st-sp in the tabix file and record rows for
	# which nothing is found.
	# Rows whose chromosome is the placeholder '---' are skipped.
	if chrom == '---': continue
	# st and sp stay strings; they are only interpolated into the region below.
	st,sp = st_sp.split('-')
	if not chrom_names:	# if tabix needs 'chr' removed
		# Drops the first three characters unconditionally -- assumes every
		# name starts with 'chr'; TODO confirm.
		chrom = chrom[3:]
	else:	# if tabix can work with 'chr'
		pass
	# Rename the mitochondrial chromosome in either naming scheme.
	if chrom == "chrM": chrom = "chrMT"
	if chrom == "M": chrom = "MT"
	try:
		lines = tabix_file_ptr.fetch(region="%s:%s-%s" % (chrom,st,sp))
	except ValueError:
		# The lookup raised ValueError: record this row as missing and move on.
		ps_missing.append((int(l[0]),ps_detail[int(l[colno])]))
#		print l[0]
		continue
	
	gene_exons,no_lines = parse_lines(lines,sd,get_transcripts)
	# NOTE(review): this comparison is only true for an empty *list*. The
	# list(gene_exons) call further down suggests parse_lines may return another
	# container type (e.g. a set), in which case this branch never runs --
	# confirm against parse_lines.
	if gene_exons == []:
		ps_missing.append((int(l[0]),ps_detail[int(l[colno])]))
		#print l[0]
		continue
		
	# When reporting, write the row's key column and all its exons to h.
	if report:
		print >> h,l[colno]+"\t"+",".join(gene_exons)
#		for ge in gene_exons:
#			print >> h,l[colno]+"\t"+ge
#	"""
#	"""

	#if len(gene_exons) > 0: print >> g,"\t".join([",".join(gene_exons)]+l[colno+1:])
	# Pick one of the matching exons at random (the use of ge is outside this excerpt).
	if len(gene_exons) > 0:
		ge = random.choice(list(gene_exons))