def __init__(self, jncli):
    """Store a group of junctions and derive the two edge positions of its span.

    Args:
        jncli: Non-empty, indexable collection of junction objects. Each
            element is expected to expose a ``gs`` attribute; the ``gs`` of
            the first element becomes the span of this object.

    Attributes set:
        jncli: The junctions, frozen into a tuple.
        gs: Span taken from ``jncli[0].gs``.
        don: One-base segment covering the first position of ``gs``
            (presumably the donor side -- confirm against the class docs).
        acc: One-base segment covering the last position of ``gs``
            (presumably the acceptor side -- confirm against the class docs).
    """
    assert len(jncli) > 0
    self.jncli = tuple(jncli)

    span = jncli[0].gs
    self.gs = span

    chrom, strand = span.chrom, span.strand
    # Half-open coordinates: [start, start + 1) and [end - 1, end).
    self.don = GenomicSegment(chrom, span.start, span.start + 1, strand)
    self.acc = GenomicSegment(chrom, span.end - 1, span.end, strand)
def get_jnc(self, idx):
    """Return the ``Jnc`` describing the gap that follows segment ``idx - 1``.

    The segments of ``self.sgc`` are split into the first ``idx`` segments
    and the remainder; the junction runs from the end of the first part to
    the start of the second part.

    Args:
        idx: 1-based junction number, ``1 <= idx <= self.numjnc``.

    Returns:
        ``Jnc(segment, self, idx)`` where ``segment`` spans the gap.
    """
    assert 1 <= idx <= self.numjnc

    segments = self.sgc.segments
    gap_start = SegmentChain(*segments[:idx]).spanning_segment.end
    gap_end = SegmentChain(*segments[idx:]).spanning_segment.start

    gap = GenomicSegment(self.chrom, gap_start, gap_end, self.strand)
    return Jnc(gap, self, idx)
def test_jncsite1(self, bam):
    """Check the two junction sites assembled from MN996528:1-500 (+)."""
    assembler = JncAssembler(bam, window=1, minevi=1)
    assembler.get_jncs("MN996528", 1, 500, "+")
    assert len(assembler.jsdi) == 2

    # Site 130-330: four junction reads in total, three counted as evidence.
    first_key = GenomicSegment("MN996528", 130, 330, "+")
    first_site = assembler.jsdi[first_key]
    assert first_site.ntot == 4
    assert first_site.nevi == 3

    # Site 130-332: a single read, also checked through its BED export.
    second_key = GenomicSegment("MN996528", 130, 332, "+")
    second_site = assembler.jsdi[second_key]
    assert second_site.ntot == 1
    assert second_site.nevi == 1

    expected_bed = [
        'MN996528', '130', '332', 'MN996528:130-131^331-332(+)',
        '0', '+', '130', '332', '0,0,0', '2', '1,1,', '0,201,',
        '1', '1',
    ]
    assert second_site.as_bed() == expected_bed
def extend_gtf_frame(pickle):
    """Yield each cached transcript with a flanking segment added on both sides.

    The transcript cache is a dill file whose path is built from
    ``global_args.output_dir`` and the base name of
    ``global_args.annotation_file`` (last four characters removed, ``.sav``
    appended). If that file does not exist yet it is created by calling
    ``create_assembly_dill`` on the annotation file.

    For every transcript, two segments of width ``global_args.offset`` are
    added: one ending at the start of the transcript's spanning segment and
    one beginning at its end. The transcript is modified in place.

    Args:
        pickle: Not used by this function; kept so existing callers keep
            working.

    Yields:
        The transcripts loaded from the cache, after extension.
    """
    pickle_path = global_args.output_dir + \
        global_args.annotation_file[:-4].split("/")[-1] + ".sav"
    if not os.path.isfile(pickle_path):
        create_assembly_dill(global_args.annotation_file)

    # NOTE: dill.load can execute arbitrary code from the file; only use
    # cache files produced by this program.
    # The handle is closed as soon as the cache has been read.
    with open(pickle_path, "rb") as cache_fh:
        gtf_coords_file = list(dill.load(cache_fh))

    for transcript in gtf_coords_file:
        span = transcript.spanning_segment
        # NOTE(review): span.start - offset can become negative for
        # transcripts close to the start of a contig -- confirm that
        # downstream consumers tolerate this.
        new_region = GenomicSegment(span.chrom,
                                    span.start - global_args.offset,
                                    span.start,
                                    span.strand)
        new_region_2 = GenomicSegment(span.chrom,
                                      span.end,
                                      span.end + global_args.offset,
                                      span.strand)
        transcript.add_segments(new_region, new_region_2)
        yield transcript
def __init__(self, r):
    """Wrap a pysam alignment and mirror its aligned blocks as a SegmentChain.

    Args:
        r: A ``pysam.AlignedSegment``.

    Attributes set:
        r: The wrapped alignment.
        chrom: ``r.reference_name``.
        strand: ``'-'`` when ``r.is_reverse`` is true, otherwise ``'+'``.
        blocks: ``r.blocks``.
        tags: ``r.tags`` converted to a dict.
        sgc: SegmentChain holding one GenomicSegment per block.
    """
    assert isinstance(r, pysam.AlignedSegment)
    self.r = r
    self.chrom = r.reference_name
    if r.is_reverse:
        self.strand = '-'
    else:
        self.strand = '+'
    self.blocks = r.blocks
    self.tags = dict(r.tags)

    self.sgc = chain = SegmentChain()
    for block_start, block_end in r.blocks:
        block = GenomicSegment(self.chrom, block_start, block_end,
                               self.strand)
        chain.add_segments(block)
def main(args, loglevel):
    """Write a GFF3 file in which every ORF is replaced by a UTR-padded span.

    Each transcript read from ``args.gff_file`` is reduced to its spanning
    segment, which is then widened by ``args.upstream_utr_length`` on the
    5' side and ``args.downstream_utr_length`` on the 3' side (the two sides
    are swapped for features that are not on the ``+`` strand). The widened
    region is written to ``args.output_file`` as a new transcript carrying
    the attributes of the original one.

    Args:
        args: Namespace providing ``gff_file``, ``output_file``,
            ``upstream_utr_length`` and ``downstream_utr_length``.
        loglevel: Level passed to ``logging.basicConfig``.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    # Both handles are closed on exit, including when a record fails. The
    # input has to stay open while the assembler is being iterated.
    with open(args.gff_file) as gff_fh, \
            open(args.output_file, 'w') as output_file:
        for my_orf in GFF3_TranscriptAssembler(gff_fh):
            span = my_orf.spanning_segment
            if my_orf.strand == "+":
                left_pad = args.upstream_utr_length
                right_pad = args.downstream_utr_length
            else:
                left_pad = args.downstream_utr_length
                right_pad = args.upstream_utr_length

            # Clamp at 0 so that padding an ORF near the start of a contig
            # cannot produce a negative coordinate. The right-hand side
            # cannot be clamped because the contig length is not known here.
            new_region = GenomicSegment(span.chrom,
                                        max(0, span.start - left_pad),
                                        span.end + right_pad,
                                        span.strand)

            # copy metadata attributes from old ORF
            logging.debug(my_orf.attr)
            new_transcript = Transcript(new_region, **my_orf.attr)
            logging.debug(new_transcript)
            output_file.write(new_transcript.as_gff3())
def shift(self, read, genome):
    """Move the junction to its left-most equivalent placement.

        mapping1: AAAAA..............abcYXXBBBBB
        mapping2: AAAAAabc..............YXXBBBBB

    Converts mapping2 into mapping1: the read bases just before the
    junction start are compared, walking backwards, with the reference
    bases just before the junction end. Both junction coordinates are
    moved left by the number of consecutive matches (at most
    ``self.ARM_MINLEN``), and ``self.gs`` is replaced by the shifted
    segment.

    Args:
        read: Object whose ``fetch_sequence(start, end)`` returns the read
            sequence for a coordinate range (project type).
        genome: Object whose ``fetch(chrom, start, end)`` returns the
            reference sequence for a coordinate range.
    """
    arm = self.ARM_MINLEN
    junction = self.gs

    read_left = read.fetch_sequence(junction.start - arm, junction.start)
    ref_right = genome.fetch(junction.chrom,
                             junction.end - arm,
                             junction.end).upper()

    # Reverse both so that index 0 is the base adjacent to the junction.
    read_left_rev = read_left[::-1]
    ref_right_rev = ref_right[::-1]

    matched = 0
    while matched < arm and read_left_rev[matched] == ref_right_rev[matched]:
        matched += 1

    self.gs = GenomicSegment(junction.chrom,
                             junction.start - matched,
                             junction.end - matched,
                             junction.strand)
def test_segmentchain(sgc1):
    """Adding chr1:350-450 to ``sgc1`` must give chr1:100-200^300-450(+)."""
    extra_segment = GenomicSegment("chr1", 350, 450, "+")
    sgc1.add_segments(extra_segment)

    rendered = str(sgc1)
    assert rendered == "chr1:100-200^300-450(+)"
def sgc3():
    """Return a one-segment chain covering chr1:350-450 on the + strand."""
    lone_segment = GenomicSegment("chr1", 350, 450, "+")
    return SegmentChain(lone_segment)
def sgc2():
    """Return the two-exon chain "SGC2": chr1 1150-1200 and 1300-1450 (+)."""
    exon_bounds = ((1150, 1200), (1300, 1450))
    exons = [GenomicSegment("chr1", start, end, "+")
             for start, end in exon_bounds]
    return SegmentChain(*exons, ID="SGC2", alias="SGC2")
def sgc1():
    """Return the two-exon chain "SGC1": chr1 100-200 and 300-400 (+)."""
    return SegmentChain(
        GenomicSegment("chr1", 100, 200, "+"),
        GenomicSegment("chr1", 300, 400, "+"),
        ID="SGC1",
        alias="SGC1",
    )
def as_seg(hit):
    """Parse a tab-separated ``chrom, start, end, strand`` record.

    Args:
        hit: String with exactly four tab-separated fields.

    Returns:
        A GenomicSegment whose start and end are converted to ``int``.

    Raises:
        ValueError: If the record does not have exactly four fields or a
            coordinate is not an integer.
    """
    fields = hit.split("\t")
    chrom, start_text, end_text, strand = fields
    return GenomicSegment(chrom, int(start_text), int(end_text), strand)
def main(args, loglevel):
    """Write per-codon read counts for the CDS of every annotated transcript.

    Transcripts are read from ``args.gff`` (GFF3) or, if that is not set,
    ``args.gtf`` (GTF2). Alignments from ``args.bam`` are mapped by their
    5' or 3' end (exactly one of ``args.fiveprime`` / ``args.threeprime``
    must be set) with ``args.offset`` and filtered to read lengths between
    ``args.min_length`` and ``args.max_length``. For each codon of each CDS
    one tab-separated row is written to ``args.outfile``: transcript ID,
    gene ID(s), codon sequence, 1-based codon index, summed count and the
    count at each of the three codon positions.

    Transcripts with an empty CDS or with ``pseudo == "true"`` are skipped.
    A CDS whose length is not a multiple of three is padded with "A" bases.
    Unexpected start/stop codons (not in ``args.start_codons`` /
    ``args.stop_codons``) are logged as errors but still written.

    Args:
        args: Namespace with the attributes named above plus ``fasta`` and
            ``add_three``.
        loglevel: Level passed to ``logging.basicConfig``.

    Raises:
        SystemExit: With status 1 if the mapping type is not given exactly
            once, or if no annotation file is given.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    # Check the cheap command-line constraint before loading any input.
    if sum([args.threeprime, args.fiveprime]) != 1:
        logging.error("Must specify only one and at least one mapping type "
                      "(--fiveprime or --threeprime)")
        raise SystemExit(1)

    logging.debug("Building sequence dictionary")
    seq_dict = SeqIO.index(args.fasta, "fasta")

    logging.debug("Reading Annotations")
    if args.gff:
        assembler, annotation_path = GFF3_TranscriptAssembler, args.gff
    elif args.gtf:
        assembler, annotation_path = GTF2_TranscriptAssembler, args.gtf
    else:
        # Without this branch `transcripts` would be unbound further down.
        logging.error("No annotation file given: args.gff or args.gtf "
                      "is required")
        raise SystemExit(1)
    with open(annotation_path) as annotation_fh:
        transcripts = list(
            assembler(annotation_fh, add_three_for_stop=args.add_three))

    logging.debug("Reading Alignments")
    alignments = BAMGenomeArray([args.bam])
    if args.threeprime:
        alignments.set_mapping(ThreePrimeMapFactory(offset=args.offset))
    elif args.fiveprime:
        alignments.set_mapping(FivePrimeMapFactory(offset=args.offset))
    alignments.add_filter(
        "size", SizeFilterFactory(min=args.min_length, max=args.max_length))

    header = ("transcript_id", "gene_id", "codon_seq", "codon_index",
              "codon_count_sum", "position_1_count", "position_2_count",
              "position_3_count")

    # The output handle is closed (and flushed) even if a transcript fails.
    with open(args.outfile, 'w') as outfh:
        outfh.write("%s\n" % "\t".join(header))

        for i, transcript in enumerate(transcripts):
            if i == 0 or (i + 1) % 100 == 0:
                logging.info("Evaluated %s genes", i + 1)

            name = transcript.get_name()
            logging.debug(name)
            logging.debug(pprint.pformat(transcript.attr))

            transcript_cds = transcript.get_cds()
            if transcript_cds.get_length() <= 0:
                logging.info("Transcript %s has zero (0) length CDS, "
                             "skipping!", name)
                continue

            if transcript.attr.get("pseudo", None) == "true":
                logging.info("Transcript %s is a pseudogene, skipping!",
                             name)
                continue

            logging.debug('Transcript {} attributes: {}'.format(
                name, transcript.attr))

            # Many Ensembl MT annotations have incomplete codon records.
            # These are coded with an `ensembl_end_phase` attribute
            # These should be filled in with 'A's, which come from the
            # polyA tail
            transcript_seq = transcript_cds.get_sequence(seq_dict)
            cds_length = transcript_cds.get_length()
            end_phase = cds_length % 3
            if end_phase != 0:
                extra_bases = 3 - end_phase
                logging.warning("Transcript %s CDS length (%i) is not a "
                                "multiple of three, adding %i \"A\" bases",
                                name, cds_length, extra_bases)
                transcript_seq = transcript_seq + "A" * extra_bases
                last_segment = transcript_cds[-1]
                logging.debug(last_segment)
                # Extend the chain so the count vector is as long as the
                # padded sequence.
                # NOTE(review): the extension is always appended after
                # last_segment.end, independent of strand -- confirm this
                # is intended for minus-strand transcripts.
                transcript_cds.add_segments(
                    GenomicSegment(last_segment.chrom,
                                   last_segment.end,
                                   last_segment.end + extra_bases,
                                   last_segment.strand))

            num_codons = len(transcript_seq) // 3
            logging.debug("Trancript %s length %i basepairs, %i codons",
                          name, len(transcript_seq), num_codons)
            logging.debug('>{} {}\n{}'.format(name, transcript.get_gene(),
                                              transcript_seq.upper()))

            start_codon = transcript_seq[:3].upper()
            stop_codon = transcript_seq[-3:].upper()
            if start_codon not in args.start_codons:
                logging.error(
                    'Transcript {} start codon "{}" is not valid'.format(
                        name, start_codon))
            if stop_codon not in args.stop_codons:
                logging.error(
                    'Transcript {} stop codon "{}" is not valid'.format(
                        name, stop_codon))

            logging.debug(transcript_cds.as_gff3())
            transcript_counts = transcript_cds.get_counts(alignments)

            # Identical for every codon of the transcript: compute once.
            transcript_id = _strip_id_prefix(name)
            gene_ids = ",".join(_parent_gene_ids(transcript))

            for codon_index in range(1, num_codons + 1):
                codon_start = (codon_index - 1) * 3
                codon_stop = codon_start + 3
                codon_seq = transcript_seq[codon_start:codon_stop]
                codon_counts = transcript_counts[codon_start:codon_stop]
                codon_count_sum = sum(codon_counts)

                outfh.write("%s\t%s\t%s\t%i\t%i\t%i\t%i\t%i\n" %
                            (transcript_id, gene_ids, codon_seq.upper(),
                             codon_index, codon_count_sum, codon_counts[0],
                             codon_counts[1], codon_counts[2]))


def _strip_id_prefix(identifier):
    """Return *identifier* without its leading ``prefix:`` part, if any."""
    return identifier.split(":", 1)[-1]


def _parent_gene_ids(transcript):
    """Return the transcript's ``Parent`` IDs as a list without prefixes."""
    parents = transcript.attr.get("Parent", "")
    # A single ID stored as a plain string must not be split into characters.
    if isinstance(parents, str):
        parents = [parents] if parents else []
    # IDs without a prefix are kept unchanged, like the transcript ID.
    return [_strip_id_prefix(parent) for parent in parents]
def get_junc(self):
    """Return the segment between the end of ``larm`` and the start of ``rarm``.

    Chromosome and strand are taken from ``self.larm``.
    """
    left_arm = self.larm
    junction_start = left_arm.spanning_segment.end
    junction_end = self.rarm.spanning_segment.start
    return GenomicSegment(left_arm.chrom, junction_start, junction_end,
                          left_arm.strand)
def sgc(self):
    """Return a new SegmentChain with one segment per entry of ``self.blocks``.

    Each ``(start, end)`` pair in ``self.blocks`` becomes a GenomicSegment on
    ``self.chrom`` / ``self.strand``.
    """
    chain = SegmentChain()
    for block_start, block_end in self.blocks:
        segment = GenomicSegment(self.chrom, block_start, block_end,
                                 self.strand)
        chain.add_segments(segment)
    return chain