예제 #1
0
파일: subgenome.py 프로젝트: sqreb/cov2sg-1
 def __init__(self, jncli):
     """Store a non-empty collection of junctions and derive its end points.

     ``jncli`` is kept as a tuple. The ``gs`` segment of its first element
     becomes ``self.gs``; ``self.don`` and ``self.acc`` are one-base
     segments covering the first and the last position of that segment.
     """
     assert len(jncli) > 0
     self.jncli = tuple(jncli)
     ref = jncli[0].gs
     self.gs = ref
     chrom, strand = ref.chrom, ref.strand
     # One-base segment at the start of the reference segment.
     self.don = GenomicSegment(chrom, ref.start, ref.start + 1, strand)
     # One-base segment at the end of the reference segment.
     self.acc = GenomicSegment(chrom, ref.end - 1, ref.end, strand)
예제 #2
0
파일: subgenome.py 프로젝트: sqreb/cov2sg-1
 def get_jnc(self, idx):
     """Return the ``Jnc`` located between segment ``idx - 1`` and ``idx``.

     ``idx`` is 1-based and must not exceed ``self.numjnc``. The junction
     runs from the end of the span of the first ``idx`` segments to the
     start of the span of the remaining segments.
     """
     assert 1 <= idx <= self.numjnc
     segments = self.sgc.segments
     head_end = SegmentChain(*segments[:idx]).spanning_segment.end
     tail_start = SegmentChain(*segments[idx:]).spanning_segment.start
     junction = GenomicSegment(self.chrom, head_end, tail_start, self.strand)
     return Jnc(junction, self, idx)
예제 #3
0
 def test_jncsite1(self, bam):
     """Check the two junction sites assembled for MN996528:1-500 (+)."""
     assembler = JncAssembler(bam, window=1, minevi=1)
     assembler.get_jncs("MN996528", 1, 500, "+")
     assert len(assembler.jsdi) == 2

     # Site spanning 130-330.
     site_330 = assembler.jsdi[GenomicSegment("MN996528", 130, 330, "+")]
     assert site_330.ntot == 4
     assert site_330.nevi == 3

     # Site spanning 130-332.
     site_332 = assembler.jsdi[GenomicSegment("MN996528", 130, 332, "+")]
     assert site_332.ntot == 1
     assert site_332.nevi == 1

     expected_bed = [
         'MN996528', '130', '332', 'MN996528:130-131^331-332(+)', '0', '+',
         '130', '332', '0,0,0', '2', '1,1,', '0,201,', '1', '1'
     ]
     assert site_332.as_bed() == expected_bed
예제 #4
0
def extend_gtf_frame(pickle):
    """Yield every cached transcript with a flank added on each side.

    The cache file is ``<output_dir><annotation basename>.sav`` (the last
    four characters of ``global_args.annotation_file`` are stripped before
    the basename is taken). When the cache does not exist it is first built
    with ``create_assembly_dill``. Each transcript then receives two extra
    segments of ``global_args.offset`` bases: one ending at the start of its
    spanning segment and one beginning at its end.

    Args:
        pickle: Unused; kept so existing callers keep working.

    Yields:
        The transcripts loaded from the cache, modified in place.
    """
    annotation_name = global_args.annotation_file[:-4].split("/")[-1]
    pickle_path = global_args.output_dir + annotation_name + ".sav"

    if not os.path.isfile(pickle_path):
        create_assembly_dill(global_args.annotation_file)

    # NOTE(review): dill.load can execute arbitrary code; only load cache
    # files this tool wrote itself.
    # Load everything up front so the handle is closed before iteration.
    with open(pickle_path, "rb") as cache_file:
        gtf_coords_file = list(dill.load(cache_file))

    offset = global_args.offset
    for transcript in gtf_coords_file:
        span = transcript.spanning_segment
        # NOTE(review): the left flank start goes negative when
        # span.start < offset -- confirm this cannot happen upstream.
        left_flank = GenomicSegment(span.chrom, span.start - offset,
                                    span.start, span.strand)
        right_flank = GenomicSegment(span.chrom, span.end,
                                     span.end + offset, span.strand)
        transcript.add_segments(left_flank, right_flank)
        yield transcript
예제 #5
0
 def __init__(self, r):
     """Wrap a ``pysam.AlignedSegment`` and mirror it as a segment chain.

     Stores the read, its reference name, its strand ('-' when the read is
     reverse, '+' otherwise), its aligned blocks, its tags as a dict, and a
     ``SegmentChain`` holding one ``GenomicSegment`` per aligned block.
     """
     assert isinstance(r, pysam.AlignedSegment)
     self.r = r
     self.chrom = r.reference_name
     if r.is_reverse:
         self.strand = '-'
     else:
         self.strand = '+'
     self.blocks = r.blocks
     self.tags = dict(r.tags)
     self.sgc = SegmentChain()
     for block_start, block_end in r.blocks:
         block = GenomicSegment(self.chrom, block_start, block_end,
                                self.strand)
         self.sgc.add_segments(block)
예제 #6
0
def main(args, loglevel):
    """Write a GFF3 file in which every ORF span is widened by UTR lengths.

    Each transcript read from ``args.gff_file`` is replaced by a
    single-segment ``Transcript`` built from its spanning segment, extended
    by ``args.upstream_utr_length`` and ``args.downstream_utr_length``. On
    the "+" strand the upstream length is subtracted from the start and the
    downstream length added to the end; on any other strand the two lengths
    are swapped. The original attributes are copied to the new transcript.

    Args:
        args: Namespace providing ``gff_file``, ``output_file``,
            ``upstream_utr_length`` and ``downstream_utr_length``.
        loglevel: Level passed to ``logging.basicConfig``.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    # The assembler is consumed lazily, so the input must stay open for the
    # whole loop; the context manager closes it afterwards.
    with open(args.gff_file) as gff_file:
        ORFS = GFF3_TranscriptAssembler(gff_file)
        with open(args.output_file, 'w') as output_file:
            for my_orf in ORFS:
                span = my_orf.spanning_segment
                if my_orf.strand == "+":
                    left_pad = args.upstream_utr_length
                    right_pad = args.downstream_utr_length
                else:
                    left_pad = args.downstream_utr_length
                    right_pad = args.upstream_utr_length
                new_region = GenomicSegment(span.chrom,
                                            span.start - left_pad,
                                            span.end + right_pad,
                                            span.strand)
                # copy metadata attributes from old ORF
                logging.debug(my_orf.attr)
                new_transcript = Transcript(new_region, **my_orf.attr)
                logging.debug(new_transcript)
                output_file.write(new_transcript.as_gff3())
예제 #7
0
파일: subgenome.py 프로젝트: sqreb/cov2sg-1
 def shift(self, read, genome):
     """
     mapping1: AAAAA..............abcYXXBBBBB
     mapping2: AAAAAabc..............YXXBBBBB
     Convert mapping2 to mapping1

     Compares, base by base and walking leftwards, the ``ARM_MINLEN`` read
     bases that end at the junction start with the ``ARM_MINLEN`` reference
     bases that end at the junction end. Both junction coordinates are then
     moved left by the number of leading matches and ``self.gs`` is
     replaced with the shifted segment.
     """
     arm = self.ARM_MINLEN
     read_left = read.fetch_sequence(self.gs.start - arm, self.gs.start)
     ref_right = genome.fetch(self.gs.chrom, self.gs.end - arm,
                              self.gs.end).upper()
     # Reverse both so that index 0 is the base adjacent to the junction.
     read_tail = read_left[::-1]
     ref_tail = ref_right[::-1]
     shift_len = 0
     while shift_len < arm and read_tail[shift_len] == ref_tail[shift_len]:
         shift_len += 1
     self.gs = GenomicSegment(self.gs.chrom, self.gs.start - shift_len,
                              self.gs.end - shift_len, self.gs.strand)
예제 #8
0
def test_segmentchain(sgc1):
    """Adding chr1:350-450 to ``sgc1`` must yield chr1:100-200^300-450(+)."""
    extra_segment = GenomicSegment("chr1", 350, 450, "+")
    sgc1.add_segments(extra_segment)
    rendered = str(sgc1)
    assert rendered == "chr1:100-200^300-450(+)"
예제 #9
0
def sgc3():
    """Return a one-segment chain covering chr1:350-450 on the "+" strand."""
    segment = GenomicSegment("chr1", 350, 450, "+")
    return SegmentChain(segment)
예제 #10
0
def sgc2():
    """Return the two-segment chain "SGC2" (chr1:1150-1200 and 1300-1450, +)."""
    bounds = ((1150, 1200), (1300, 1450))
    exons = [GenomicSegment("chr1", start, end, "+") for start, end in bounds]
    return SegmentChain(*exons, ID="SGC2", alias="SGC2")
예제 #11
0
def sgc1():
    """Return the two-segment chain "SGC1" (chr1:100-200 and 300-400, +)."""
    bounds = ((100, 200), (300, 400))
    exons = [GenomicSegment("chr1", start, end, "+") for start, end in bounds]
    return SegmentChain(*exons, ID="SGC1", alias="SGC1")
예제 #12
0
파일: sj.py 프로젝트: sqreb/cov2sg-1
 def as_seg(hit):
     """Turn a tab-separated ``chrom, start, end, strand`` line into a segment.

     ``hit`` must contain exactly four tab-separated fields; ``start`` and
     ``end`` are converted to ``int``.
     """
     fields = hit.split("\t")
     chrom, start, end, strand = fields
     start, end = int(start), int(end)
     return GenomicSegment(chrom, start, end, strand)
예제 #13
0
def main(args, loglevel):
    """Write per-codon read counts for every annotated CDS to ``args.outfile``.

    Transcripts are read from ``args.gff`` (GFF3) or, failing that, from
    ``args.gtf`` (GTF2). Reads come from ``args.bam`` and are mapped with
    either the five-prime or the three-prime mapping factory, using
    ``args.offset``, and filtered to ``args.min_length``..``args.max_length``.
    Transcripts with an empty CDS or a ``pseudo`` attribute of "true" are
    skipped. A CDS whose length is not a multiple of three is padded with
    "A" bases, and a matching segment is appended to the CDS chain. One
    tab-separated row is written per codon.

    Args:
        args: Namespace providing ``fasta``, ``gff``, ``gtf``, ``add_three``,
            ``bam``, ``threeprime``, ``fiveprime``, ``offset``,
            ``min_length``, ``max_length``, ``outfile``, ``start_codons``
            and ``stop_codons``.
        loglevel: Level passed to ``logging.basicConfig``.

    Exits with status 1 when no annotation file is given or when not exactly
    one of ``args.threeprime`` / ``args.fiveprime`` is set.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    logging.debug("Building sequence dictionary")
    seq_dict = SeqIO.index(args.fasta, "fasta")
    logging.debug("Reading Annotations")

    # The assemblers are fully consumed by list(), so each annotation file
    # can be closed as soon as its transcripts are loaded.
    if args.gff:
        with open(args.gff) as annotation_file:
            transcripts = list(
                GFF3_TranscriptAssembler(annotation_file,
                                         add_three_for_stop=args.add_three))
    elif args.gtf:
        with open(args.gtf) as annotation_file:
            transcripts = list(
                GTF2_TranscriptAssembler(annotation_file,
                                         add_three_for_stop=args.add_three))
    else:
        # Without this branch ``transcripts`` would be unbound and the loop
        # below would fail with a NameError.
        logging.error("No annotation file given: one of args.gff or "
                      "args.gtf is required")
        exit(1)

    logging.debug("Reading Alignments")
    alignments = BAMGenomeArray([args.bam])

    if sum([args.threeprime, args.fiveprime]) != 1:
        logging.error("Must specify only one and at least one mapping type "
                      "(--fiveprime or --threeprime)")
        exit(1)

    if args.threeprime:
        alignments.set_mapping(ThreePrimeMapFactory(offset=args.offset))
    elif args.fiveprime:
        alignments.set_mapping(FivePrimeMapFactory(offset=args.offset))

    alignments.add_filter(
        "size", SizeFilterFactory(min=args.min_length, max=args.max_length))

    # The context manager guarantees the report is flushed and closed, even
    # when a transcript raises part-way through.
    with open(args.outfile, 'w') as outfh:
        outfh.write("%s\n" % "\t".join(
            ("transcript_id", "gene_id", "codon_seq", "codon_index",
             "codon_count_sum", "position_1_count", "position_2_count",
             "position_3_count")))
        for (i, transcript) in enumerate(transcripts):
            if (i == 0 or (i + 1) % 100 == 0):
                logging.info("Evaluated %s genes", i + 1)
            name = transcript.get_name()
            logging.debug(name)
            logging.debug(pprint.pformat(transcript.attr))
            transcript_cds = transcript.get_cds()
            if (transcript_cds.get_length() <= 0):
                logging.info(
                    "Transcript %s has zero (0) length CDS, skipping!", name)
                continue
            if transcript.attr.get("pseudo", None) == "true":
                logging.info("Transcript %s is a pseudogene, skipping!", name)
                continue
            logging.debug('Transcript {} attributes: {}'.format(
                name, transcript.attr))

            # Many Ensembl MT annotations have incomplete codon records.
            # These are coded with an `ensembl_end_phase` attribute
            # These should be filled in with 'A's, which come from the
            # polyA tail
            transcript_seq = transcript_cds.get_sequence(seq_dict)

            end_phase = transcript_cds.get_length() % 3
            if end_phase != 0:
                extra_bases = 3 - end_phase
                logging.warning(
                    "Transcript %s CDS length (%i) is not a multiple "
                    "of three, adding %i \"A\" bases",
                    name, transcript_cds.get_length(), extra_bases)
                transcript_seq = transcript_seq + "A" * extra_bases
                # NOTE(review): the padding segment is appended after the
                # last segment's end regardless of strand -- confirm this is
                # the intended side for "-" strand transcripts.
                last_segment = transcript_cds[-1]
                logging.debug(last_segment)
                transcript_cds.add_segments(
                    GenomicSegment(last_segment.chrom, last_segment.end,
                                   last_segment.end + extra_bases,
                                   last_segment.strand))

            num_codons = len(transcript_seq) // 3
            logging.debug("Trancript %s length %i basepairs, %i codons",
                          name, len(transcript_seq), num_codons)
            logging.debug('>{} {}\n{}'.format(name, transcript.get_gene(),
                                              transcript_seq.upper()))

            start_codon = transcript_seq[:3].upper()
            stop_codon = transcript_seq[-3:].upper()
            if start_codon not in args.start_codons:
                logging.error(
                    'Transcript {} start codon "{}" is not valid'.format(
                        name, start_codon))
            if stop_codon not in args.stop_codons:
                logging.error(
                    'Transcript {} stop codon "{}" is not valid'.format(
                        name, stop_codon))
            logging.debug(transcript_cds.as_gff3())

            transcript_counts = transcript_cds.get_counts(alignments)

            # The identifiers are the same for every codon of a transcript,
            # so they are computed once instead of once per codon.
            transcript_id = name
            if ":" in transcript_id:
                transcript_id = transcript_id.split(":", 1)[1]
            # NOTE(review): parent IDs without a ":" prefix are dropped
            # here -- confirm that is intended.
            gene_ids = [
                gene_id_raw.split(":", 1)[1]
                for gene_id_raw in transcript.attr.get("Parent", "")
                if ":" in gene_id_raw
            ]
            gene_id_field = ",".join(gene_ids)

            for codon_index in range(1, num_codons + 1):
                codon_start = (codon_index - 1) * 3
                codon_stop = codon_start + 3
                codon_seq = transcript_seq[codon_start:codon_stop]
                codon_counts = transcript_counts[codon_start:codon_stop]
                codon_count_sum = sum(codon_counts)
                outfh.write("%s\t%s\t%s\t%i\t%i\t%i\t%i\t%i\n" %
                            (transcript_id, gene_id_field, codon_seq.upper(),
                             codon_index, codon_count_sum, codon_counts[0],
                             codon_counts[1], codon_counts[2]))
예제 #14
0
 def get_junc(self):
     """Return the segment lying between the two arms.

     The segment starts at the end of ``self.larm``'s spanning segment and
     ends at the start of ``self.rarm``'s spanning segment; chromosome and
     strand are taken from ``self.larm``.
     """
     left_arm = self.larm
     junction_start = left_arm.spanning_segment.end
     junction_end = self.rarm.spanning_segment.start
     return GenomicSegment(left_arm.chrom, junction_start, junction_end,
                           left_arm.strand)
예제 #15
0
파일: subgenome.py 프로젝트: sqreb/cov2sg-1
 def sgc(self):
     """Build a ``SegmentChain`` with one segment per entry of ``self.blocks``.

     Every ``(start, end)`` pair becomes a ``GenomicSegment`` on
     ``self.chrom`` / ``self.strand`` and is added to a new chain.
     """
     chain = SegmentChain()
     block_segments = (
         GenomicSegment(self.chrom, block_start, block_end, self.strand)
         for block_start, block_end in self.blocks
     )
     for segment in block_segments:
         chain.add_segments(segment)
     return chain