Exemplo n.º 1
0
    def __set_fasta_seq(self, fasta_seq):
        """Store the sequence of this object as a :class:`pyfaidx.Sequence`.

        :param fasta_seq: the sequence, given either as a ``Bio.Seq.Seq``,
            a plain ``str`` or an already built ``pyfaidx.Sequence``.
            The first two are wrapped in a new ``pyfaidx.Sequence`` named
            after ``self.id``; the latter is stored as it is.
        :raises ValueError: if ``fasta_seq`` is of any other type.
        """

        if isinstance(fasta_seq, (Seq.Seq, str)):
            # Both types are converted the same way: through their string form.
            self.__fasta_seq = pyfaidx.Sequence(name=self.id, seq=str(fasta_seq))
        elif isinstance(fasta_seq, pyfaidx.Sequence):
            self.__fasta_seq = fasta_seq
        else:
            # ValueError (rather than TypeError) is kept for backward
            # compatibility with callers that already catch it.
            raise ValueError("Unknown type: {}".format(type(fasta_seq)))
Exemplo n.º 2
0
def tests():
    """Smoke-test ``connectFragment`` by printing the outcome of several merges.

    Nothing is asserted: each step builds ``pyfaidx.Sequence`` fragments,
    passes them to ``connectFragment`` and prints the returned sequence with
    its coordinates so that the output can be inspected by eye.
    """

    def fragment(bases, start, end, contig='contig1'):
        # Build a fragment carrying explicit start/end coordinates.
        return pyfaidx.Sequence(contig, bases, start=start, end=end)

    def show(seq, *extra):
        # Print the sequence, its coordinates and any additional value.
        print(seq, seq.start, seq.end, *extra)

    # --- Scenario 1: grow a fragment located at 1-6 ---
    merged = fragment('ATGCGT', 1, 6)
    show(merged)

    # Connect fragment after original (13-18 lies after 1-6)
    later = fragment('GCGAGT', 13, 18)
    merged = connectFragment(merged, later)
    show(merged)

    # Wrong contig: the fragment name differs from that of the original
    foreign = fragment('CCCCCC', 7, 12, contig='different_contig1')
    merged = connectFragment(merged, foreign)

    # Substitute fragment in centre (7-12, this time on the right contig)
    central = fragment('CCCCCC', 7, 12)
    merged = connectFragment(merged, central)
    show(merged)

    # --- Scenario 2: start from the fragment located at 13-18 ---
    earlier = fragment('ATGCGT', 1, 6)
    merged = fragment('GCGAGT', 13, 18)
    merged.percentIdentity = 97.1

    # Connect fragment before original
    merged = connectFragment(merged, earlier)
    show(merged, merged.percentIdentity)

    # Add a fragment (5-7) that carries its own percentIdentity
    scored = fragment('CCC', 5, 7)
    scored.percentIdentity = 97.2
    merged = connectFragment(merged, scored)
    show(merged, merged.percentIdentity)

    # Add a last fragment (8-12) and report the final length
    filler = fragment('AATTT', 8, 12)
    merged = connectFragment(merged, filler)
    show(merged, len(merged))
Exemplo n.º 3
0
    def test_init(self):
        """Check the initialisation of ``TranscriptChecker``.

        The test verifies that:

        - a missing sequence or an invalid ``canonical_splices`` value is
          rejected with a ``ValueError``;
        - a valid model/sequence pair yields the expected cDNA length, exons
          and stored sequence;
        - the sequence may be given as ``Bio.Seq.Seq``, ``str`` or
          ``pyfaidx.Sequence``;
        - the model may be given as the first line of its GTF/GFF3 rendering.
        """

        # The sequence is mandatory.
        with self.assertRaises(ValueError):
            TranscriptChecker(self.model, None)

        # Each invalid value gets its own subTest so that one failure does not
        # hide the outcome of the remaining values.
        for wrong_splices in ["AGGT", None, 100]:
            with self.subTest(canonical_splices=wrong_splices):
                with self.assertRaises(ValueError):
                    TranscriptChecker(self.model, self.model_fasta,
                                      canonical_splices=wrong_splices)

        tcheck = TranscriptChecker(self.model, self.model_fasta)
        self.assertEqual(tcheck.cdna_length, 1718)
        self.assertEqual(sorted(tcheck.exons), sorted([(exon.start, exon.end) for exon in self.exons]))
        self.assertEqual(tcheck.fasta_seq, self.model_fasta)

        # All the supported sequence types must be accepted by the initialiser.
        with self.subTest(initializer=Bio.Seq.Seq):
            _ = TranscriptChecker(self.model, Bio.Seq.Seq(str(self.model_fasta)))

        with self.subTest(initializer=str):
            _ = TranscriptChecker(self.model, str(self.model_fasta))

        with self.subTest(initializer=pyfaidx.Sequence):
            _ = TranscriptChecker(self.model, pyfaidx.Sequence(seq=str(self.model_fasta), name=tcheck.id))

        # Now check initializing with a GFF/GTF line
        for out_format in ["gtf", "gff3"]:
            with self.subTest(out_format=out_format):
                line = self.model.format(out_format).split("\n")[0]
                try:
                    TranscriptChecker(line, self.model_fasta)
                except ValueError as exc:
                    # Report the offending line, keeping the original error
                    # attached as the explicit cause.
                    raise ValueError(line) from exc
Exemplo n.º 4
0
def match_seq(rec: pd.Series, sequences: pyfaidx.Fasta) -> pyfaidx.Sequence:
    """Extract the DNA sequence of one annotated feature.

    The region ``rec["start"]``-``rec["end"]`` of ``rec["seqname"]`` is fetched
    from `sequences` (reverse-complemented when ``rec["strand"]`` is ``"-"``)
    and wrapped in a new :class:`~pyfaidx.Sequence` whose name is
    ``<gene_name>_<feature>_<strand>_<start>_<end>_<seq_hash>``.

    Parameters
    ----------
    rec : :class:`~pandas.Series`
        Information for a feature (i.e. gene, exon, etc...). Requires the following indices:
        seqname, gene_name, feature, strand, start, end, seq_hash
    sequences : :class:`~pyfaidx.Fasta`
        Object containing the sequences from which the region is fetched.

    Returns
    -------
    :class:`~pyfaidx.Sequence` named after the annotation in `rec` and holding the bases fetched
    from `sequences`. If a ``ValueError`` is raised while building it, a message is printed and
    ``None`` is returned instead.
    """

    label_fields = ("gene_name", "feature", "strand", "start", "end", "seq_hash")

    try:
        on_minus: bool = bool(rec["strand"] == "-")

        label = "_".join(f"{rec[field]}" for field in label_fields)
        fetched = sequences.get_seq(
            name=rec["seqname"],
            start=rec["start"],
            end=rec["end"],
            rc=on_minus,
        )
        return pyfaidx.Sequence(name=label, seq=fetched.seq)
    except ValueError:
        print(f"problem with {rec['gene_name']} {rec['start']} "
              f"{rec['end']} {rec['seqname']} {rec['strand']}")
        return None
Exemplo n.º 5
0
    def pad_transcripts(self):

        """
        Extend the terminal exons of the transcripts so that transcripts whose
        ends are close to each other share the same boundary.

        The method works in three steps:

        1. transcripts are grouped through the graphs built by
           ``define_graph`` (one for each end);
        2. within each group, the transcript reaching furthest out is taken as
           reference; any other transcript that ends within
           ``ts_distance`` of it, with fewer than ``ts_max_splices`` splice
           sites in between, is marked for extension up to the reference
           coordinate;
        3. each marked transcript is copied, its first and/or last exon is
           stretched, its ORFs (if any) are expanded on the new cDNA and the
           copy replaces the original in ``self.transcripts``.

        "5'" and "3'" below refer to the start and end side of the genomic
        coordinates; the upstream/downstream amounts are swapped for
        transcripts on the minus strand before expanding the ORFs.

        :raises KeyError: if the reference genome cannot be retrieved from
            the configuration.
        """

        try:
            self.fai = pyfaidx.Fasta(self.json_conf["reference"]["genome"])
        except KeyError as exc:
            raise KeyError(self.json_conf.keys()) from exc

        five_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=False)
        three_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=True)

        five_comm = deque(sorted(self.find_communities(five_graph),
                                 key=lambda clique: min(self[_].start for _ in clique)))
        # NOTE(review): the 5' side uses find_communities while the 3' side
        # uses find_cliques - confirm that the asymmetry is intended.
        three_comm = deque(sorted(self.find_cliques(three_graph),
                                  key=lambda clique: max(self[_].end for _ in clique),
                                  reverse=True))

        splicing_conf = self.json_conf["pick"]["alternative_splicing"]
        ts_distance = splicing_conf["ts_distance"]
        ts_max_splices = splicing_conf["ts_max_splices"]

        # tid -> [new start, new end]; False means "leave this end untouched".
        to_modify = dict()

        # First do the 5' end
        five_found = set()

        while five_comm:

            comm = five_comm.popleft()
            comm = deque(sorted(set(comm).difference(five_found),
                                key=lambda tid: self[tid].start))
            # Nothing to harmonise with fewer than two transcripts. The group
            # can also be empty, when all its members were already processed.
            if len(comm) < 2:
                continue
            first = comm.popleft()
            five_found.add(first)
            comm_start = self[first].start
            for tid in comm:
                tid_start = self[tid].start
                if (tid_start > comm_start and
                        (tid_start - comm_start + 1) < ts_distance and
                        sum(1 for splice in self.splices
                            if comm_start <= splice <= tid_start) < ts_max_splices):
                    to_modify[tid] = [comm_start, False]
                    five_found.add(tid)
            # Transcripts that could not be extended are evaluated again,
            # using the outermost of them as the new reference.
            comm = deque(_ for _ in comm if _ not in five_found)

            if comm:
                five_comm.appendleft(comm)

        # Then do the 3' end
        three_found = set()

        while three_comm:

            comm = three_comm.popleft()
            comm = deque(sorted(set(comm).difference(three_found),
                                key=lambda tid: self[tid].end, reverse=True))
            if len(comm) < 2:
                continue
            first = comm.popleft()
            three_found.add(first)
            comm_end = self[first].end
            for tid in comm:
                tid_end = self[tid].end
                # comm_end is the largest end of the group, hence the gap to
                # cover is comm_end - tid_end.
                if (tid_end < comm_end and
                        (comm_end - tid_end + 1) < ts_distance and
                        sum(1 for splice in self.splices
                            if tid_end <= splice <= comm_end) < ts_max_splices):

                    if tid in to_modify:
                        to_modify[tid][1] = comm_end
                    else:
                        to_modify[tid] = [False, comm_end]

                    three_found.add(tid)
            comm = deque(_ for _ in comm if _ not in three_found)
            if comm:
                three_comm.appendleft(comm)

        # Now we can do the proper modification
        for tid, (new_start, new_end) in to_modify.items():
            new_transcript = self[tid].copy()
            old_length = new_transcript.cdna_length
            # First get the ORFs
            if new_transcript.combined_cds_length > 0:
                internal_orfs = list(new_transcript.get_internal_orf_beds())
            else:
                internal_orfs = []
            # Remove the CDS and unfinalize
            new_transcript.strip_cds()
            new_transcript.unfinalize()

            upstream = 0
            downstream = 0
            if new_start:
                # Stretch the first exon back to the new start.
                new_exon = (new_start, new_transcript.exons[0][1])
                upstream = new_transcript.start - new_start
                new_transcript.start = new_start
                new_transcript.remove_exon(new_transcript.exons[0])
                new_transcript.add_exon(new_exon)
                new_transcript.exons = sorted(new_transcript.exons)
            if new_end:
                # Stretch the last exon forward to the new end.
                new_exon = (new_transcript.exons[-1][0], new_end)
                downstream = new_end - new_transcript.end
                new_transcript.end = new_end
                new_transcript.remove_exon(new_transcript.exons[-1])
                new_transcript.add_exon(new_exon)
                new_transcript.exons = sorted(new_transcript.exons)
            # Now for the difficult part
            if internal_orfs and (new_end or new_start):
                self.logger.warning("Enlarging the ORFs for TID %s (%s)",
                                    tid, to_modify[tid])

                new_orfs = []
                # Rebuild the cDNA from the (extended) exons; coordinates are
                # converted from 1-based inclusive to 0-based slices.
                seq = "".join(self.fai[self.chrom][exon[0] - 1:exon[1]].seq
                              for exon in new_transcript.exons)
                seq = pyfaidx.Sequence(tid, seq)
                self.logger.warning("For TID %s we have new length %d, old length %d, exons:\n%s",
                                    tid, len(seq), old_length, new_transcript.exons)
                if self.strand == "-":
                    # On the minus strand the cDNA is the reverse complement
                    # and the two flanks exchange their role.
                    seq = seq.reverse.complement
                    upstream, downstream = downstream, upstream
                for orf in internal_orfs:
                    self.logger.warning("Old ORF: %s", str(orf))
                    orf.expand(seq, upstream, downstream)
                    self.logger.warning("New ORF: %s", str(orf))
                    new_orfs.append(orf)
                # NOTE(review): the transcript logger is replaced by a
                # temporary one set to DEBUG while loading the ORFs; this
                # looks like a debugging aid - confirm whether it should stay.
                from ..utilities.log_utils import create_default_logger
                new_transcript.logger = create_default_logger("TEMP")
                new_transcript.logger.setLevel("DEBUG")
                new_transcript.load_orfs(new_orfs)
                new_transcript.logger.setLevel("WARNING")

            # Now finalize again
            new_transcript.finalize()
            self.transcripts[tid] = new_transcript