Exemplo n.º 1
0
def fetch(args):
    """Write the requested regions of a FASTA file to standard output.

    Regions are taken from ``args.regions`` and, when ``args.list`` is set,
    from that already-open file (one region per line). Each region is either
    ``rname`` or ``rname:start-end`` with 1-based, inclusive coordinates; any
    text after the first whitespace of a region is ignored. A region whose
    interval is missing or cannot be parsed yields the whole sequence.

    Args:
        args: parsed command-line namespace providing ``fasta``, ``regions``,
            ``list``, ``complement``, ``reverse`` and ``name``.
    """
    fasta = Fasta(args.fasta)
    try:
        # Copy so that the caller's ``args.regions`` list is not modified.
        regions = list(args.regions)
        if args.list:
            with args.list as listfile:
                regions.extend(entry.rstrip() for entry in listfile)

        for region in regions:
            fields = region.split()
            if not fields:
                # Blank entry (e.g. an empty line in the list file).
                continue
            region = fields[0]

            try:
                rname, interval = region.split(':')
            except ValueError:
                rname = region
                interval = None

            try:
                start, end = interval.split('-')
                sequence = fasta[rname][int(start) - 1:int(end)]
            except (AttributeError, ValueError):
                # No interval given (None has no .split) or it is malformed:
                # fall back to the full record.
                sequence = fasta[rname][:]

            if args.complement:
                sequence = sequence.complement
            if args.reverse:
                sequence = sequence.reverse

            # Wrap the output using the 'lenc' value stored in the index
            # entry of this record.
            line_len = fasta[rname]._fa.faidx.index[rname]['lenc']
            if args.name:
                sys.stdout.write('>' + sequence.name + '\n')
            for line in wrap_sequence(line_len, sequence.seq):
                sys.stdout.write(line)
    finally:
        # Release the FASTA handle even when a lookup fails.
        fasta.close()
Exemplo n.º 2
0
Arquivo: UM.py Projeto: tzeitim/ogtk
def fa_to_tabular(ifn, oufn, start, end):
    """Convert a FASTA file into a tab-separated table.

    Each record name is split on "_" and its first four fields are written
    as the ``mm``, ``hash``, ``umi`` and ``counts`` columns, followed by the
    ``[start:end]`` slice of the record's sequence.

    Args:
        ifn: path of the input FASTA file.
        oufn: path of the table to write (overwritten if it exists).
        start: slice start applied to every sequence.
        end: slice end applied to every sequence.

    Raises:
        IndexError: if a record name has fewer than four "_"-separated fields.
    """
    fa = Fasta(ifn)
    try:
        # ``with`` guarantees the table is flushed and closed on any error.
        with open(oufn, 'w') as fout:
            fout.write("\t".join(["mm", "hash", "umi", "counts", "seq"]) + '\n')
            for record in fa:
                fields = record.name.split("_")
                mm = fields[0]
                hash_key = fields[1]
                umi = fields[2]
                counts = fields[3]
                seq = record[:].seq[start:end]
                fout.write("\t".join([mm, hash_key, umi, counts, seq]) + '\n')
    finally:
        fa.close()
Exemplo n.º 3
0
 def _get_seq(self, chrom, start, stop):
     if self.in_mem:
         seq = self.fasta[chrom][start:stop]
     else:
         if self.thread_safe:
             fasta = Fasta(self.fasta,
                           as_raw=True,
                           sequence_always_upper=True,
                           read_ahead=self.read_ahead)
             seq = np.array(list(fasta[chrom][start:stop]))
             fasta.close()
         else:
             seq = np.array(list(self.fasta[chrom][start:stop]))
     return seq
Exemplo n.º 4
0
    def test_reverse_var(self):
        ''' check that reverse_var works correctly
        '''
        genome = Fasta(self.fa)
        try:
            var = self.Var(chrom='chrN', pos=11, ref='G', alts=['A', 'C'])
            rev = reverse_var(var, genome)
            self.assertEqual(rev.ref, 'C')

            # the position stays the same
            # NOTE(review): this checks the input `var`, not the returned
            # `rev` -- confirm which object was meant to be asserted on.
            self.assertEqual(var.pos, 11)

            # multi-allelic variants with indels return None
            var = self.Var(pos=10, ref='G', alts=['A', 'CC'], info={})
            self.assertIsNone(reverse_var(var, genome))
        finally:
            # Close the genome even when one of the assertions above fails.
            genome.close()
Exemplo n.º 5
0
class FastaStringExtractor(BaseExtractor):
    """Extract sequences from a FASTA file as plain strings.

    NOTE: The extractor is not thread-safe.
    If you wish to use it with multiprocessing,
    create a new extractor object in each process.

    # Arguments
      fasta_file (str): path to the FASTA file
      use_strand (bool): if True, the extracted sequence
        is reverse complemented in case interval.strand == "-"
      force_upper (bool): force upper-case output
    """

    def __init__(self, fasta_file, use_strand=False, force_upper=False):
        # Imported lazily so pyfaidx is only needed once an extractor is built.
        from pyfaidx import Fasta

        self.fasta_file = fasta_file
        # NOTE(review): `extract` reads `self.use_strand`; presumably the base
        # class exposes `_use_strand` under that name -- confirm.
        self._use_strand = use_strand
        self.fasta = Fasta(self.fasta_file)
        self.force_upper = force_upper

    def extract(self, interval: Interval, use_strand=None, **kwargs) -> str:
        """
        Return the FASTA sequence of the given interval as a string.

        Args:
            interval: the interval to query
            use_strand (bool, optional): if True, the extracted sequence
                is reverse complemented in case interval.strand == "-".
                When left as None, `self.use_strand` is used instead.
            **kwargs: accepted but not used by this method.

        Returns:
            sequence of the requested interval

        """
        # Fall back to the instance-level setting when no override is given.
        strand_aware = self.use_strand if use_strand is None else use_strand

        # Only minus-strand intervals are reverse-complemented.
        rc = strand_aware and interval.strand == "-"

        # The start is shifted by one because pyfaidx wants a 1-based interval.
        record = self.fasta.get_seq(
            interval.chrom,
            interval.start + 1,
            interval.stop,
            rc=rc
        )
        seq = str(record.seq)

        # Optionally force upper-case letters.
        return seq.upper() if self.force_upper else seq

    def close(self):
        """Close the underlying FASTA handle."""
        return self.fasta.close()
Exemplo n.º 6
0
class FastaWrapper(GenomeWrapper):
    """Genome wrapper that reads sequences from a FASTA file via ``Fasta``.

    After construction ``self.fasta`` holds one of three things:

    * an open ``Fasta`` handle (default),
    * a dict mapping record names to numpy arrays (``in_mem=True``),
    * the ``fasta_file`` argument itself (``thread_safe=True`` without
      ``in_mem``); a fresh handle is then opened for every query.

    NOTE(review): ``self.in_mem`` and ``self.thread_safe`` are read below but
    are presumably set by ``GenomeWrapper.__init__`` -- confirm.
    """

    def __init__(self,
                 fasta_file,
                 alpha='dna',
                 one_hot=True,
                 channel_last=True,
                 in_mem=False,
                 thread_safe=False,
                 read_ahead=10000):
        super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
        self.fasta = Fasta(fasta_file,
                           as_raw=True,
                           sequence_always_upper=True,
                           read_ahead=read_ahead)
        self._chroms = list(self.fasta.keys())
        seq_lens = [len(self.fasta[chrom]) for chrom in self._chroms]
        self._chroms_size = dict(zip(self._chroms, seq_lens))
        self.read_ahead = read_ahead
        if in_mem:
            try:
                sequences = self._encode_seqs(self.fasta)
            finally:
                # The file is no longer needed once loading finished or failed.
                self.fasta.close()
            self.fasta = sequences
            # Marks the wrapper so that close() does not try to close a dict.
            self.thread_safe = True
        elif thread_safe:
            # Keep only the path; _get_seq opens a handle per call.
            self.fasta.close()
            self.fasta = fasta_file

    def close(self):
        """Close the shared FASTA handle, if this wrapper owns an open one."""
        if not self.thread_safe:
            self.fasta.close()

    @staticmethod
    def _encode_seqs(fasta):
        """Load every record of ``fasta`` into a dict of numpy arrays.

        Each record is converted with ``np.array(list(record[:]))``, i.e. one
        array element per item of the record; no one-hot encoding is applied
        here.
        """
        fasta_dict = {}
        pbar = tqdm(fasta)
        for record in pbar:
            pbar.set_description(desc='Loading sequence: ' + record.name)
            fasta_dict[record.name] = np.array(list(record[:]))
        return fasta_dict

    def _get_seq(self, chrom, start, stop):
        """Return the ``[start:stop]`` slice of ``chrom``.

        In-memory wrappers slice the preloaded array; otherwise the slice is
        read from the FASTA file and converted to a numpy array.
        """
        if self.in_mem:
            return self.fasta[chrom][start:stop]

        if not self.thread_safe:
            return np.array(list(self.fasta[chrom][start:stop]))

        fasta = Fasta(self.fasta,
                      as_raw=True,
                      sequence_always_upper=True,
                      read_ahead=self.read_ahead)
        try:
            return np.array(list(fasta[chrom][start:stop]))
        finally:
            # Close the per-call handle even if the lookup raises.
            fasta.close()
Exemplo n.º 7
0
    def integrate(self, output_table, gtf_file, fasta_file):
        """Annotate the entries, pair related ones and write the result table.

        The method works in three phases:

        1. For every entry in ``self``: when a GTF file is given and both RNA
           strands are known, evaluate frame shifts and exons for the main
           break and for each additional break listed in ``e.structure``;
           when a FASTA file is given, call ``is_on_splice_junction_motif``;
           register both break positions in ``self.idx``.
        2. For every entry that is both 'intronic' and 'linear', look for the
           best-matching other entry within 200000 positions of both break
           points (same strands) and group the two under their summed score.
        3. Write all groups to ``output_table``, highest score first, giving
           the entries of one group the same shared id.

        Args:
            output_table: path of the table to write.
            gtf_file: GTF annotation; frame-shift detection is skipped when
                this is falsy.
            fasta_file: reference FASTA; the splice-motif check is skipped
                when this is falsy.
        """
        log.info("Integrating results")

        def insert_in_index(index, entries, score, i):
            # Bucket `entries` by score; `i` is appended to the key so that
            # groups with identical break points do not overwrite each other.
            if score not in index:
                index[score] = {}

            first = entries[0]
            key = (first.chrA + ':' + str(first.posA) + '(' + first.strandA +
                   ')-' + first.chrB + ':' + str(first.posB) + '(' +
                   first.strandB + ')|' + str(i))
            index[score][key] = entries

        def strip_chr(chrom):
            # self.idx is keyed on chromosome names without a 'chr' prefix.
            if chrom[0:3] == 'chr':
                return chrom[3:]
            return chrom

        def insert(pos, e):
            # Register entry `e` at the single position pos = (chrom, pos, strand).
            position_accession = HTSeq.GenomicInterval(
                strip_chr(pos[0]), pos[1], pos[1] + 1, pos[2])
            position = self.idx[position_accession]
            position += e

        def evaluate_break(dfs, e, posA, posB):
            # The side with the larger donor count is passed first.
            side_a = [e.chrA, posA, e.RNAstrandA]
            side_b = [e.chrB, posB, e.RNAstrandB]
            if e.donorA > e.donorB:
                return dfs.evaluate(side_a, side_b, 2)
            return dfs.evaluate(side_b, side_a, 2)

        def describe_frame_shifts(frame_shifts):
            # Turn the evaluate() result into four lists of display strings:
            # full-gene dysregulation, frameshift 0, +1 and +2.
            def with_offsets(x):
                return (x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] +
                        '(+' + str(x[1][1]) + ')')

            return (
                [x[0] + '->' + x[1] for x in frame_shifts['fgd']],
                [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]],
                [with_offsets(x) for x in frame_shifts[1]],
                [with_offsets(x) for x in frame_shifts[2]],
            )

        with open(output_table, 'w') as fh_out:
            # Insert the seven annotation columns before the last five
            # original columns.
            header = self.header.split("\t")
            header = "\t".join(header[:-5] + [
                'full-gene-dysregulation', 'frameshift=0', 'frameshift=+1',
                'frameshift=+2', 'splice-motif-edit-distance',
                "exons from (5')", "exons to (3')"
            ] + header[-5:])

            fh_out.write("shared-id\tfusion\t" + header)

            # index used to find duplicates
            self.idx = HTSeq.GenomicArrayOfSets("auto", stranded=True)

            # index used to annotate gene names: TMPRSS2->ERG
            gene_annotation = GeneAnnotation(gtf_file)
            dfs = DetectFrameShifts(gtf_file) if gtf_file else None

            ffs = Fasta(fasta_file) if fasta_file else None

            intronic_linear = []
            remainder = []

            # Find 'duplicates' or fusions that belong to each other
            log.info(
                "Searching for intronic and exonic breaks that belong to the same event"
            )
            for e in self:
                # NOTE(review): entries skipped by this condition do not get
                # fgd / frameshift_* / exons_* assigned here, yet the output
                # phase reads them -- presumably they have defaults elsewhere.
                if dfs and e.RNAstrandA != '.' and e.RNAstrandB != '.':
                    done_breaks = set()

                    exons_from, exons_to, frame_shifts = evaluate_break(
                        dfs, e, e.posA, e.posB)

                    done_breaks.add(e.chrA + ':' + str(e.posA) + '/' +
                                    str(e.posA + 1) + '(' + e.strandA + ')->' +
                                    e.chrB + ':' + str(e.posB) + '/' +
                                    str(e.posB + 1) + '(' + e.strandB + ')')

                    (fgd, frameshifts_0, frameshifts_1,
                     frameshifts_2) = describe_frame_shifts(frame_shifts)

                    for additional_breaks in e.structure.split('&'):
                        if additional_breaks != '':
                            params = additional_breaks.split(':(')
                            # Count all supporting reads except discordant mates.
                            n_split_reads = sum(
                                int(x.split(':')[1])
                                for x in params[1].rstrip(')').split(',')
                                if x.split(':')[0] != 'discordant_mates')

                            posAB = params[0].split(':')
                            posA = int(posAB[1].split('/')[0])
                            posB = int(posAB[2].split('/')[0])

                            if (params[0] not in done_breaks
                                    and n_split_reads > 0):
                                exons_from_, exons_to_, frame_shifts = evaluate_break(
                                    dfs, e, posA, posB)

                                exons_from += exons_from_
                                exons_to += exons_to_

                                extra = describe_frame_shifts(frame_shifts)
                                fgd += extra[0]
                                frameshifts_0 += extra[1]
                                frameshifts_1 += extra[2]
                                frameshifts_2 += extra[3]

                            done_breaks.add(params[0])

                    # De-duplicate and sort everything collected for this entry.
                    e.exons_from = sorted(set(exons_from))
                    e.exons_to = sorted(set(exons_to))

                    e.fgd = ','.join(sorted(set(fgd)))
                    e.frameshift_0 = ','.join(sorted(set(frameshifts_0)))
                    e.frameshift_1 = ','.join(sorted(set(frameshifts_1)))
                    e.frameshift_2 = ','.join(sorted(set(frameshifts_2)))

                if ffs:
                    e.is_on_splice_junction_motif(ffs)

                if e.x_onic == 'intronic' and e.circ_lin == 'linear':
                    intronic_linear.append(e)
                else:
                    remainder.append(e)

                insert((e.chrA, e.posA, e.strandA), e)
                insert((e.chrB, e.posB, e.strandB), e)

            if ffs is not None:
                ffs.close()

            # Reorder
            log.info("Re-order and find matching entries")
            idx2 = {}
            q = 0
            for e in intronic_linear:
                results_split = [set(), set()]
                positions = [(e.chrA, e.posA, e.strandA),
                             (e.chrB, e.posB, e.strandB)]

                for pos_i, pos in enumerate(positions):
                    # Search 200000 positions away from the break: backwards
                    # on the minus strand, forwards otherwise.
                    if pos[2] == '-':
                        pos1 = pos[1] - 200000
                        pos2 = pos[1]
                    else:
                        pos1 = pos[1]
                        pos2 = pos[1] + 200000

                    chrom = strip_chr(pos[0])

                    for step in self.idx[HTSeq.GenomicInterval(
                            chrom, max(0, pos1), pos2, pos[2])].steps():
                        for e2 in step[1]:
                            if (e2 != e and e2.strandA == e.strandA
                                    and e2.strandB == e.strandB):
                                results_split[pos_i].add(e2)

                # Candidates must be near both break points.
                results = results_split[0].intersection(results_split[1])
                top_result = (None, 9999999999999)
                for r in results:
                    d1 = (r.posA - e.posA)
                    d2 = (r.posB - e.posB)
                    sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))

                    # Distance divided by the root-mean-square of both scores;
                    # the candidate with the lowest value wins.
                    shared_score = math.sqrt(
                        (pow(e.score, 2) + pow(r.score, 2)) * 0.5)
                    penalty = 1.0 * sq_d / shared_score

                    if penalty < top_result[1]:
                        top_result = (r, penalty)

                if top_result[0]:
                    insert_in_index(idx2, [e, top_result[0]],
                                    e.score + top_result[0].score, q)
                else:
                    insert_in_index(idx2, [e], e.score, q)
                q += 1

            for e in remainder:
                insert_in_index(idx2, [e], e.score, q)
                q += 1

            log.info("Determining fusion gene names and generate output")
            # Generate output
            i = 1
            exported = set()
            for score in sorted(idx2.keys(), reverse=True):
                for key in sorted(idx2[score].keys()):
                    added = 0
                    for entry in idx2[score][key]:
                        # An entry can occur in several groups; write it once.
                        if entry not in exported:
                            acceptors_donors = entry.get_donors_acceptors(
                                gene_annotation)
                            line = entry.line[:-5] + [
                                entry.fgd, entry.frameshift_0,
                                entry.frameshift_1, entry.frameshift_2,
                                entry.edit_dist_to_splice_motif, ",".join(
                                    entry.exons_from), ",".join(entry.exons_to)
                            ] + entry.line[-5:]

                            fh_out.write(
                                str(i) + "\t" + acceptors_donors + "\t" +
                                "\t".join(line) + "\n")
                            exported.add(entry)
                            added += 1

                    # Only consume a shared id when something was written.
                    if added > 0:
                        i += 1
def main():
    """Report which query ids are missing from a FASTA file and fetch them.

    Reads one id per line from ``--query``, compares them with the record
    names already present in ``--fasta`` and prints a summary. With
    ``--download`` the missing ids are fetched from NCBI nucleotide and
    appended to the FASTA file; ids starting with "URS" are skipped. The
    ``.fai`` index next to the FASTA file is removed afterwards.
    """
    parser = argparse.ArgumentParser(description='Download genebank sequence')
    parser.add_argument('--query',
                        '-q',
                        help="Genebank id for download",
                        required=True)
    parser.add_argument('--fasta',
                        '-o',
                        help="Full length sequence in genebank",
                        default="Rfam/seeds/CMfinder-set-full-length.fasta")
    parser.add_argument('--download',
                        '-d',
                        action="store_true",
                        help="whether perform downloading",
                        default=False)
    args = parser.parse_args()

    Entrez.email = "*****@*****.**"
    with open(args.query) as query_file:
        gbIds = np.unique(query_file.read().strip().split("\n"))
    print("Load downloaded sequences ...")
    fastaObj = Fasta(args.fasta)
    try:
        downloaded = list(fastaObj.keys())
    finally:
        fastaObj.close()
    print("Done .")
    # Use the saved key list rather than querying the closed handle.
    print("Total downloaded sequences: {}".format(len(downloaded)))
    print("{} query sequence".format(gbIds.shape[0]))
    gbIds = np.setdiff1d(gbIds, downloaded)
    print("{} query sequence are not downloaded".format(gbIds.shape[0]))
    N = sum(1 for gbId in gbIds if gbId.startswith("URS"))
    print("{} query sequence are in RNAcentral annotation".format(N))

    if not args.download:
        sys.exit(0)

    try:
        with open(args.fasta, "a") as fout:
            for gbId in gbIds:
                if gbId.startswith("URS"):
                    # "URS" ids are not downloaded. The previous RNAcentral
                    # code path sat behind an unconditional `continue` and
                    # never ran, so skipping here keeps the behaviour.
                    continue
                try:
                    print(
                        "Start retriving {} from ncbi nucleotide...".format(
                            gbId),
                        file=sys.stderr)
                    handle = Entrez.efetch(db="nucleotide",
                                           id=gbId,
                                           rettype="fasta",
                                           retmode="text")
                    try:
                        content = handle.read().strip()
                    finally:
                        handle.close()
                    # Join the wrapped sequence lines into a single line.
                    contents = content.split("\n")
                    entry = contents[0] + "\n" + "".join(contents[1:])
                    print(entry, file=fout)
                    print("Done.")
                except Exception:
                    # Best effort: report and go on with the next id.
                    # KeyboardInterrupt / SystemExit are deliberately not
                    # caught so the run can still be aborted.
                    print("Error retriving {}, skip ...".format(gbId),
                          file=sys.stderr)
                time.sleep(0.5)
    finally:
        # The FASTA file may have grown; drop the index file so it is not
        # reused for the changed file.
        index_path = args.fasta + ".fai"
        if os.path.exists(index_path):
            os.remove(index_path)
Exemplo n.º 9
0
def get_reference_sequence(CHROM):
    """Cache the full sequence and length of one reference chromosome.

    Opens the FASTA file named by the ("ref", "genome") entry of ``cfg``,
    stores the sequence of ``str(CHROM)`` in ``sequence_by_chromosome[CHROM]``
    and its length in ``chromosome_lengths[CHROM]``.
    """
    genome = Fasta(cfg.get("ref", "genome"))
    try:
        sequence = genome[str(CHROM)][:].seq
    finally:
        # Close the handle even if the chromosome lookup raises.
        genome.close()
    sequence_by_chromosome[CHROM] = sequence
    chromosome_lengths[CHROM] = len(sequence)
Exemplo n.º 10
0
class Reference(object):
    """Interface to the human genome reference file.

    The reference is read with ``pyfaidx`` from the single plain FASTA file
    named by ``settings.REFERENCE_PATH``.

    If the path is not in the ``~/.gtconfig/gtrc.ini`` file, gepyto will look
    for an environment variable named ``REFERENCE_PATH``.

    When opening the genome file raises ``IOError``, this class falls back to
    the Ensembl remote API to get the sequences. The remote backend can also
    be forced with ``remote=True``.

    """
    def __init__(self, remote=False):
        if remote:
            self.ref = _RemoteReference(settings.BUILD)
        else:
            try:
                self.ref = Fasta(settings.REFERENCE_PATH)
            except IOError:
                self.ref = _RemoteReference(settings.BUILD)

        def get(fasta, chrom):
            # Lookup that is insensitive to the "chr" prefix: try the name as
            # given, then with the prefix removed or added.
            has_prefix = chrom.startswith("chr")
            if has_prefix:
                alternate = chrom[3:]
            else:
                alternate = "chr{}".format(chrom)

            for candidate in (chrom, alternate):
                try:
                    return fasta[candidate]
                except KeyError:
                    continue

            # A true mismatch yields None.
            return None

        self.ref.get = functools.partial(get, self.ref)

    def check_variant_reference(self, variant, flip=False):
        """Check that the 'ref' allele of a variant agrees with the reference
        genome.

        :param variant: The variant to verify.
        :type variant: :py:class:`gepyto.structures.variants.Variant` subclass

        :param flip: If ``True`` incorrect ``(ref, alt)`` pairs will be
                     flipped (Default: False).
        :type flip: bool

        :returns: If flip is True, it returns the correct variant or raises
                  a ``ValueError`` in case it is not salvageable. If flip
                  is False, a bool is simply returned.

        """
        type_message = ("Unsupported argument to check_variant_reference. "
                        "A SNP or Indel object has to be provided.")

        required = ("chrom", "pos", "ref", "alt")
        if not all(hasattr(variant, attribute) for attribute in required):
            raise TypeError(type_message)

        # Single-base alleles without a "-" placeholder are treated as SNPs,
        # everything else as an indel.
        is_snp = (len(variant.ref) == len(variant.alt) == 1
                  and "-" not in (variant.ref + variant.alt))
        if is_snp:
            return check_snp_reference(variant, self, flip)
        return check_indel_reference(variant, self, flip)

    def get_nucleotide(self, chrom, pos):
        """Return the single nucleotide at the given genomic position. """
        return self.get_sequence(str(chrom), pos, length=1)

    def get_sequence(self, chrom, start, end=None, length=None):
        """Return the upper-cased nucleotide sequence of a genomic locus.

        :param chrom: The chromosome.
        :type chrom: str

        :param start: The start position of the locus.
        :type start: int

        :param end: The end position.
        :type end: int

        :param length: The length of the sequence to fetch.
        :type length: int

        Either an ``end`` or a ``length`` parameter has to be provided.

        The ranges are inclusive: both the ``start`` and the ``end``
        positions are part of the returned sequence.

        """
        nothing_given = end is None and length is None
        if nothing_given or (end and length):
            raise TypeError("get_sequence needs either an 'end' OR 'length' "
                            "argument.")

        if length:
            end = start + length - 1

        # Positions are 1-based, the backend slice is 0-based.
        try:
            chunk = self.ref[str(chrom)][start - 1:end]
        except KeyError:
            chunk = None

        if chunk is None:
            raise InvalidMapping(
                "chr{}:{}-{} is an invalid genomic mapping".format(
                    chrom, start, end))

        return str(chunk.seq).upper()

    def close(self):
        """Close the underlying reference backend."""
        self.ref.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
Exemplo n.º 11
0
class Reference(object):
    """Interface to the human genome reference file.

    ``pyfaidx`` is used to parse the genome reference file that
    ``settings.REFERENCE_PATH`` points to. Only a single plain FASTA file is
    supported.

    If the path is not in the ``~/.gtconfig/gtrc.ini`` file, gepyto will look
    for an environment variable named ``REFERENCE_PATH``.

    If opening the genome file raises ``IOError``, the Ensembl remote API is
    used instead. Passing ``remote=True`` forces the remote API.

    """
    def __init__(self, remote=False):
        backend = None
        if not remote:
            try:
                backend = Fasta(settings.REFERENCE_PATH)
            except IOError:
                # Local file unusable: fall through to the remote backend.
                backend = None
        if backend is None:
            backend = _RemoteReference(settings.BUILD)
        self.ref = backend

        def get(fasta, chrom):
            # Prefix-insensitive lookup: first the name as given, then the
            # same name with the "chr" prefix toggled.
            chr_prefix = chrom.startswith("chr")
            try:
                return fasta[chrom]
            except KeyError:
                toggled = chrom[3:] if chr_prefix else "chr{}".format(chrom)
            try:
                return fasta[toggled]
            except KeyError:
                # Neither spelling exists.
                return None

        self.ref.get = functools.partial(get, self.ref)

    def check_variant_reference(self, variant, flip=False):
        """Verify that a variant's 'ref' allele matches the reference genome.

        :param variant: The variant to verify.
        :type variant: :py:class:`gepyto.structures.variants.Variant` subclass

        :param flip: If ``True`` incorrect ``(ref, alt)`` pairs will be
                     flipped (Default: False).
        :type flip: bool

        :returns: If flip is True, it returns the correct variant or raises
                  a ``ValueError`` in case it is not salvageable. If flip
                  is False, a bool is simply returned.

        """
        type_message = ("Unsupported argument to check_variant_reference. "
                        "A SNP or Indel object has to be provided.")

        for attribute in ("chrom", "pos", "ref", "alt"):
            if not hasattr(variant, attribute):
                raise TypeError(type_message)

        # One-base ref and alt without "-" means SNP; otherwise indel.
        single_base = len(variant.ref) == len(variant.alt) == 1
        if single_base and "-" not in (variant.ref + variant.alt):
            checker = check_snp_reference
        else:
            checker = check_indel_reference
        return checker(variant, self, flip)

    def get_nucleotide(self, chrom, pos):
        """Get the nucleotide at the given genomic position. """
        return self.get_sequence(str(chrom), pos, length=1)

    def get_sequence(self, chrom, start, end=None, length=None):
        """Get the nucleotide sequence at the given genomic locus, upper-cased.

        :param chrom: The chromosome.
        :type chrom: str

        :param start: The start position of the locus.
        :type start: int

        :param end: The end position.
        :type end: int

        :param length: The length of the sequence to fetch.
        :type length: int

        Either an ``end`` or a ``length`` parameter has to be provided.

        The ranges are inclusive, which means that the (start, end) positions
        will both be included in the sequence.

        """
        if end is None and length is None:
            missing_or_ambiguous = True
        else:
            missing_or_ambiguous = end and length
        if missing_or_ambiguous:
            raise TypeError("get_sequence needs either an 'end' OR 'length' "
                            "argument.")

        if length:
            end = start + length - 1

        seq = None
        try:
            # 1-based inclusive positions map onto a 0-based slice.
            seq = self.ref[str(chrom)][start - 1: end]
        except KeyError:
            pass

        if seq is None:
            template = "chr{}:{}-{} is an invalid genomic mapping"
            raise InvalidMapping(template.format(chrom, start, end))

        return str(seq.seq).upper()

    def close(self):
        """Close the wrapped reference object."""
        self.ref.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
Exemplo n.º 12
0
cid_path = args.cid
en_path = args.encode
# Drop one trailing slash from each directory path. endswith() is used
# instead of indexing [-1] so an empty path does not raise IndexError.
if en_path.endswith('/'):
    en_path = en_path[:-1]
if gen_path.endswith('/'):
    gen_path = gen_path[:-1]
if out_path.endswith('/'):
    out_path = out_path[:-1]
if not os.path.exists(out_path):
    # makedirs also creates missing parent directories.
    os.makedirs(out_path)

# read fa file of gRNAs.
fa_gRNA = Fasta(gRNA_path, sequence_always_upper=True)
try:
    gid = list(fa_gRNA.keys())
    gRNAs = [fa_gRNA[i][:].seq for i in gid]
finally:
    # Close the handle even if reading a record fails.
    fa_gRNA.close()

## get POT list of gRNA based on Cas-OFFinder.
cas_input(gen_path, gRNAs, mismatch, gpu)
f_gRNA, f_pot = pot(gid, gRNAs)
## encode ots and predict with deepcrispr
f_cid = pd.read_csv(cid_path, sep='\t', names=['cid', 'cell'])
# First cid whose 'cell' column equals the requested cell; raises IndexError
# when the cell is not listed in the table.
cid = f_cid.cid[f_cid.cell == cell].tolist()[0]
encode(f_gRNA, en_path, cid)
f_deep = deepots(f_pot, gpu)
## integrate to igwos
igwos(gRNA_path, f_deep, out_path, mismatch)
if cp == 1:
    print("visualize the genome-wide off-target profile with the circos plot")
    # Pass the arguments as a list (no shell) so that paths containing
    # spaces or shell metacharacters reach the script unchanged.
    import subprocess
    subprocess.call([
        "./circos.sh",
        "{0}/igwos.tab".format(out_path),
        str(genome),
        str(out_path),
    ])