Пример #1
0
def _download_mirbase(args, version="CURRENT"):
    """
    Download files from mirbase
    """
    if not args.hairpin or not args.mirna:
        logger.info("Working with version %s" % version)
        hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz")
        mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz")
        if not file_exists(hairpin_fn):
            cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s &&  gunzip -f !$" % (
                version, hairpin_fn)
            do.run(cmd_h, "download hairpin")
        if not file_exists(mirna_fn):
            cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % (
                version, mirna_fn)
            do.run(cmd_m, "download mirna")
    else:
        return args.hairpin, args.mirna
Пример #2
0
def miraligner(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    For every file in ``args.files`` the reads are loaded (from SAM/BAM, or
    from FASTA/FASTQ through the miraligner tool), written as a
    ``<sample>.mirna`` table in ``args.out`` and, on a best-effort basis,
    as a ``<sample>.vcf`` file.  The per-sample tables are finally merged
    with ``_create_counts``.

    :param args: options object; the attributes read here are ``files``,
        ``out``, ``sps``, ``gtf``, ``miraligner`` plus the ones consumed by
        ``_download_mirbase``
    :raises ValueError: for an unrecognized file extension, or for
        FASTA/FASTQ input when ``args.miraligner`` is not set
    """
    # Use the paths returned by the download step: args.hairpin/args.mirna
    # may be unset when the databases had to be fetched.
    hairpin, mirna = _download_mirbase(args)
    precursors = _read_precursor(hairpin, args.sps)
    matures = _read_mature(mirna, args.sps)
    gtf = _read_gtf(args.gtf)
    out_dts = []
    for bam_fn in args.files:
        sample = op.splitext(op.basename(bam_fn))[0]
        if bam_fn.endswith(("bam", "sam")):
            logger.info("Reading %s" % bam_fn)
            bam_fn = _sam_to_bam(bam_fn)
            bam_sort_by_n = _bam_sort(bam_fn)
            reads = _read_bam(bam_sort_by_n, precursors)
        elif bam_fn.endswith(("fasta", "fa", "fastq")):
            # Fail before doing any work when no aligner can be used.
            if not args.miraligner:
                logger.error("No other aligner supported")
                raise ValueError("Specify --miraligner or give BAM files")
            out_file = op.join(args.out, sample + ".premirna")
            bam_fn = _filter_seqs(bam_fn)
            _cmd_miraligner(bam_fn, out_file, args.sps, hairpin)
            reads = _read_miraligner(out_file)
        else:
            raise ValueError("Format not recognized.")

        if not args.miraligner:
            reads = _annotate(reads, matures, precursors)
        out_file = op.join(args.out, sample + ".mirna")
        out_file, dt, dt_pre = _tab_output(reads, out_file, sample)

        # VCF creation and validation are best effort: any failure is
        # logged and the sample is still counted.
        vcf_file = op.join(args.out, sample + ".vcf")
        try:
            if not file_exists(vcf_file):
                create_vcf(dt_pre, matures, gtf, vcf_file)
            # Imported here so a missing vcf module only produces a warning.
            import vcf
            vcf.Reader(filename=vcf_file)
        except Exception as e:
            logger.warning(e.__doc__)
            # str(e) instead of e.message, which Python 3 exceptions lack.
            logger.warning(str(e))
        if isinstance(dt, pd.DataFrame):
            out_dts.append(dt)

    if out_dts:
        _create_counts(out_dts, args.out)
    else:
        print("No files analyzed!")
Пример #3
0
def _cmd_miraligner(fn, out_file, species, hairpin):
    """
    Annotate the sequences in ``fn`` with the miraligner tool.

    The directory containing ``hairpin`` is passed to the tool as its
    database location.  Nothing is executed when ``out_file`` already
    exists; otherwise the tool is run and ``<out_file>.mirna`` is moved
    to ``out_file``.

    :returns: ``out_file``
    """
    # NOTE(review): no JVM memory options are added to the command here;
    # presumably _get_miraligner supplies them if needed -- confirm.
    tool = _get_miraligner()
    path_db = op.dirname(op.abspath(hairpin))
    if file_exists(out_file):
        return out_file

    template = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3"
    command = template.format(tool=tool, fn=fn, out_file=out_file,
                              species=species, path_db=path_db)
    do.run(command, "miraligner with %s" % fn)
    shutil.move(out_file + ".mirna", out_file)
    return out_file
Пример #4
0
def _filter_seqs(fn):
    """Convert names of sequences to unique ids.

    Reads a FASTA/FASTQ file with one sequence line per record and writes
    ``<fn without extension>_unique.fa``.  A record is kept when its
    sequence is shorter than 26 characters and the value returned by
    ``_get_freq`` for the renamed header is 0 or greater than 1.  Nothing
    is written when the output file already exists.

    :param fn: path of the input FASTA/FASTQ file
    :returns: path of the filtered file
    """
    out_file = op.splitext(fn)[0] + "_unique.fa"
    if file_exists(out_file):
        return out_file

    idx = 0
    # Open the input first so a missing input does not leave an empty
    # output file behind.
    with open(fn) as in_handle, open(out_file, 'w') as out_handle:
        for line in in_handle:
            if not line.startswith(("@", ">")):
                continue
            fixed_name = _make_unique(line.strip(), idx)
            seq_line = next(in_handle, None)
            if seq_line is None:
                # Header on the last line with no sequence: truncated input.
                break
            seq = seq_line.strip()
            counts = _get_freq(fixed_name)
            if len(seq) < 26 and (counts > 1 or counts == 0):
                idx += 1
                out_handle.write("%s\n%s\n" % (fixed_name, seq))
            if line.startswith("@"):
                # FASTQ record: skip the "+" separator and the quality
                # line, so a quality string starting with "@" is never
                # taken for a header.  Missing lines at EOF are tolerated.
                next(in_handle, None)
                next(in_handle, None)

    return out_file
Пример #5
0
def _bam_sort(bam_fn):
    """
    Return the path of the ``samtools sort -n`` output for ``bam_fn``.

    The sorted file is named ``<bam_fn without extension>_sort.bam`` and
    is only generated when it does not exist yet.
    """
    stem, _ext = op.splitext(bam_fn)
    sorted_bam = stem + "_sort.bam"
    if file_exists(sorted_bam):
        return sorted_bam

    template = "samtools sort -n -o {bam_sort_by_n} {bam_fn}"
    do.run(template.format(bam_sort_by_n=sorted_bam, bam_fn=bam_fn))
    return sorted_bam
Пример #6
0
def _bam_sort(bam_fn):
    """
    Sort ``bam_fn`` with ``samtools sort -n`` unless already done.

    :param bam_fn: path of the alignment file to sort
    :returns: ``<bam_fn without extension>_sort.bam``
    """
    target = op.splitext(bam_fn)[0] + "_sort.bam"
    already_sorted = file_exists(target)
    if not already_sorted:
        sort_cmd = "samtools sort -n -o {bam_sort_by_n} {bam_fn}".format(
            bam_sort_by_n=target, bam_fn=bam_fn)
        do.run(sort_cmd)
    return target