Пример #1
0
def test_conv_mosdepth_gz():
    """Run BAM2BEDGRAPH with the ``mosdepth`` method into a ``.bedgraph.gz`` file."""
    bam_path = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bedgraph.gz") as out:
        converter = BAM2BEDGRAPH(bam_path, out.name)
        converter(method="mosdepth")
Пример #2
0
def test_wget():
    """Fetch the sequana README into a temporary file with ``wget``."""
    from easydev import TempFile

    url = "https://github.com/sequana/sequana/raw/master/README.rst"
    with TempFile() as target:
        wget(url, target.name)
Пример #3
0
def main(args=None):
    """Build and execute the snakemake command line of the ``compressor`` module.

    The user options are parsed, validated, written into a temporary YAML
    configuration file created in the current directory, and a ``snakemake``
    command is then assembled and executed.

    :param list args: command line arguments, program name first. Defaults
        to ``sys.argv``.
    :raises ValueError: if more than 20 jobs are requested without the bypass
        option, if ``--cluster`` is missing on the "tars-" cluster, if the
        source/target combination is not valid, or if ``-s``/``-j`` appear
        in the extra snakemake options.
    """
    user_options = Options(prog="sequana")

    if args is None:
        args = sys.argv

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
        # parse_args is expected to exit once the help is printed (TODO
        # confirm against Options). Return explicitly so that ``options`` is
        # never read while unbound.
        return
    options = user_options.parse_args(args[1:])

    if options.version:
        import sequana
        print(sequana.version)
        sys.exit()

    if options.jobs > 20 and options.bypass is False:
        raise ValueError('The number of jobs is limited to 20. You can ' +
            'force this limit by using --bypass-job-limit')

    if misc.on_cluster("tars-") and options.unlock is False:
        if options.cluster is None:
            raise ValueError("You are on TARS (Institut Pasteur). You " +
                " must use --cluster option to provide the scheduler " +
                " options (typically ' --cluster 'sbatch --qos normal' )")

    # valid codecs: fastq/fq, optionally followed by a compression extension
    compressions = ['', 'bz2', 'gz', 'dsrc']
    valid_extensions = [("fastq." + ext2).rstrip(".") for ext2 in compressions]
    valid_extensions += [("fq." + ext2).rstrip(".") for ext2 in compressions]

    # Source and target must differ, hence the x != y condition
    valid_combos = [(x, y) for x in valid_extensions
                           for y in valid_extensions
                           if x != y]

    if (options.source, options.target) not in valid_combos:
        raise ValueError("""--target and --source combo not valid.
Must be one of fastq, fastq.gz, fastq.bz2 or fastq.dsrc""")

    # Create the config file locally
    module = Module("compressor")

    with TempFile(suffix=".yaml", dir=".") as temp:
        cfg = SequanaConfig(module.config)
        cfg.config.compressor.source = options.source
        cfg.config.compressor.target = options.target
        cfg.config.compressor.recursive = options.recursive
        cfg.config.compressor.verbose = options.verbose
        cfg.config.compressor.threads = options.threads
        cfg._update_yaml()
        cfg.save(filename=temp.name)

        # The Snakefile can stay in its original place:
        rule = module.path + os.sep + "compressor.rules"

        # Run the snakemake command itself.
        cmd = 'snakemake -s %s  --configfile %s -j %s ' % \
                (rule, temp.name, options.jobs)

        if options.dryrun:
            cmd += " --dryrun "

        if options.verbose is False:
            cmd += " --quiet "
        else:
            cmd += " -p "

        # for slurm only: --cores-per-socket
        if options.cluster:
            cluster = ' --cluster "%s" ' % options.cluster
            cmd += cluster

        if options.snakemake:
            # Pad with blanks so that a flag located at the very beginning or
            # at the very end of the user string is detected as well.
            padded = " %s " % options.snakemake
            if " -s " in padded or " -j " in padded:
                raise ValueError("-s or -j cannot be used in " +
                    " --snakemake-options    (already used internally")
            cmd += options.snakemake

        if options.unlock:
            cmd += " --unlock "

        if options.verbose:
            print(cmd)

        # On travis, snakemake.shell command from snakemake fails.
        # Most probably because travis itself uses a subprocess.
        # execute from easydev uses pexpect.spawn, which seems to work well
        from easydev import execute
        execute(cmd, showcmd=False)
Пример #4
0
def test_genomecov():
    """Exercise ``bedtools.GenomeCov`` on the JB409847 data set read by chunks.

    Invalid inputs and forbidden attribute assignments must raise. The
    remaining calls only check that the API runs without error.
    """
    def expect_failure(func, *args, **kwargs):
        """Fail the test unless ``func(*args, **kwargs)`` raises an exception.

        An ``assert False`` placed inside a ``try`` guarded by a bare
        ``except`` is swallowed by that ``except``, so such a check can never
        fail. The failure is therefore raised outside of the ``try`` block.
        """
        try:
            func(*args, **kwargs)
        except Exception:
            return
        raise AssertionError(
            "%s%r %r did not raise" % (getattr(func, "__name__", func),
                                       args, kwargs))

    filename = sequana_data('JB409847.bed')

    # wrong file
    expect_failure(bedtools.GenomeCov, "dummy.csv")

    # wrong threshold
    expect_failure(bedtools.GenomeCov, filename, high_threshold=2)

    # wrong threshold
    expect_failure(bedtools.GenomeCov, filename, low_threshold=-2)

    # wrong genbank
    expect_failure(bedtools.GenomeCov, filename, "dummy.gbk")

    # now let us read the good data sets by chunks
    bed = bedtools.GenomeCov(filename,
                             sequana_data('JB409847.gbk'),
                             chunksize=5000)
    for c in bed.chr_list:
        c.run(1001, k=2)

    # setter must be bool
    expect_failure(setattr, bed, "circular", 1)

    # cannot use the setter
    expect_failure(setattr, bed, "feature_dict", {})

    assert len(bed) == 1
    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4000
    bed.window_size = 4001
    bed.hist()

    # This requires to call other method before
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)

        chrom.compute_zscore()
        chrom.get_rois()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)

        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.DOC
        chrom.CV
    with TempFile(suffix='.csv') as fh:
        bed.gc_window_size = 100
        bed.to_csv(fh.name)

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False,
                                           logy=False,
                                           filename=fh.name)
Пример #5
0
def test_converter():
    """Run the ``bioconvert`` executable on a BAM file with a ``.bed`` output."""
    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bed") as fout:
        # An argument list (no shell) keeps paths containing blanks or shell
        # metacharacters intact.
        cmd = ["bioconvert", infile, fout.name, "--force"]
        # Wait for the child process: otherwise the temporary file is removed
        # on leaving the ``with`` block while the conversion may still be
        # running.
        subprocess.Popen(cmd).wait()
Пример #6
0
def test_md5():
    """Check ``md5`` of a temporary file containing the text ``youpi``."""
    with TempFile() as temp:
        # The context manager closes (and flushes) the handle even if the
        # write fails, before the checksum is computed.
        with open(temp.name, "w") as fh:
            fh.write("youpi")
        assert md5(temp.name) == "538e957924f0770b415f473ce900d686"
Пример #7
0
def test_xmfa2phy(method):
    """Run XMFA2PHYLIP on the reference XMFA file with the given ``method``.

    NOTE(review): the output file uses a ``.xmfa`` suffix although the
    converter name suggests a PHYLIP output -- confirm this is intended.
    """
    xmfa_path = bioconvert_data("test_phylip2xmfa.xmfa")
    with TempFile(suffix=".xmfa") as out:
        run = XMFA2PHYLIP(xmfa_path, out.name)
        run(method=method)
Пример #8
0
def test_indirect_conversion():
    """Run Bioconvert from a FastQ input to a ``.clustal`` output file."""
    fastq_path = bioconvert_data("fastqutils_1.fastq")
    with TempFile(suffix=".clustal") as target:
        conversion = Bioconvert(fastq_path, target.name, force=True)
        conversion()
Пример #9
0
def test_conv(method):
    """Run BCF2VCF on the reference BCF file with the given ``method``."""
    bcf_path = bioconvert_data("test_bcf2vcf_v1.bcf")
    with TempFile(suffix=".vcf") as out:
        converter = BCF2VCF(bcf_path, out.name)
        converter(method=method)
Пример #10
0
def test_bioconvert():
    """Run Bioconvert from the sorted measles BAM to a ``.fasta`` output file."""
    bam_path = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".fasta") as target:
        conversion = Bioconvert(bam_path, target.name, force=True)
        conversion()
Пример #11
0
def test_bioconvert_decompression_compression_mode():
    """Instantiate Bioconvert from a ``.fastq.gz`` input to a ``.fastq.bz2`` output.

    NOTE(review): the converter is only built, never called -- confirm
    whether the conversion itself was meant to be executed.
    """
    gz_path = bioconvert_data("fastqutils_1.fastq.gz")
    with TempFile(suffix=".fastq.bz2") as target:
        Bioconvert(gz_path, target.name, force=True)
Пример #12
0
def test_pacbio_stride():
    """Call ``PacbioSubreads.stride`` with a stride of 2, without then with ``random``."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    for extra in ({}, {"random": True}):
        with TempFile() as out:
            subreads.stride(out.name, stride=2, **extra)
Пример #13
0
def install_package(query,
                    dependencies=False,
                    verbose=True,
                    repos="http://cran.univ-paris1.fr/"):
    """Install a R package

    The query is first treated as a URL: its content is downloaded into a
    temporary tar ball that is installed with ``repos=NULL``. If anything
    fails in that step, the query is handed over to ``install.packages``
    as is, using the repository (or ``NULL`` for an existing local path).

    :param str query: It can be a valid URL to a R package (tar ball), a CRAN
        package, a path to a R package (tar ball), or simply the directory
        containing a R package source.
    :param bool dependencies: forwarded to the ``dependencies`` argument of
        the R function ``install.packages``.
    :param bool verbose: print progress messages and create a verbose R
        session.
    :param repos: if provided, install_packages automatically select the
        provided repositories otherwise a popup window will ask you to select a repo

    ::

        >>> rtools.install_package("path_to_a_valid_Rpackage.tar.gz")
        >>> rtools.install_package("http://URL_to_a_valid_Rpackage.tar.gz")
        >>> rtools.install_package("hash") # a CRAN package
        >>> rtools.install_package("path to a valid R package directory")

    .. seealso:: :class:`biokit.rtools.RPackageManager`
    """
    import shutil
    from contextlib import closing

    session = RSession(verbose=verbose)

    # Is it a local file?
    if os.path.exists(query):
        repos = "NULL"
    else:
        # we want the " to be part of the string later on
        repos = '"{0}"'.format(repos)

    try:
        # PART for fetching a file on the web, download and install locally
        if verbose:
            print("Trying from the web ?")
        # The temporary tar ball is removed once the installation is over.
        with TempFile(suffix=".tar.gz") as tarball:
            # The payload is binary: it must be copied in "wb" mode, text
            # mode raises a TypeError under Python 3. The HTTP response is
            # closed as soon as the copy is done.
            with closing(urlopen(query)) as data, \
                    open(tarball.name, "wb") as fout:
                shutil.copyfileobj(data, fout)
            code = """install.packages("%s", dependencies=%s """ % (
                tarball.name,
                bool2R(dependencies),
            )
            code += """ , repos=NULL) """
            session.run(code)

    except Exception as err:
        # Deliberate best effort: any failure above (not a URL, network
        # error, failed installation) falls back on a local/CRAN install.
        if verbose:
            print(err)
            print("trying local or from repos")
            print(
                "RTOOLS warning: URL provided does not seem to exist %s. Trying from CRAN"
                % query)
        code = """install.packages("%s", dependencies=%s """ % (
            query,
            bool2R(dependencies),
        )

        code += """ , repos=%s) """ % repos
        session.run(code)
        return
Пример #14
0
def test_create_graph():
    """Call ``create_graph`` on a temporary PNG file, without singularity."""
    with TempFile(suffix=".png") as image:
        png_path = image.name
        create_graph(png_path, use_singularity=False)
Пример #15
0
def test_touch():
    """Call ``tools.touch`` on the path of a temporary file."""
    with TempFile() as tmp:
        path = tmp.name
        tools.touch(path)
Пример #16
0
def test_conv(method):
    """Instantiate FASTQ2QUAL on the reference FastQ file.

    NOTE(review): the converter is built but never called, ``method`` is
    unused and the output suffix is ``.fasta`` -- confirm what this test
    was meant to cover.
    """
    fastq_path = bioconvert_data("test_fastq2fasta_v1.fastq")
    with TempFile(suffix=".fasta") as target:
        FASTQ2QUAL(fastq_path, target.name)
Пример #17
0
def test_conv():
    """Run VCF2BCF on the reference VCF file with the ``bcftools`` method."""
    vcf_path = bioconvert_data("test_vcf2bcf_v1.vcf")
    with TempFile(suffix=".bcf") as out:
        converter = VCF2BCF(vcf_path, out.name)
        converter(method="bcftools")
Пример #18
0
def test_gfa():
    """Call ``GFASim.simulate`` on a temporary ``.gfa`` file."""
    with TempFile(suffix=".gfa") as target:
        simulator = gfa.GFASim(target.name)
        simulator.simulate()
Пример #19
0
def test_convbase():
    """Check which ``ConvBase`` subclass definitions are accepted or rejected.

    Each invalid definition must raise when the class is created. The failure
    is raised from an ``else`` clause because an ``assert False`` placed
    inside a ``try`` guarded by a bare ``except`` is swallowed and can never
    fail the test.
    """
    infile = bioconvert_data("test_measles.fa")

    with TempFile(suffix=".bed") as outfile:
        Bam2Bed(infile, outfile.name)

    # Wrong name
    try:

        class TEST(ConvBase):
            input_ext = ".fa"
            output_ext = ".fq"

            def __call__(self):
                pass

    except Exception:
        pass
    else:
        raise AssertionError("a converter class named TEST must be rejected")

    # add dot
    class in2out(ConvBase):
        input_ext = "in"
        output_ext = "out"

        def __call__(self):
            pass

    # wrong input extension (int)
    try:

        class int2out(ConvBase):
            input_ext = [1]
            output_ext = ".out"

            def __call__(self):
                pass

    except Exception:
        pass
    else:
        raise AssertionError("input_ext = [1] must be rejected")

    # add dot  mix case
    class in2out(ConvBase):
        input_ext = ["in", ".in2"]
        output_ext = "out"

        def __call__(self):
            pass

    # wrong input and output extensions (int)
    try:

        class in2out(ConvBase):
            input_ext = 1
            output_ext = 2

            def __call__(self):
                pass

    except Exception:
        pass
    else:
        raise AssertionError("integer extensions must be rejected")

    class in2out(ConvBase):
        input_ext = [".fa"]
        output_ext = [".fq"]

        def __call__(self):
            self.execute("ls")

    this = in2out("test.fa", "test.fq")
    assert this.name == "in2out"
    this()
Пример #20
0
def test_conv(method):
    """Run GFF22GFF3 on the GFF2 example file with the given ``method``."""
    gff2_path = bioconvert_data("GFF2/gff2_example.gff")
    with TempFile(suffix=".tsv") as out:
        converter = GFF22GFF3(gff2_path, out.name)
        converter(method=method)
Пример #21
0
def test_savePathwayAs(wikipath):
    """Save pathway ``WP232`` into a temporary PNG file without displaying it.

    Not all pathways have the PDF format available (WP4 had not in
    March 2018).
    """
    with TempFile(suffix=".png") as image:
        png_path = image.name
        wikipath.savePathwayAs("WP232", png_path, display=False)
Пример #22
0
def test_vcf_filter_dp4():
    """Check ``is_valid_dp4`` on hand-made DP4 values, then ``filter_vcf``.

    The first variant of the file is checked with its own alternate allele,
    then again after its alternate sequence is replaced by ".". Each time
    ``is_valid_dp4`` is called with the arguments ``4, 2, 0.75``.
    """
    data = sequana_data("test_vcf_mpileup_4dot1.vcf")
    v = VCF(data)
    variant = next(v.vcf)

    def dp4_outcome(record, dp4):
        # Overwrite the DP4 field of the record and return the verdict
        record.INFO['DP4'] = dp4
        return v.vcf.is_valid_dp4(record, 4, 2, 0.75)

    def validate_variant_alternatate(variant):
        # the alternate allele must be different from "." for this test
        assert str(variant.ALT[0]).strip() != "."

        # test minimum depth of alternate must be >= 4
        assert dp4_outcome(variant, [0, 0, 2, 2])

        # here, not enough depth on alternate strand reverse or forward
        assert dp4_outcome(variant, [0, 0, 4, 1]) is False
        assert dp4_outcome(variant, [0, 0, 1, 4]) is False

        # minimum ratio must be > 0.75
        assert dp4_outcome(variant, [25, 0, 75, 75]) is True

        # just below 0.75 for the alt reverse
        assert dp4_outcome(variant, [25, 25, 75, 74]) is False
        # just below 0.75 for the alt forward
        assert dp4_outcome(variant, [25, 25, 74, 75]) is False

    # here the alternate allele is the one read from the file
    validate_variant_alternatate(variant)

    def validate_variant_reference(variant):
        # the alternate allele must be equal to "." for this test
        assert str(variant.ALT[0]).strip() == "."

        # same checks as above, with the first two DP4 values carrying the depth
        assert dp4_outcome(variant, [2, 2, 0, 0])

        # here, not enough depth on one of the first two DP4 values
        assert dp4_outcome(variant, [4, 1, 0, 0]) is False
        assert dp4_outcome(variant, [1, 4, 0, 0]) is False

        # minimum ratio must be > 0.75
        assert dp4_outcome(variant, [75, 75, 25, 0]) is True

        # just below 0.75 for the second DP4 value
        assert dp4_outcome(variant, [75, 74, 25, 25]) is False
        # just below 0.75 for the first DP4 value
        assert dp4_outcome(variant, [74, 75, 25, 25]) is False

    # force the alternate allele to "." before the second series of checks
    variant.ALT[0].sequence = "."
    validate_variant_reference(variant)

    # Now, let us do the filtering with the filter_vcf method
    v = VCF(data)
    v.vcf.apply_dp4_filter = True
    with TempFile() as out:
        counts = v.vcf.filter_vcf(out.name)
    expected = {'N': 573, 'filtered': 414, 'unfiltered': 159}
    assert counts == expected
Пример #23
0
def test_converter1():
    """Call ``converter.main`` with a command line set through ``sys.argv``."""
    import sys

    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bed") as fout:
        saved_argv = sys.argv
        sys.argv = ["bioconvert", infile, fout.name, "--force"]
        try:
            converter.main()
        finally:
            # sys.argv is process-wide: restore it so that the fake command
            # line does not leak into the tests executed afterwards.
            sys.argv = saved_argv
Пример #24
0
def test_input():
    """Call ``GTFFixer.fix`` on the reference GTF file, writing to a temporary file."""
    gtf_path = sequana_data('test_gtf_fixer.gtf')

    with TempFile() as target:
        fixer = GTFFixer(gtf_path)
        fixer.fix(target.name)
Пример #25
0
def test_fasta():
    """Call ``FastaSim.simulate`` with ``nreads`` set to 1000 on a temporary file."""
    with TempFile(suffix=".fasta") as target:
        simulator = fasta.FastaSim(target.name)
        simulator.nreads = 1000
        simulator.simulate()
Пример #26
0
    def __init__(self, names, nodes):
        """Load the ``names`` and ``nodes`` tables into pandas data frames.

        The result is stored in :attr:`df_nodes` and :attr:`df_names`, both
        indexed by ``taxid``.

        :param names: can be a local file or URL
        :param nodes: can be a local file or URL

        """
        def _read_dmp(source, label):
            """Read a '|'-separated table from a local path, else download it."""
            if os.path.exists(source):
                df = pd.read_csv(source, sep="|", header=None)
            else:
                with TempFile() as fout:
                    logger.info(
                        "Loading {}.dmp from an URL {}".format(label, source))
                    wget(source, fout.name)
                    df = pd.read_csv(fout.name, sep="|", header=None)
            # Remove the tab characters surrounding the values of the string
            # (object dtype) columns.
            for i, _type in enumerate(df.dtypes):
                if _type == "O":
                    df[i] = df[i].str.strip('\t')
            return df

        # Path to existing files
        logger.info("Reading input files")
        self.names = names
        self.nodes = nodes

        # First, the nodes
        self.df_nodes = _read_dmp(nodes, "nodes")

        # Fields of the nodes file, in order:
        #   tax_id                         -- node id in GenBank taxonomy database
        #   parent tax_id                  -- parent node id in GenBank taxonomy database
        #   rank                           -- rank of this node (superkingdom, kingdom, ...)
        #   embl code                      -- locus-name prefix; not unique
        #   division id                    -- see division.dmp file
        #   inherited div flag (1 or 0)    -- 1 if node inherits division from parent
        #   genetic code id                -- see gencode.dmp file
        #   inherited GC flag (1 or 0)     -- 1 if node inherits genetic code from parent
        #   mitochondrial genetic code id  -- see gencode.dmp file
        #   inherited MGC flag (1 or 0)    -- 1 if node inherits mitochondrial gencode from parent
        #   GenBank hidden flag (1 or 0)   -- 1 if name is suppressed in GenBank entry lineage
        #   hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet
        #   comments                       -- free-text comments and citations
        try:
            # Layout with 14 columns; the last one is dropped (presumably
            # the empty field following the final separator -- TODO confirm).
            self.df_nodes.columns = [
                "taxid", "parent", "rank", 4, 5, "gc_id", "mt_id", 7, 8, 9, 10,
                11, 12, 13
            ]
            del self.df_nodes[13]
        except ValueError:
            # pandas raises ValueError when the number of names does not match
            # the number of columns: fall back on the layout with 5 columns.
            self.df_nodes.columns = ["taxid", "parent", "rank", 4, 5]
            del self.df_nodes[5]

        # make sure they are ordered by taxon ID
        self.df_nodes.sort_values("taxid", inplace=True)
        self.df_nodes.set_index("taxid", inplace=True)

        # now we read the names; the fifth column is dropped
        self.df_names = _read_dmp(names, "names")
        del self.df_names[4]
        self.df_names.columns = ['taxid', 'name', 'unique_name', 'key']
        self.df_names.set_index("taxid", inplace=True)
Пример #27
0
def test_genomecov():
    """Exercise ``bedtools.GenomeCov`` on the JB409847 data set.

    Invalid inputs and forbidden attribute assignments must raise. Equality
    between instances is checked, and the remaining calls only check that
    the API runs without error.
    """
    def expect_failure(func, *args, **kwargs):
        """Fail the test unless ``func(*args, **kwargs)`` raises an exception.

        An ``assert False`` placed inside a ``try`` guarded by a bare
        ``except`` is swallowed by that ``except``, so such a check can never
        fail. The failure is therefore raised outside of the ``try`` block.
        """
        try:
            func(*args, **kwargs)
        except Exception:
            return
        raise AssertionError(
            "%s%r %r did not raise" % (getattr(func, "__name__", func),
                                       args, kwargs))

    filename = sequana_data('JB409847.bed')

    # wrong file
    expect_failure(bedtools.GenomeCov, "dummy.csv")

    # wrong genbank
    expect_failure(bedtools.GenomeCov, filename, "dummy.gbk")

    # now let us read the good data sets
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    bed.compute_coverage(4001)

    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    bed2 = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    # NOTE(review): this compares bed with itself; ``bed == bed2`` may have
    # been intended -- confirm.
    assert bed == bed

    # test equality for same chromosome but different data
    bed2.chr_list[0].df["cov"] += 100
    assert bed != bed2
    # test equality for same data but a different number of chromosomes
    bed2.chr_list[0].df["cov"] -= 100
    bed2.chr_list.append("dummy")
    assert bed != bed2

    # setter must be bool
    expect_failure(setattr, bed, "circular", 1)

    # cannot use the setter
    expect_failure(setattr, bed, "feature_dict", {})

    assert len(bed) == 1
    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4001
    bed.hist()

    # This requires to call other method before
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)

        chrom.compute_zscore()
        chrom.get_roi()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)

        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.get_mean_cov()
        chrom.get_var_coef()
    with TempFile(suffix='.csv') as fh:
        bed.to_csv(fh.name)
        bed2 = bedtools.GenomeCov(fh.name, sequana_data('JB409847.gbk'))

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False, logy=False,
                                           filename=fh.name)
Пример #28
0
def test_nx2phy_biopython(method):
    """Run NEXUS2PHYLIP on ``<method>.nexus`` with the given ``method``.

    The ``<method>.phylip`` data file is looked up as well, but it is not
    compared with the produced output.
    """
    nexus_path = bioconvert_data(method + ".nexus")
    bioconvert_data(method + ".phylip")
    with TempFile(suffix=".phylip") as out:
        run = NEXUS2PHYLIP(nexus_path, out.name)
        run(method=method)
Пример #29
0
def create_graph(filename, layout="dot", use_singularity=False):
    """Draw the graph of the conversions known to the registry.

    pygraphviz is tried first. If it cannot be used, a dot description is
    written into a temporary file and rendered with the ``dot`` executable,
    optionally run through a singularity image.

    :param filename: should end in .png or .svg or .dot
    :param layout: layout passed to pygraphviz (unused by the ``dot``
        executable fallback).
    :param use_singularity: if True, skip pygraphviz and run ``dot`` from
        the downloaded singularity image.

    If extension is .dot, only the dot file is created.
    This is useful if you have issues installing graphviz.
    If so, under Linux you could use our singularity container
    see github.com/cokelaer/graphviz4all

    NOTE(review): with a .dot extension the file is still produced by the
    ``dot`` executable (``-Tdot``) -- confirm this matches the statement
    above.
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    # pygraphviz is skipped when a dot file is requested or when the
    # singularity image must be used.
    if not filename.endswith(".dot") and use_singularity is not True:
        try:
            from pygraphviz import AGraph
            dg = AGraph(directed=True)

            for a, b in rr.get_conversions():
                dg.add_edge(a, b)

            dg.layout(layout)
            dg.draw(filename)
            return
        except Exception:
            # Deliberate best effort: pygraphviz is missing or failed, use
            # the dot executable instead.
            pass

    dot = """
strict digraph{
    node [label="\\N"];

    """
    nodes = {item for items in rr.get_conversions() for item in items}

    for node in nodes:
        dot += "\"{}\";\n".format(node)
    for a, b in rr.get_conversions():
        dot += "\"{}\" -> \"{}\";\n".format(a, b)
    dot += "}\n"

    from easydev import TempFile
    from bioconvert import shell

    # The temporary dot file is removed once the rendering is done.
    with TempFile(suffix=".dot") as dotfile:
        with open(dotfile.name, "w") as fout:
            fout.write(dot)

        dotpath = ""
        if use_singularity:
            from bioconvert.core.downloader import download_singularity_image
            singfile = download_singularity_image(
                "graphviz.simg", "shub://cokelaer/graphviz4all:v1",
                "4288088d91c848e5e3a327282a1ab3d1")

            dotpath = "singularity run {} ".format(singfile)
            # The singularity prefix is dropped when READTHEDOCS is 'True'
            on_rtd = environ.get('READTHEDOCS', None) == 'True'
            if on_rtd:
                dotpath = ""

        ext = filename.rsplit(".", 1)[1]
        cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name,
                                           filename)
        try:
            shell(cmd)
        except Exception:
            # Fall back on a plain system call if the bioconvert shell fails
            import os
            os.system(cmd)
Пример #30
0
def test_pacbio_random():
    """Call ``BAMPacbio.random_selection`` for 10 reads into a temporary file."""
    subreads = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
    with TempFile() as out:
        subreads.random_selection(out.name, nreads=10)