def test_conv_mosdepth_gz():
    """Convert a sorted BAM file to a ``.bedgraph.gz`` output with the mosdepth method."""
    bam_path = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bedgraph.gz") as out:
        converter = BAM2BEDGRAPH(bam_path, out.name)
        converter(method="mosdepth")
def test_wget():
    """Fetch the sequana README into a temporary file with ``wget``."""
    from easydev import TempFile

    url = "https://github.com/sequana/sequana/raw/master/README.rst"
    with TempFile() as target:
        wget(url, target.name)
def main(args=None):
    """Command-line entry point that runs the "compressor" snakemake rules.

    The function parses the command-line options, validates them, saves a
    temporary YAML configuration file in the current directory and executes
    ``snakemake`` on the ``compressor.rules`` file of the "compressor" module.

    :param args: command-line arguments, program name first. When None,
        ``sys.argv`` is used.
    :raises ValueError: if more than 20 jobs are requested while
        ``options.bypass`` is False; if ``misc.on_cluster("tars-")`` is true,
        ``options.unlock`` is False and no ``--cluster`` option is given; if
        the (source, target) pair is not a valid combination; or if ``-s`` or
        ``-j`` appears in the extra snakemake options.
    """
    user_options = Options(prog="sequana")

    if args is None:
        args = sys.argv

    # If --help or no options provided, show the help
    # NOTE(review): `options` is only bound in the else branch; the code below
    # relies on parse_args(["prog", "--help"]) not returning (presumably it
    # exits, as argparse does) -- confirm against Options.
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    # --version: print the package version and stop.
    if options.version:
        import sequana
        print(sequana.version)
        sys.exit()

    # Safety limit on the number of jobs unless explicitly bypassed.
    if options.jobs > 20 and options.bypass is False:
        raise ValueError('The number of jobs is limited to 20. You can ' +
            'force this limit by using --bypass-job-limit')

    # On a "tars-" cluster node a scheduler command is mandatory, except when
    # only unlocking.
    if misc.on_cluster("tars-") and options.unlock is False:
        if options.cluster is None:
            raise ValueError("You are on TARS (Institut Pasteur). You " +
                " must use --cluster option to provide the scheduler " +
                " options (typically ' --cluster 'sbatch --qos normal' )")

    # valid codecs: "fastq"/"fq", optionally followed by .bz2, .gz or .dsrc
    # (rstrip removes the trailing dot left by the empty extension).
    valid_extensions = [("fastq." + ext2).rstrip(".")
                        for ext2 in ['', 'bz2', 'gz', 'dsrc']]

    valid_extensions += [("fq." + ext2).rstrip(".")
                         for ext2 in ['', 'bz2', 'gz', 'dsrc']]

    # Every ordered pair of distinct extensions is an accepted conversion.
    valid_combos = [(x, y) for x in valid_extensions
                    for y in valid_extensions if x!=y]

    if (options.source, options.target) not in valid_combos:
        raise ValueError("""--target and --source combo not valid. Must be one of fastq, fastq.gz, fastq.bz2 or fastq.dsrc""")

    # Create the config file locally
    module = Module("compressor")

    with TempFile(suffix=".yaml", dir=".") as temp:
        # Copy the user options into the compressor section of the config.
        cfg = SequanaConfig(module.config)
        cfg.config.compressor.source = options.source
        cfg.config.compressor.target = options.target
        cfg.config.compressor.recursive = options.recursive
        cfg.config.compressor.verbose = options.verbose
        cfg.config.compressor.threads = options.threads
        cfg._update_yaml()
        cfg.save(filename=temp.name)

        # The Snakefile can stay in its original place:
        rule = module.path + os.sep + "compressor.rules"

        # Run the snakemake command itself.
        cmd = 'snakemake -s %s --configfile %s -j %s ' % \
            (rule, temp.name, options.jobs)

        if options.dryrun:
            cmd += " --dryrun "

        if options.verbose is False:
            cmd += " --quiet "
        else:
            cmd += " -p "

        # for slurm only: --cores-per-socket
        if options.cluster:
            cluster = ' --cluster "%s" ' % options.cluster
            cmd += cluster

        # Extra user-provided snakemake options; -s and -j are already set
        # above and must not be given twice.
        if options.snakemake:
            if " -s " in options.snakemake or " -j " in options.snakemake:
                raise ValueError("-s or -j cannot be used in " +
                    " --snakemake-options (already used internally")
            cmd += options.snakemake

        if options.unlock:
            cmd += " --unlock "

        if options.verbose:
            print(cmd)

        # On travis, snakemake.shell command from snakemake fails.
        # Most probably because travis itself uses a subprocess.
        # execute from easydev uses pexpect.spawn, which seems to work well
        from easydev import execute
        execute(cmd, showcmd=False)
def test_genomecov():
    """Exercise ``bedtools.GenomeCov`` in chunked mode on the JB409847 data set.

    The first part checks that invalid constructor arguments and invalid
    attribute assignments are rejected. Each check uses try/except/else so
    that a call which unexpectedly succeeds fails the test (an ``assert
    False`` placed inside the ``try`` would be swallowed by the ``except``).
    The second part calls the analysis and plotting API on valid data.
    """
    filename = sequana_data('JB409847.bed')

    # wrong file
    try:
        bedtools.GenomeCov("dummy.csv")
    except Exception:
        pass
    else:
        raise AssertionError("GenomeCov accepted a non-existing input file")

    # wrong threshold
    try:
        bedtools.GenomeCov(filename, high_threshold=2)
    except Exception:
        pass
    else:
        raise AssertionError("GenomeCov accepted high_threshold=2")

    # wrong threshold
    try:
        bedtools.GenomeCov(filename, low_threshold=-2)
    except Exception:
        pass
    else:
        raise AssertionError("GenomeCov accepted low_threshold=-2")

    # wrong genbank
    try:
        bedtools.GenomeCov(filename, "dummy.gbk")
    except Exception:
        pass
    else:
        raise AssertionError("GenomeCov accepted a non-existing genbank file")

    # now let us read the good data sets by chunks
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'),
                             chunksize=5000)
    for c in bed.chr_list:
        c.run(1001, k=2)

    # setter must be bool
    try:
        bed.circular = 1
    except Exception:
        pass
    else:
        raise AssertionError("circular accepted a non-boolean value")

    # cant use setter
    try:
        bed.feature_dict = {}
    except Exception:
        pass
    else:
        raise AssertionError("feature_dict could be assigned")

    assert len(bed) == 1
    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4000
    bed.window_size = 4001
    bed.hist()

    # This requires to call other method before
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)

        chrom.compute_zscore()
        chrom.get_rois()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)

        # Smoke checks: these only need to run without raising.
        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.DOC
        chrom.CV

    with TempFile(suffix='.csv') as fh:
        bed.gc_window_size = 100
        bed.to_csv(fh.name)

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False, logy=False,
                                           filename=fh.name)
def test_converter():
    """Run the ``bioconvert`` command-line tool on a sorted BAM file.

    The command is started through the shell and waited for inside the
    ``with`` block, so the child process cannot outlive the block that owns
    its temporary output file.
    """
    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bed") as tempfile:
        cmd = "bioconvert %s %s --force" % (infile, tempfile.name)
        process = subprocess.Popen(cmd, shell=True)
        # NOTE(review): the exit status is still not checked, as in the
        # original test -- consider asserting process.returncode == 0.
        process.wait()
def test_md5():
    """Check ``md5`` against the known digest of a file containing "youpi"."""
    with TempFile() as temp:
        # A context manager guarantees the handle is closed (and the content
        # flushed) before the digest is computed, even if the write fails.
        with open(temp.name, "w") as fh:
            fh.write("youpi")
        assert md5(temp.name) == "538e957924f0770b415f473ce900d686"
def test_xmfa2phy(method):
    """Run ``XMFA2PHYLIP`` on the XMFA test file with the given method.

    :param method: name of the conversion method passed to the converter.
    """
    xmfa_path = bioconvert_data("test_phylip2xmfa.xmfa")
    # NOTE(review): the output file uses a ".xmfa" suffix although the
    # converter is XMFA2PHYLIP -- confirm this is intended.
    with TempFile(suffix=".xmfa") as out:
        convert = XMFA2PHYLIP(xmfa_path, out.name)
        convert(method=method)
def test_indirect_conversion():
    """Run ``Bioconvert`` from a FASTQ input to a ``.clustal`` output."""
    fastq_path = bioconvert_data("fastqutils_1.fastq")
    with TempFile(suffix=".clustal") as target:
        run_conversion = Bioconvert(fastq_path, target.name, force=True)
        run_conversion()
def test_conv(method):
    """Run ``BCF2VCF`` on the BCF test file with the given method.

    :param method: name of the conversion method passed to the converter.
    """
    bcf_path = bioconvert_data("test_bcf2vcf_v1.bcf")
    with TempFile(suffix=".vcf") as out:
        converter = BCF2VCF(bcf_path, out.name)
        converter(method=method)
def test_bioconvert():
    """Run ``Bioconvert`` from a sorted BAM input to a ``.fasta`` output."""
    bam_path = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".fasta") as target:
        run_conversion = Bioconvert(bam_path, target.name, force=True)
        run_conversion()
def test_bioconvert_decompression_compression_mode():
    """Instantiate ``Bioconvert`` for a ``.fastq.gz`` to ``.fastq.bz2`` request."""
    gz_path = bioconvert_data("fastqutils_1.fastq.gz")
    with TempFile(suffix=".fastq.bz2") as target:
        # NOTE(review): only the constructor is exercised here; the resulting
        # object is never called, unlike the sibling Bioconvert tests --
        # confirm this is intended.
        converter = Bioconvert(gz_path, target.name, force=True)
def test_pacbio_stride():
    """Call ``PacbioSubreads.stride`` with and without the ``random`` flag."""
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    # First the default behaviour, then the same call with random=True;
    # each run writes into its own temporary file.
    for extra_options in ({}, {"random": True}):
        with TempFile() as fh:
            subreads.stride(fh.name, stride=2, **extra_options)
def install_package(query, dependencies=False, verbose=True,
                    repos="http://cran.univ-paris1.fr/"):
    """Install a R package

    :param str query: It can be a valid URL to a R package (tar ball), a CRAN
        package, a path to a R package (tar ball), or simply the directory
        containing a R package source.
    :param bool dependencies: converted with ``bool2R`` and passed as the
        ``dependencies`` argument of R's ``install.packages``.
    :param bool verbose: passed to the R session; also enables progress
        messages printed by this function.
    :param repos: if provided, install_packages automatically select the
        provided repositories otherwise a popup window will ask you to select
        a repo. Ignored (set to NULL) when *query* is an existing local path.

    The function first tries to download *query* as a URL and install the
    downloaded tar ball. If anything fails in that step (e.g. *query* is not
    a URL), it falls back to ``install.packages(query, repos=...)``.

    ::

        >>> rtools.install_package("path_to_a_valid_Rpackage.tar.gz")
        >>> rtools.install_package("http://URL_to_a_valid_Rpackage.tar.gz")
        >>> rtools.install_package("hash") # a CRAN package
        >>> rtools.install_package("path to a valid R package directory")

    .. seealso:: :class:`biokit.rtools.RPackageManager`
    """
    session = RSession(verbose=verbose)

    # Is it a local file?
    if os.path.exists(query):
        repos = "NULL"
    else:
        # we want the " to be part of the string later on
        repos = '"{0}"'.format(repos)

    try:
        # PART for fetching a file on the web, download and install locally
        if verbose:
            print("Trying from the web ?")
        data = urlopen(query)
        try:
            payload = data.read()
        finally:
            data.close()

        # The payload is binary: the file must be opened in "wb" mode
        # (writing bytes to a text-mode file raises TypeError on Python 3,
        # which made this branch always fall through to the except clause).
        # NOTE(review): the temporary tar ball is now removed when the with
        # block ends; this assumes session.run() returns only once R has
        # processed the command -- confirm against RSession.
        with TempFile(suffix=".tar.gz") as tarball:
            with open(tarball.name, "wb") as fout:
                fout.write(payload)
            code = """install.packages("%s", dependencies=%s """ % (
                tarball.name,
                bool2R(dependencies),
            )
            code += """ , repos=NULL) """
            session.run(code)
    except Exception as err:
        # Best effort: any failure above means the query is handed over to
        # install.packages as a local path or a repository package name.
        if verbose:
            print(err)
            print("trying local or from repos")
            print(
                "RTOOLS warning: URL provided does not seem to exist %s. Trying from CRAN"
                % query)
        code = """install.packages("%s", dependencies=%s """ % (
            query,
            bool2R(dependencies),
        )
        code += """ , repos=%s) """ % repos
        session.run(code)
def test_create_graph():
    """Create the conversion graph as a PNG file without singularity."""
    with TempFile(suffix=".png") as image:
        create_graph(image.name, use_singularity=False)
def test_touch():
    """Call ``tools.touch`` on the path of a temporary file."""
    with TempFile() as fh:
        # The original also evaluated a bare `fh.name` expression here,
        # which had no effect and has been removed.
        tools.touch(fh.name)
def test_conv(method):
    """Instantiate ``FASTQ2QUAL`` on the FASTQ test file.

    :param method: conversion method name (see the review note below).
    """
    fastq_path = bioconvert_data("test_fastq2fasta_v1.fastq")
    with TempFile(suffix=".fasta") as out:
        # NOTE(review): the converter is only constructed, never called, and
        # `method` is unused -- confirm whether a call with method=method
        # was intended.
        FASTQ2QUAL(fastq_path, out.name)
def test_conv():
    """Run ``VCF2BCF`` on the VCF test file with the bcftools method."""
    vcf_path = bioconvert_data("test_vcf2bcf_v1.vcf")
    with TempFile(suffix=".bcf") as out:
        converter = VCF2BCF(vcf_path, out.name)
        converter(method="bcftools")
def test_gfa():
    """Simulate a GFA file into a temporary ``.gfa`` output."""
    with TempFile(suffix=".gfa") as out:
        simulator = gfa.GFASim(out.name)
        simulator.simulate()
def test_convbase():
    """Check how ``ConvBase`` subclasses are validated when they are defined.

    Invalid subclass definitions are expected to raise at class-creation
    time. Each such check uses try/except/else so that a definition which
    unexpectedly succeeds fails the test (an ``assert False`` placed inside
    the ``try`` would be swallowed by the ``except``).
    """
    infile = bioconvert_data("test_measles.fa")

    with TempFile(suffix=".bed") as outfile:
        Bam2Bed(infile, outfile.name)

    # Wrong name
    try:
        class TEST(ConvBase):
            input_ext = ".fa"
            output_ext = ".fq"

            def __call__(self):
                pass
    except Exception:
        pass
    else:
        raise AssertionError("a converter class named TEST was accepted")

    # add dot
    class in2out(ConvBase):
        input_ext = "in"
        output_ext = "out"

        def __call__(self):
            pass

    # wrong input extension (int)
    try:
        class int2out(ConvBase):
            input_ext = [1]
            output_ext = ".out"

            def __call__(self):
                pass
    except Exception:
        pass
    else:
        raise AssertionError("a list of integers was accepted as input_ext")

    # add dot mix case
    class in2out(ConvBase):
        input_ext = ["in", ".in2"]
        output_ext = "out"

        def __call__(self):
            pass

    # integer extensions
    try:
        class in2out(ConvBase):
            input_ext = 1
            output_ext = 2

            def __call__(self):
                pass
    except Exception:
        pass
    else:
        raise AssertionError("integer extensions were accepted")

    # A valid converter: instantiate it, check its name and call it.
    class in2out(ConvBase):
        input_ext = [".fa"]
        output_ext = [".fq"]

        def __call__(self):
            self.execute("ls")

    this = in2out("test.fa", "test.fq")
    assert this.name == "in2out"
    this()
def test_conv(method):
    """Run ``GFF22GFF3`` on the GFF2 example file with the given method.

    :param method: name of the conversion method passed to the converter.
    """
    gff2_path = bioconvert_data("GFF2/gff2_example.gff")
    with TempFile(suffix=".tsv") as out:
        converter = GFF22GFF3(gff2_path, out.name)
        converter(method=method)
def test_savePathwayAs(wikipath):
    """Save pathway WP232 as a PNG file without displaying it.

    :param wikipath: object providing ``savePathwayAs`` (supplied by the
        caller, presumably a test fixture).
    """
    # Note that not all WP have the PDF format available.
    # WP4 has not (march 2018)
    with TempFile(suffix=".png") as image:
        wikipath.savePathwayAs("WP232", image.name, display=False)
def test_vcf_filter_dp4():
    """Check ``is_valid_dp4`` on hand-made DP4 values, then ``filter_vcf``.

    The first variant of the test VCF is reused throughout: its ``DP4`` INFO
    field is overwritten before each ``is_valid_dp4`` call. The checks run
    first while ALT differs from ".", then again after ALT has been set to
    ".". Finally the whole file is filtered with the DP4 filter enabled and
    the returned counts are compared with the expected values.

    NOTE(review): the comments below assume the usual VCF meaning of DP4
    (ref-forward, ref-reverse, alt-forward, alt-reverse) and that the
    arguments 4, 2, 0.75 are a minimum depth, a minimum per-strand depth and
    a minimum ratio -- confirm against ``is_valid_dp4``.
    """
    data = sequana_data("test_vcf_mpileup_4dot1.vcf")
    v = VCF(data)
    variant = next(v.vcf)

    def validate_variant_alternatate(variant):
        # variant.ALT must be different from "." for this test
        assert str(variant.ALT[0]).strip() != "."

        # test minimum depth of alternate must be >= 4
        variant.INFO['DP4'] = [0, 0, 2, 2]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75)

        # here, not enough depth on alternate strand reverse or forward
        variant.INFO['DP4'] = [0, 0, 4, 1]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        variant.INFO['DP4'] = [0, 0, 1, 4]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        # minimum ratio must be > 0.75
        variant.INFO['DP4'] = [25, 0, 75, 75]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is True

        variant.INFO['DP4'] = [25, 25, 75, 74]  # just below 0.75 for the alt reverse
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        variant.INFO['DP4'] = [25, 25, 74, 75]  # just below 0.75 for the alt forward
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

    # at this point variant.ALT is expected to differ from "."
    validate_variant_alternatate(variant)

    def validate_variant_reference(variant):
        # variant.ALT must be equal to "." for this test
        assert str(variant.ALT[0]).strip() == "."

        # same checks as above with the values moved to the first two
        # (presumably reference) DP4 entries: depth of 4 is accepted
        variant.INFO['DP4'] = [2, 2, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75)

        # here, not enough depth on one of the two first entries
        variant.INFO['DP4'] = [4, 1, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        variant.INFO['DP4'] = [1, 4, 0, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        # minimum ratio must be > 0.75
        variant.INFO['DP4'] = [75, 75, 25, 0]
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is True

        variant.INFO['DP4'] = [75, 74, 25, 25]  # just below 0.75 for the second entry
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

        variant.INFO['DP4'] = [74, 75, 25, 25]  # just below 0.75 for the first entry
        assert v.vcf.is_valid_dp4(variant, 4, 2, 0.75) is False

    # force variant.ALT to "." before running the second series of checks
    variant.ALT[0].sequence = "."
    validate_variant_reference(variant)

    # Now, let us do the filtering with the vcf_filter method
    v = VCF(data)
    v.vcf.apply_dp4_filter = True
    with TempFile() as fh:
        res = v.vcf.filter_vcf(fh.name)
    assert res == {'N': 573, 'filtered': 414, 'unfiltered': 159}
def test_converter1():
    """Run ``converter.main`` with a BAM to BED command line set in ``sys.argv``.

    ``sys.argv`` is restored afterwards so that the fake command line does
    not leak into whatever runs later in the same process.
    """
    import sys

    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bed") as tempfile:
        saved_argv = sys.argv
        sys.argv = ["bioconvert", infile, tempfile.name, "--force"]
        try:
            converter.main()
        finally:
            sys.argv = saved_argv
def test_input():
    """Run ``GTFFixer.fix`` on the GTF test file, writing to a temporary file."""
    gtf_path = sequana_data('test_gtf_fixer.gtf')
    with TempFile() as fixed:
        fixer = GTFFixer(gtf_path)
        fixer.fix(fixed.name)
def test_fasta():
    """Simulate a FASTA file with ``nreads`` set to 1000."""
    with TempFile(suffix=".fasta") as out:
        simulator = fasta.FastaSim(out.name)
        simulator.nreads = 1000
        simulator.simulate()
def __init__(self, names, nodes):
    """Load the nodes and names tables into pandas data frames.

    :param names: can be a local file or URL
    :param nodes: can be a local file or URL

    Each input is read with ``pandas.read_csv`` using ``|`` as separator and
    no header. When the given path does not exist locally it is treated as
    a URL and downloaded with ``wget`` into a temporary file first. The
    results are stored in ``self.df_nodes`` and ``self.df_names``, both
    indexed by ``taxid``.
    """
    # Path to existing files
    logger.info("Reading input files")
    self.names = names
    self.nodes = nodes

    # First, the nodes
    if os.path.exists(nodes):
        self.df_nodes = pd.read_csv(nodes, sep="|", header=None)
    else:
        with TempFile() as fout_nodes:
            logger.info("Loading nodes.dmp from an URL {}".format(nodes))
            wget(nodes, fout_nodes.name)
            self.df_nodes = pd.read_csv(fout_nodes.name, sep="|", header=None)

    # Object (string) columns carry tab characters around the values.
    for i, _type in enumerate(self.df_nodes.dtypes):
        if _type == "O":
            self.df_nodes[i] = self.df_nodes[i].str.strip('\t')

    # Fields of the nodes file, as listed in the original notes:
    #   tax_id -- node id in GenBank taxonomy database
    #   parent tax_id -- parent node id in GenBank taxonomy database
    #   rank -- rank of this node (superkingdom, kingdom, ...)
    #   embl code -- locus-name prefix; not unique
    #   division id -- see division.dmp file
    #   inherited div flag (1 or 0) -- 1 if node inherits division from parent
    #   genetic code id -- see gencode.dmp file
    #   inherited GC flag (1 or 0) -- 1 if node inherits genetic code from
    #       parent
    #   mitochondrial genetic code id -- see gencode.dmp file
    #   inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial
    #       gencode from parent
    #   GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank
    #       entry lineage
    #   hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence
    #       data yet
    #   comments -- free-text comments and citations
    try:
        # Full layout: 14 columns, the last one is dropped.
        self.df_nodes.columns = [
            "taxid", "parent", "rank", 4, 5, "gc_id", "mt_id",
            7, 8, 9, 10, 11, 12, 13
        ]
        del self.df_nodes[13]
    except ValueError:
        # pandas raises ValueError when the number of labels does not match
        # the number of columns: fall back to the 5-column layout. Any other
        # error is no longer silently swallowed.
        self.df_nodes.columns = ["taxid", "parent", "rank", 4, 5]
        del self.df_nodes[5]

    # make sure they are ordered by taxon ID
    self.df_nodes.sort_values("taxid", inplace=True)
    self.df_nodes.set_index("taxid", inplace=True)

    # now we read the names
    if os.path.exists(names):
        self.df_names = pd.read_csv(names, sep="|", header=None)
    else:
        with TempFile() as fout_names:
            logger.info("Loading names.dmp from an URL {}".format(names))
            wget(names, fout_names.name)
            self.df_names = pd.read_csv(fout_names.name, sep="|", header=None)

    for i, _type in enumerate(self.df_names.dtypes):
        if _type == "O":
            self.df_names[i] = self.df_names[i].str.strip('\t')

    # The fifth column is dropped before naming the remaining four.
    del self.df_names[4]
    self.df_names.columns = ['taxid', 'name', 'unique_name', 'key']
    self.df_names.set_index("taxid", inplace=True)
def test_genomecov():
    """Exercise ``bedtools.GenomeCov`` on the JB409847 data set.

    The first part checks that invalid constructor arguments and invalid
    attribute assignments are rejected. Each check uses try/except/else so
    that a call which unexpectedly succeeds fails the test (an ``assert
    False`` placed inside the ``try`` would be swallowed by the ``except``).
    The rest covers equality between instances and the analysis, export and
    plotting API on valid data.
    """
    filename = sequana_data('JB409847.bed')

    # wrong file
    try:
        bedtools.GenomeCov("dummy.csv")
    except Exception:
        pass
    else:
        raise AssertionError("GenomeCov accepted a non-existing input file")

    # wrong genbank
    try:
        bedtools.GenomeCov(filename, "dummy.gbk")
    except Exception:
        pass
    else:
        raise AssertionError("GenomeCov accepted a non-existing genbank file")

    # now let us read the good data sets
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    bed.compute_coverage(4001)

    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    bed2 = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    # NOTE(review): this compares `bed` with itself; `bed == bed2` may have
    # been intended -- confirm before tightening.
    assert bed == bed

    # test equality for same chromosome but different data
    bed2.chr_list[0].df["cov"] += 100
    assert bed != bed2

    # test equality for same data but a different list of chromosomes
    bed2.chr_list[0].df["cov"] -= 100
    bed2.chr_list.append("dummy")
    assert bed != bed2

    # setter must be bool
    try:
        bed.circular = 1
    except Exception:
        pass
    else:
        raise AssertionError("circular accepted a non-boolean value")

    # cant use setter
    try:
        bed.feature_dict = {}
    except Exception:
        pass
    else:
        raise AssertionError("feature_dict could be assigned")

    assert len(bed) == 1
    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4001
    bed.hist()

    # This requires to call other method before
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)

        chrom.compute_zscore()
        chrom.get_roi()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)

        # Smoke checks: these only need to run without raising.
        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.get_mean_cov()
        chrom.get_var_coef()

    # Export to CSV and read the exported file back.
    with TempFile(suffix='.csv') as fh:
        bed.to_csv(fh.name)
        bed2 = bedtools.GenomeCov(fh.name, sequana_data('JB409847.gbk'))

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False, logy=False,
                                           filename=fh.name)
def test_nx2phy_biopython(method):
    """Run ``NEXUS2PHYLIP`` on ``<method>.nexus`` with the given method.

    :param method: conversion method name; also used as the base name of
        the input data file.
    """
    nexus_path = bioconvert_data(method + ".nexus")
    # The expected output file is looked up, but the result is not compared
    # against it.
    outfile = bioconvert_data(method + ".phylip")
    with TempFile(suffix=".phylip") as out:
        convert = NEXUS2PHYLIP(nexus_path, out.name)
        convert(method=method)
def create_graph(filename, layout="dot", use_singularity=False):
    """Draw the graph of available conversions into *filename*.

    :param filename: should end in .png or .svg or .dot

    If extension is .dot, only the dot file is created.
    This is useful if you have issues installing graphviz.
    If so, under Linux you could use our singularity container
    see github.com/cokelaer/graphviz4all

    :param layout: layout passed to pygraphviz when it is used.
    :param use_singularity: if True, pygraphviz is skipped and the ``dot``
        command is run through a downloaded singularity image.

    pygraphviz is tried first, unless a ``.dot`` output or singularity is
    requested. If it is skipped or fails for any reason, a dot description
    is written to a temporary file and rendered with the ``dot`` executable.

    NOTE(review): even for a ``.dot`` output the ``dot`` executable is still
    invoked (``dot -Tdot``), which seems at odds with the remark about
    graphviz installation issues above -- confirm the intended behaviour.
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    # Explicit condition instead of a bare `raise` used as a jump into the
    # fallback branch.
    if not filename.endswith(".dot") and use_singularity is not True:
        try:
            from pygraphviz import AGraph
            dg = AGraph(directed=True)
            for a, b in rr.get_conversions():
                dg.add_edge(a, b)
            dg.layout(layout)
            dg.draw(filename)
            return
        except Exception:
            # Best effort: pygraphviz missing or failing, use the dot
            # executable below.
            pass

    # Build the dot description: one line per node, one line per edge.
    dot = """ strict digraph{ node [label="\\N"]; """
    nodes = {item for items in rr.get_conversions() for item in items}
    for node in nodes:
        dot += "\"{}\";\n".format(node)
    for a, b in rr.get_conversions():
        dot += "\"{}\" -> \"{}\";\n".format(a, b)
    dot += "}\n"

    from easydev import TempFile
    from bioconvert import shell

    dotpath = ""
    if use_singularity:
        from bioconvert.core.downloader import download_singularity_image
        singfile = download_singularity_image(
            "graphviz.simg",
            "shub://cokelaer/graphviz4all:v1",
            "4288088d91c848e5e3a327282a1ab3d1")
        dotpath = "singularity run {} ".format(singfile)

    # On ReadTheDocs the plain dot executable is always used.
    on_rtd = environ.get('READTHEDOCS', None) == 'True'
    if on_rtd:
        dotpath = ""

    ext = filename.rsplit(".", 1)[1]

    # The context manager removes the temporary dot file once rendering is
    # done (it was previously left behind).
    with TempFile(suffix=".dot") as dotfile:
        with open(dotfile.name, "w") as fout:
            fout.write(dot)
        cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name,
                                           filename)
        try:
            shell(cmd)
        except Exception:
            # Fall back to a plain system call if the shell helper fails.
            import os
            os.system(cmd)
def test_pacbio_random():
    """Call ``BAMPacbio.random_selection`` for 10 reads into a temporary file."""
    subreads = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
    with TempFile() as fh:
        subreads.random_selection(fh.name, nreads=10)