def test_snpeff():
    # a custom reference
    fh_log = TempFile()
    mydata = snpeff.SnpEff(reference=sequana_data("JB409847.gbk"),
                           log=fh_log.name)
    with TempFile() as fh:
        mydata.launch_snpeff(sequana_data("JB409847.vcf"), fh.name)
    fh_log.delete()

    # cleanup
    try:
        os.remove("snpEff.config")
    except:
        pass
    try:
        os.remove("snpEff_genes.txt")
    except:
        pass
    try:
        os.remove("snpEff_summary.html")
    except:
        pass

    try:
        snpeff.SnpEff(reference="dummy")
        assert False
    except SystemExit:
        assert True
    except:
        assert False
def test_attrdict():
    a = tools.AttrDict(value=1)
    assert a.value == 1
    assert 'value' in list(a.keys())
    assert 1 in a.values()

    a.description = 'test'
    assert a['description'] == 'test'

    a['output'] = 'txt'
    assert a.output == 'txt'

    d = {'a': {'b': 1}, 'aa': 2}
    ad = tools.AttrDict(**d)
    assert ad.a.b == 1
    ad.a.b = 2
    assert ad.a.b == 2

    ad['d'] = 4
    assert ad.d == 4

    try:
        ad.update(1)
        assert False
    except:
        assert True

    # check json capabilities
    fh = TempFile()
    js = ad.to_json()
    ad.to_json(filename=fh.name)
    ad.from_json(fh.name)
    fh.delete()
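# A minimal sketch of the behaviour exercised by test_attrdict above -- NOT
# the actual easydev/tools implementation (which also provides the
# to_json/from_json methods used in the test). The idea: a dict whose keys
# are also reachable as attributes, applied recursively to nested dicts.
# The class name _AttrDictSketch is hypothetical.
class _AttrDictSketch(dict):
    def __init__(self, **kwargs):
        super(_AttrDictSketch, self).__init__()
        for key, value in kwargs.items():
            self[key] = value

    def __setitem__(self, key, value):
        # recurse so that nested dicts also get attribute access
        if isinstance(value, dict):
            value = _AttrDictSketch(**value)
        super(_AttrDictSketch, self).__setitem__(key, value)

    # a.value reads a['value']; a.value = 1 writes a['value']
    __getattr__ = dict.__getitem__

    def __setattr__(self, key, value):
        self[key] = value

# Usage, mirroring the test above:
#     ad = _AttrDictSketch(**{'a': {'b': 1}, 'aa': 2})
#     assert ad.a.b == 1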
def test_add_locus_with_modification():
    # Alter the original GBK to alter the locus name
    data = open(sequana_data("JB409847.gbk"), "r").read()
    newdata = data.replace("JB409847", "DUMMY_JB409847")

    fh = TempFile(suffix="gbk")
    with open(fh.name, 'w') as fout:
        fout.write(newdata)

    # Now we read this new GBK file that has a different locus name as
    # compared to the fasta
    mydata = snpeff.SnpEff(reference=fh.name)

    # Here is the corresponding FASTA
    fasta = sequana_data("JB409847.fasta")

    with TempFile(suffix="fasta") as fh2:
        mydata.add_locus_in_fasta(fasta, fh2.name)

        # In theory, in the newly created fasta file, we should find back the
        # DUMMY tag

        # cleanup
        try:
            os.remove("snpEff.config")
        except:
            pass

        data = open(fh2.name, "r").read()
        assert "DUMMY" in data
    fh.delete()
def test_yeast_annotations():
    from easydev import gsf
    filename = gsf('msdas', "data", "YEAST_raw_sample.csv")
    r = MassSpecReader(filename, verbose=verbose)
    a = AnnotationsYeast(r, verbose=verbose)
    # 200 is enough to get gene name cases and ambiguous gene name cases
    # e.g., ALD3_YEAST ['P54114', 'P40047']
    a.df = a.df.ix[0:200]
    a.get_uniprot_entries()
    a.update_mapping()
    a.set_annotations()
    a.annotations.Sequence

    t = TempFile()
    a.to_csv(t.name)
    t.delete()

    t = TempFile()
    a.to_pickle("test", overwrite=True)
    try:
        a.to_pickle("test", overwrite=False)
        assert False
    except IOError:
        assert True
    a.read_pickle("YEAST_annotations_test.pkl")

    # create the constructor given the annotations
    a = AnnotationsYeast(r, verbose=verbose,
                         annotations="YEAST_annotations_test.pkl")
    a.get_uniprot_entries()  # populate entry and entry_names in the df
    a.plot_goid_histogram()

    # cleanup
    os.remove("YEAST_annotations_test.pkl")
def score_sc2(self, prediction_file):
    fh = TempFile()
    _, gs2 = self.download_gs()
    script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q2.pl"
    cmd = "perl %s %s %s %s"
    cmd = cmd % (script, prediction_file, fh.name, gs2)
    shellcmd(cmd)
    df = pd.read_csv(fh.name, sep='\t', index_col=None).ix[0]
    fh.delete()
    return df
def test_sequana_config():
    s = snaketools.Module("quality_control")
    config = snaketools.SequanaConfig(s.config)

    assert config.config.get("kraken:dummy", "test") == "test"
    assert config.config.get("kraken:dummy") is None

    # --------------------------------- test the different constructors
    config = snaketools.SequanaConfig()
    config = snaketools.SequanaConfig({"test": 1})
    assert config.config.test == 1
    # with a dictionary
    config = snaketools.SequanaConfig(config.config)
    # with a SequanaConfig instance
    config = snaketools.SequanaConfig(config)
    # with a non-yaml file
    try:
        json = sequana_data('test_summary_fastq_stats.json')
        config = snaketools.SequanaConfig(json)
        assert False
    except:
        assert True
    try:
        config = snaketools.SequanaConfig("dummy_dummy")
        assert False
    except:
        assert True

    # Test an exception
    s = snaketools.Module("quality_control")
    config = snaketools.SequanaConfig(s.config)
    config._recursive_update(config._yaml_code,
                             {"input_directory_dummy": "test"})

    # Loop over all pipelines, read the config, save it and check that the
    # content is identical. This requires the templates to be removed. We
    # want to make sure that empty strings are kept and that "no value"
    # entries are kept as well, i.e.
    #
    #   field1: ""
    #   field2:
    #
    # must be unchanged
    from easydev import TempFile
    output = TempFile(suffix=".yaml")
    for pipeline in snaketools.pipeline_names:
        config_filename = Module(pipeline)._get_config()
        cfg1 = SequanaConfig(config_filename)
        cfg1.cleanup()  # remove templates and strip strings

        cfg1.save(output.name)
        cfg2 = SequanaConfig(output.name)
        assert cfg2._yaml_code == cfg1._yaml_code
        cfg2._update_config()
        assert cfg1.config == cfg2.config
    output.delete()
def test_models():
    data = np.array([
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]])
    columns = [u'EGF=PI3K', u'TNFa=PI3K', u'Jnk=cJun', u'PI3K=Akt',
               u'Raf=Mek', u'!Akt=Mek', u'Mek=p90RSK', u'Mek=Erk',
               u'Erk=Hsp27', u'TNFa=Jnk', u'TNFa=NFkB', u'TNFa=Hsp27',
               u'EGF=Raf', u'EGF^TNFa=PI3K', u'Raf^!Akt=Mek',
               u'Erk^TNFa=Hsp27']
    df = pd.DataFrame(data, columns=columns)
    fh = TempFile()
    df.to_csv(fh.name)

    m1 = Models(df)
    m2 = Models(m1)
    m3 = Models(fh.name, index_col=0)  # there is an index column with no name
    fh.delete()

    # trying a stupid constructor
    try:
        Models(1)
        assert False
    except:
        assert True

    assert m1 == m2
    assert m1 == m3

    # plots
    m1.plot()
    m1.plot(1)
    m1.plot('cv')
    m1.errorbar()
    m1.heatmap()

    # exporters
    fh = TempFile()
    m1.to_csv(fh.name)
    fh.delete()

    fh = TempFile()
    m1.to_sif(fh.name)
    fh.delete()

    # m1 and m2 are identical. Adding them gets rid of duplicates, so the sum
    # should be equal to m1 itself.
    m1 == m1 + m2

    return m1, m2
def install_package(query, dependencies=False, verbose=True,
                    repos="http://cran.univ-paris1.fr/"):
    """Install a R package

    :param str query: It can be a valid URL to a R package (tar ball), a
        CRAN package, a path to a R package (tar ball), or simply the
        directory containing a R package source.
    :param bool dependencies:
    :param repos: if provided, install_packages automatically selects the
        provided repositories, otherwise a popup window will ask you to
        select a repo

    ::

        >>> rtools.install_package("path_to_a_valid_Rpackage.tar.gz")
        >>> rtools.install_package("http://URL_to_a_valid_Rpackage.tar.gz")
        >>> rtools.install_package("hash") # a CRAN package
        >>> rtools.install_package("path to a valid R package directory")

    .. seealso:: :class:`biokit.rtools.RPackageManager`
    """
    session = RSession(verbose=verbose)

    # Is it a local file?
    if os.path.exists(query):
        repos = 'NULL'
    else:
        # we want the " to be part of the string later on
        repos = '"{0}"'.format(repos)

    try:
        # Fetch the file from the web, download it and install locally
        if verbose:
            print("Trying from the web ?")
        data = urlopen(query)
        fh = TempFile(suffix=".tar.gz")
        # use a distinct variable so we do not shadow the TempFile handle
        with open(fh.name, 'wb') as fout:
            for x in data.readlines():
                fout.write(x)

        code = """install.packages("%s", dependencies=%s """ % \
            (fh.name, bool2R(dependencies))
        code += """ , repos=NULL) """
        session.run(code)

    except Exception as err:
        if verbose:
            print(err)
            print("RTOOLS warning: the URL provided does not seem to exist:"
                  " %s. Trying a local file or the CRAN repository" % query)
        code = """install.packages("%s", dependencies=%s """ % \
            (query, bool2R(dependencies))
        code += """ , repos=%s) """ % repos
        session.run(code)
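# For illustration only: with the CRAN package "hash" and dependencies=True,
# and assuming bool2R renders True as the R literal TRUE, the R code sent to
# the session by the fallback branch above would look like:
#
#     install.packages("hash", dependencies=TRUE , repos="http://cran.univ-paris1.fr/")
#
# whereas the download branch passes the temporary tar ball path with
# repos=NULL so that R installs from the local file.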
def sbmlqual_from_datasets(identifier):
    # a simple model
    s1 = SIF()
    s2 = SIF(cnodata("PKN-" + identifier + ".sif"))

    fh = TempFile()
    s2.to_sbmlqual(fh.name)
    s1.read_sbmlqual(fh.name)
    fh.delete()
    assert s1 == s2

    s3 = SIF(cnodata("PKN-" + identifier + ".xml"))
    assert s1 == s3 and s2 == s3
def test_phosphogrid():
    m = MassSpecReader(get_yeast_small_data(), verbose=False)
    gene_names = set(list(m.df.Protein))
    p = phosphogrid.PhosphoGRID(directory="../share/data")
    p.run(gene_names=gene_names)
    fh = TempFile(suffix='.sif')
    p.export2sif(filename=fh.name)
    p.plot()
    #p.run()
    fh.delete()
def test_read_ic50():
    # -------------------------------- functionalities
    r = IC50(ic50_test)
    # we can also instantiate from a valid dataframe
    r = IC50(r)

    # test repr
    r
    # and print statement
    print(r)

    # the copy method
    assert r == r.copy()

    r.hist()
    r.plot_ic50_count()
    r.cosmicIds

    f = TempFile()
    r.to_csv(f.name)
    f.delete()

    # columns may be duplicated
    r = IC50(ic50_test)
    df = pd.concat([r.df, r.df[999]], axis=1)
    # creating a new instance from duplicated columns should raise an error
    try:
        IC50(df)
        assert False
    except:
        assert True

    # ---------------------------------------- different IC50 formats
    # test all files available
    for key in testing.keys():
        filename = testing[key].location
        if filename.startswith('ic50_test'):
            ic = IC50(filename)

    # some specific checks:
    #ic = IC50(testing['ic50_test_header_drug_prefix_only'].location)
    #assert ic.df.shape == (2,2)
    #assert all(ic.df.columns == ['1','2'])

    ic = IC50(testing['ic50_test_header_no_drug_prefix'].location)
    assert ic.drugIds == [1, 2]

    ic = IC50(testing['ic50_test_header_drug_prefix_only'].location)
    assert ic.drugIds == [1, 2]

    ic = IC50(testing['ic50_test_header_mixed_drug_prefix'].location)
    assert ic.drugIds == [1, 2]
def score_sc1(self, prediction_file):
    """Compute all results and compare the user prediction with all
    official participants

    This scoring function can take a long time (about 5-10 minutes).
    """
    fh = TempFile()
    gs1, _ = self.download_gs()
    script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q1.pl"
    cmd = "perl %s %s %s %s"
    cmd = cmd % (script, prediction_file, fh.name, gs1)
    shellcmd(cmd)
    df = pd.read_csv(fh.name, sep='\t', index_col=None).ix[0]
    fh.delete()
    return df
def test_toypb_bioservices():
    from bioservices import biomodels
    b = biomodels.BioModels()
    sbml = b.getModelSBMLById('MODEL1305240000')
    fh = TempFile(suffix='.xml')
    # use a distinct variable so we do not shadow the TempFile handle, and
    # open in binary mode since we write encoded bytes
    with open(fh.name, 'wb') as fout:
        fout.write(sbml.encode('utf-8'))
    c = CNOGraph(fh.name)

    c2 = CNOGraph(cnodata("PKN-ToyPB.sif"))
    assert c == c2
def test_d2c1():
    s = D2C1()
    s.test()

    filename = s.download_template()
    d = s.score(filename)
    assert_almost_equal(d['AUPR'], 0.2563463, 7)

    from easydev import TempFile
    fh = TempFile()
    s._create_templates(filename=fh.name)
    fh.delete()

    s.score_and_compare_with_lb(s.download_template())
def test_simple_sbmlqual():
    # a simple example with a simple OR, a simple link, a mix of OR and AND,
    # and a single AND
    c = CNOGraph()
    c.add_reaction("!A=C")
    c.add_reaction("C=D")
    c.add_reaction("B=C")
    c.expand_and_gates()
    c.add_reaction("a1=b")
    c.add_reaction("a2=b")
    c.add_reaction("D^b=E")

    fh = TempFile(suffix='.xml')
    c.to_sbmlqual(fh.name)
    c2 = CNOGraph(fh.name)
    fh.delete()
    assert c == c2
def test_MSReader():
    # we can just create an instance
    r = MassSpecReader(verbose=verbose)

    # fails if wrong file
    try:
        r = MassSpecReader("dummy.csv", verbose=verbose)
        assert False
    except:
        assert True

    filename = yeast.get_yeast_filenames()[0]
    r = MassSpecReader(filename, verbose=verbose)
    print(r)
    r.mode
    r.N
    r.df
    r.measurements
    r.metadata

    try:
        r.mode = None
        assert False
    except:
        assert True

    r.sort_psites_ors_only()

    r['DIG1']
    r['DIG1', "S142"]
    r['DIG1_S142']
    try:
        r['DIG1', 'S142', 'dummy']
        assert False
    except:
        assert True

    r.sequences
    r.psites

    from easydev import TempFile
    f = TempFile()
    r.to_csv(f.name)
    f.delete()
def test_yeast_june():
    #y = yeast.YEAST2MIDAS(get_yeast_small_data(), get_yeast_raw_data(),
    #                      verbose=False)
    #y.cleanup_june()
    #y.cleanup_june()
    #len(y.df) < 100
    filename = gsf("msdas", "data", "PKN-yeastScaffold.sif")
    data.cleanup_june()
    c, m, e = data.export_pkn_and_midas_june(filename)

    from easydev import TempFile
    f = TempFile()
    data.to_midas(f.name)
    f.delete()

    cv = data.get_cv()
    m = data.get_midas()
    data.pcolor_na()
    data.plot_timeseries("DIG1_S126+S127")
def score_A(self, filename):
    from easydev import TempFile
    fh = TempFile()
    script = self._pj([self.classpath,
                       'weighted_average_concordance_index.pl'])
    datadir = self._pj([self.classpath, 'data'])
    cmd = "perl %s %s %s %s"
    cmd = cmd % (script, filename, datadir, fh.name)

    shellcmd(cmd, verbose=True, ignore_errors=True)
    try:
        df = pd.read_csv(fh.name, sep='\t', header=None)
    except:
        print("Something went wrong in the scoring while executing:\n %s" % cmd)
        print("\nThe D7C4 challenge requires a Perl package to be installed.")
        print("See the D7C4 documentation, e.g., on dreamtools.readthedocs.org")
        import sys
        sys.exit(1)
    df.columns = ['DrugID', 'probabilistic c-index',
                  'weighted probabilistic c-index', 'zscores']
    df = df.set_index('DrugID')
    fh.delete()

    ws = (df.sum() / df.sum().ix['zscores'])
    ws = ws.ix['weighted probabilistic c-index']

    results = df.mean()
    results['weighted average probabilistic c-index'] = ws
    del results['zscores']

    # Finally, compute p-values based on precomputed scores
    precomp = pd.read_csv(self._pj([self.classpath, 'data',
                                    'DREAM7_DrugSensitivity1_drug_zscores.txt']),
                          sep='\t', skiprows=6, header=None)

    overall_mean = precomp.ix[31][1]
    overall_var = precomp.ix[31][2]

    pval = 1 - (.5 * (math.erf((ws - overall_mean) /
                               (math.sqrt(2 * overall_var))) + 1))
    results['weighted average probabilistic c-index p-value'] = pval

    return {'Results': results}
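# Note on the p-value computed in score_A above: it is the right tail of a
# normal distribution with the precomputed overall mean and variance, i.e.
# p = 1 - Phi((ws - mu) / sigma), where Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
# is the standard normal CDF and sigma = sqrt(overall_var); this matches the
# erf expression in the code since erf(x / sqrt(2)) with x = (ws - mu) / sigma
# equals erf((ws - mu) / sqrt(2 * overall_var)).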
def test_config_parser():
    s1 = ParamsGA()
    s2 = ParamsGeneral()
    c1 = CNOConfigParser()
    c1.add_section(s2)
    c1.add_section(s1)

    s1 = ParamsGA()
    s2 = ParamsGeneral()
    c2 = CNOConfigParser()
    c2.add_section(s2)
    c2.add_section(s1)
    assert c1 == c2

    from easydev import TempFile
    fh = TempFile()
    c1.save(fh.name)
    c2 = CNOConfigParser(fh.name)
    fh.delete()
    assert c1 == c2
def test_fastq_unzipped():
    for thisdata in [data, datagz]:
        # instantiation
        f = fastq.FastQ(thisdata)
        assert f.data_format == "Illumina_1.8+"

        # count lines
        # run it twice because we want to make sure that re-running
        # count_lines (decompression with zlib) works when run again.
        assert f.count_lines() == 1000
        assert f.count_lines() == 1000
        assert f.count_reads() == 250
        assert f.count_reads() == 250

        # extract the head of the file into an unzipped file
        ft = TempFile()
        f.extract_head(100, ft.name)
        fcheck = fastq.FastQ(ft.name)
        assert fcheck.count_lines() == 100
        ft.delete()

        # extract the head of the file and zip the output
        ft = TempFile(suffix=".gz")
        f.extract_head(100, ft.name)
        fcheck = fastq.FastQ(ft.name)
        assert fcheck.count_lines() == 100
        ft.delete()

        with FastQ(thisdata) as ff:
            assert len(ff) == 250

        with TempFile() as fh:
            selection = f.select_random_reads(10, fh.name)
            f.select_random_reads(selection, fh.name)
def test_pacbio_input_bam(tmpdir):
    # we need a summary and a bunch of images
    filename = sequana_data("summary_pacbio_qc1.json")

    # mock the PNG files found in the summary
    import json
    summary = json.load(open(filename))
    pngname = sequana_data("no_data.jpg")
    summary["images"]["gc_vs_length"] = pngname
    summary["images"]["hist_gc_content"] = pngname
    summary["images"]["hist_read_length"] = pngname
    summary["images"]["hist_snr"] = pngname
    summary["images"]["hist_zmw"] = pngname

    summary_file = TempFile()
    with open(summary_file.name, "w") as ff:
        json.dump(summary, ff)

    # Now that we have this new summary file, let us use it.
    # We also need an output handler
    ff = TempFile()

    from sequana.utils import config
    config.output_dir = "/tmp"
    # here, ff.name is of the form /tmp/djhfjh4dz so we need to remove the /tmp
    pacbio_input_bam.PacbioInputBAMModule(summary_file.name,
                                          ff.name.split("/")[1])

    # cleanup
    summary_file.delete()
    ff.delete()
def score_A(self, filename):
    from easydev import TempFile
    fh = TempFile()
    script = self._pj([self._path2data,
                       'weighted_average_concordance_index.pl'])
    datadir = self._pj([self._path2data, 'data'])
    cmd = "perl %s %s %s %s"
    cmd = cmd % (script, filename, datadir, fh.name)

    shellcmd(cmd, verbose=True, ignore_errors=True)
    df = pd.read_csv(fh.name, sep='\t', header=None)
    df.columns = ['DrugID', 'probabilistic c-index',
                  'weighted probabilistic c-index', 'zscores']
    df = df.set_index('DrugID')
    fh.delete()

    ws = (df.sum() / df.sum().ix['zscores'])
    ws = ws.ix['weighted probabilistic c-index']

    results = df.mean()
    results['weighted average probabilistic c-index'] = ws
    del results['zscores']

    # Finally, compute p-values based on precomputed scores
    precomp = pd.read_csv(self._pj([self._path2data, 'data',
                                    'DREAM7_DrugSensitivity1_drug_zscores.txt']),
                          sep='\t', skiprows=6, header=None)

    overall_mean = precomp.ix[31][1]
    overall_var = precomp.ix[31][2]

    pval = 1 - (.5 * (math.erf((ws - overall_mean) /
                               (math.sqrt(2 * overall_var))) + 1))
    results['weighted average probabilistic c-index p-value'] = pval

    return {'Results': results}
def test_conv():
    # SCF v2 file
    infile_v2 = bioconvert_data("sample_v2.scf")
    expected_outfile_v2 = bioconvert_data("sample_v2.fasta")
    # SCF v3 file
    infile_v3 = bioconvert_data("sample_v3.scf")
    expected_outfile_v3 = bioconvert_data("sample_v3.fasta")
    with TempFile(suffix=".fasta") as tempfile:
        convert = SCF2FASTA(infile_v2, tempfile.name)
        convert()
        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(expected_outfile_v2)
        convert = SCF2FASTA(infile_v3, tempfile.name)
        convert()
        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(expected_outfile_v3)
def test_conv(method):
    infile = bioconvert_data("JB409847.embl")
    with TempFile(suffix=".gbk") as tempfile:
        converter = EMBL2GENBANK(infile, tempfile.name)
        converter(method=method)
        # Check that the output is correct with a checksum
        if method == "biopython":
            assert md5(tempfile.name) == "cdd34902975a68e58ad5f105b44ff495" or \
                md5(tempfile.name) == "63002093c1aaef8c3a6fd693c2bbd9f4"
        elif method == "squizz":
            pass
            # TODO
            # embl input is not understood by squizz if generated by biopython
            # assert md5(tempfile.name) == "????"
        else:
            raise NotImplementedError
def wrapped(inst, *args, **kwargs):
    infile_name = inst.infile

    output_compressed = None
    if inst.outfile.endswith(".gz"):
        (inst.outfile, output_compressed) = splitext(inst.outfile)
    elif inst.outfile.endswith(".bz2"):
        (inst.outfile, output_compressed) = splitext(inst.outfile)
    elif inst.outfile.endswith(".dsrc"):  # !!! only for fastq files
        (inst.outfile, output_compressed) = splitext(inst.outfile)
    # Now inst has the uncompressed output file name

    if infile_name.endswith(".gz"):
        # decompress input
        # TODO: https://stackoverflow.com/a/29371584/1878788
        _log.info("Generating uncompressed version of {} ".format(infile_name))
        (ungz_name, _) = splitext(infile_name)
        (_, base_suffix) = splitext(ungz_name)
        with TempFile(suffix=base_suffix) as ungz_infile:
            inst.infile = ungz_infile.name
            inst.shell("unpigz -c -p {} {} > {}".format(
                inst.threads, infile_name, inst.infile))
            # computation
            results = func(inst, *args, **kwargs)
        inst.infile = infile_name
    else:
        results = func(inst, *args, **kwargs)

    # Compress output and restore inst output file name
    if output_compressed == ".gz":
        # TODO: this uses -f ; should be a
        _log.info("Compressing output into .gz")
        inst.shell("pigz -f -p {} {}".format(inst.threads, inst.outfile))
        inst.outfile = inst.outfile + ".gz"
    elif output_compressed == ".bz2":
        _log.info("Compressing output into .bz2")
        inst.shell("pbzip2 -f -p{} {}".format(inst.threads, inst.outfile))
        inst.outfile = inst.outfile + ".bz2"
    elif output_compressed == ".dsrc":  # !!! only for FastQ files
        _log.info("Compressing output into .dsrc")
        inst.shell("dsrc c -t{} {} {}.dsrc".format(
            inst.threads, inst.outfile, inst.outfile))
        inst.outfile = inst.outfile + ".dsrc"
    return results
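# Illustration only -- not part of the original module. The `wrapped`
# function above is the inner closure of a decorator (the decorated method
# is called `func` in the code). Applied to a conversion method, it lets the
# method operate on uncompressed paths while the wrapper transparently
# handles .gz/.bz2/.dsrc suffixes on input and output. The decorator and
# converter names below are hypothetical:
#
#     class FASTQ2FASTA(ConvBase):              # hypothetical converter
#         @compressor                           # hypothetical decorator name
#         def _method_example(self, *args, **kwargs):
#             # here self.infile is uncompressed and self.outfile carries no
#             # compression suffix; compression is restored by the wrapper
#             self.shell("some_tool {} > {}".format(self.infile, self.outfile))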
def test_kraken_results():
    test_file = sequana_data("test_kraken.out", "testing")
    k = KrakenResults(test_file)
    df = k.plot(kind='pie')
    k.boxplot_classified_vs_read_length()
    print(df)
    df = k.plot(kind='barh')

    df = k.get_taxonomy_db(11234)
    assert 11234 in df.index

    from easydev import TempFile
    with TempFile() as fout:
        k.kraken_to_csv(fout.name, "toydb")
        k.kraken_to_json(fout.name, "toydb")
        k.kraken_to_krona(fout.name)
        k.to_js(fout.name)

    df = k.plot2(kind='pie')
def test_SIRV():
    with TempFile() as fh:
        data = sequana_data("test_sirv.xls")
        ss = SIRVReference()
        ss.from_excel(data)
        ss.to_fasta(fh.name)

        sirv = SIRV(fh.name)
        assert sirv.group_lengths == {
            'SIRV1': 7, 'SIRV2': 6, 'SIRV3': 11, 'SIRV4': 7,
            'SIRV5': 12, 'SIRV6': 18, 'SIRV7': 7}
        assert sum(sirv.SIRV.lengths) == 75469
def _test_conv(method):
    # the input file
    infile = bioconvert_data('test_measles.sorted.bam')

    # What is the expected md5sum of the final output file?
    expected_outputfile = bioconvert_data('test_measles.bigwig')
    md5out = md5(expected_outputfile)

    # Call convert and check that the created output file has the correct md5sum
    with TempFile(suffix=".bigwig") as outfile:
        convert = BAM2BIGWIG(infile, outfile.name)
        if method == 'ucsc':
            convert(method=method,
                    chrom_sizes=bioconvert_data("hg38.chrom.sizes"))
            # TODO: failed in Oct 2018. Why? bamCoverage version in header?
            #assert md5(outfile.name) == '61abd0de51bd614136ad85ae0a1ff85b', \
            #    "{} failed".format(method)
        else:
            convert(method=method)
def test_conv():
    infile = bioconvert_data("test_maf2sam.maf")
    outfile = bioconvert_data("test_maf2sam.sam")
    with TempFile(suffix=".sam") as tempfile:
        convert = MAF2SAM(infile, tempfile.name)
        convert(method="python")

        # In the SAM, the version may differ when using another bioconvert
        # version, so we need to get rid of the lines that contain the
        # version and program information
        data1 = open(outfile).readlines()
        data1 = [x for x in data1 if "bioconvert" not in x]
        data1 = "\n".join(data1)

        data2 = open(tempfile.name).readlines()
        data2 = [x for x in data2 if "bioconvert" not in x]
        data2 = "\n".join(data2)

        assert data1 == data2
def test_fasta2clustal_goalign():
    infile = bioconvert_data("goalign.fasta")
    outfile = bioconvert_data("goalign.clustal")
    with TempFile(suffix=".clustal") as tempfile:
        converter = FASTA2CLUSTAL(infile, tempfile.name)
        converter(method='goalign')

        # We remove the goalign version from the first line
        out = ""
        with open(tempfile.name) as f:
            lines = f.readlines()
            if len(lines) > 0:
                clustal = lines[0].split(" ")
                if len(clustal) > 0:
                    lines[0] = clustal[0] + "\n"
                out = ''.join(lines)

        # Check that the output is correct with a checksum
        assert hashlib.md5(out.encode('utf-8')).hexdigest() == md5(outfile)
def test_bam2tsv():
    infile = bioconvert_data("test_measles.sorted.bam")
    #expected_outfile = bioconvert_data("test_measles.tsv")
    with TempFile(suffix=".tsv") as tempfile:
        convert = BAM2TSV(infile, tempfile.name)
        convert(method="pysam")
        # Impossible to track down why this test fails on Python 3.6.
        # It looks like pytest changes the behaviour of the BAM2TSV class
        # when saving the file: only the header is saved and the content
        # of the BAM file is not.
        if sys.version_info[0] == 3 and sys.version_info[1] == 6:
            pass
        else:
            assert md5(tempfile.name) == "4c5f3336be8a03c95a6c56be28581fb7"
        convert(method="samtools")
        assert md5(tempfile.name) == "4c5f3336be8a03c95a6c56be28581fb7"
def test_conv():
    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".fa") as tempfile:
        convert = BAM2Fasta(infile, tempfile.name)
        convert(method="bamtools")
        # Check that the output is correct with a checksum.
        # Note that we cannot test the md5 on a gzip file but only on the
        # original data. This checksum was computed from the unzipped
        # version of biokit/data/converters/measles.bed
        assert md5(tempfile.name) == "ea5511c3c8913626be152609887c8c4d"

        convert = BAM2Fasta(infile, tempfile.name)
        convert(method="samtools")
        # samtools 1.6 / htslib 1.6 gives different results on Travis and
        # locally
        assert md5(tempfile.name) in [
            "0924d2a11b43094680d1a7374aaaa45e",
            "cc9afcef458f3402fbdef1a091e05c39"]
def test_read_write_from_cnograph():
    c = CNOGraph(cnodata("PKN-ToyPB.sif"))
    fh = TempFile(suffix='.xml')
    c.to_sbmlqual(fh.name)
    c2 = CNOGraph(fh.name)
    assert c == c2
    fh.delete()

    c = CNOGraph(cnodata("PKN-ToyPB.sif"))
    c.expand_and_gates()
    fh = TempFile(suffix='.xml')
    c.to_sbmlqual(fh.name)
    c2 = CNOGraph(fh.name)
    fh.delete()
    assert c == c2
def test_conv():
    infile = bioconvert_data("test_measles.sorted.bam")
    #outfile = biokit_data("converters/measles.sam")
    with TempFile(suffix=".bam") as tempfile:
        convert = BAM2SAM(infile, tempfile.name)
        convert()
        # Check that the output is correct with a checksum.
        # Note that we cannot test the md5 on a gzip file but only on the
        # original data. This checksum was computed from the unzipped
        # version of biokit/data/converters/measles.bed
        #assert md5(tempfile.name) == md5(outfile)

        # the output is a SAM that can be read and must contain 60 alignments
        import pysam
        sam = pysam.AlignmentFile(tempfile.name)
        assert sam.count() == 60

        convert(method="pysam")

        convert = BAM2SAM(infile, tempfile.name)
        convert(method="sambamba")
        assert md5(tempfile.name) == "ad83af4d159005a77914c5503bc43802"
def test_gz2dsrc():
    """Test that a fastq.gz file is converted as expected to a fastq
    .dsrc file"""
    from bioconvert import bioconvert_data
    in_gz = bioconvert_data("test_SP1.fq.gz")
    exp_fq = bioconvert_data("exp_SP1.fq")
    with TempFile(suffix=".dsrc") as tempfile:
        converter = GZ2DSRC(in_gz, tempfile.name)
        converter()

        # Uncompress the created dsrc file and compare the uncompressed file
        # to the expected one. We do not directly compare dsrc files as
        # their content depends on the dsrc version used...
        assert os.path.isfile(tempfile.name)
        tmp_fq = tempfile.name + ".fq"
        cmd = "dsrc d {} {}".format(tempfile.name, tmp_fq)
        subprocess.call(cmd.split())

        # Check that the output is correct with a checksum
        assert md5(tmp_fq) == md5(exp_fq)
def test_dsrc2gz():
    """Test that a fastq .dsrc file is converted as expected to a
    fastq.gz file"""
    from bioconvert import bioconvert_data
    infile = bioconvert_data("test_SP1.fq.dsrc")
    with TempFile(suffix=".fq.gz") as tempfile:
        converter = DSRC2GZ(infile, tempfile.name)
        converter()

        # Uncompress the created file and compare the uncompressed content
        # to the expected md5. We do not directly compare dsrc or gz files
        # as their compression is not deterministic.
        assert os.path.isfile(tempfile.name)
        cmd = "gunzip -c {} | md5sum -".format(tempfile.name)
        res = subprocess.check_output(cmd, shell=True)
        res = res.split()[0].decode()

        # Check that the output is correct with a checksum
        assert res == "d41d8cd98f00b204e9800998ecf8427e"
def test_summary():
    s = Summary("test2", sample_name="chr1", data={"mean": 1})
    assert s.data == {"mean": 1}
    assert s.version
    assert s.date

    d = s.as_dict()
    assert "name" in d
    assert "version" in d
    assert "data" in d
    assert "date" in d

    # test wrong constructors
    try:
        s = Summary("test")
        assert False
    except:
        assert True

    try:
        s = Summary("test", "test")
        assert False
    except:
        assert True

    # test data_description
    s = Summary("test2", data={"mean": 1})
    s.data_description = {"mean": "mean of the data set"}
    assert s.data_description == {"mean": "mean of the data set"}

    try:
        s.data_description = {"dummy": 1}
        assert False
    except:
        assert True

    from easydev import TempFile
    with TempFile(suffix=".json") as fh:
        s.to_json(fh.name)
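# For illustration only (field values made up): the dictionary serialised by
# as_dict()/to_json() in test_summary above is expected to contain at least
# the keys checked by the test, e.g.
#
#     {"name": "test2", "version": "0.x", "date": "...",
#      "data": {"mean": 1}}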
def test_sra2fastq_gz(method):
    infile = "SRR390728"
    outfile = bioconvert_data("SRR390728_1.fastq")
    outfile2 = bioconvert_data("SRR390728_2.fastq")
    with TempFile(suffix=".fastq.gz") as tempfile:
        converter = SRA2FASTQ(infile, tempfile.name, True)
        converter(method=method)

        outbasename, ext = os.path.splitext(tempfile.name)
        if ext == ".gz":
            outbasename, ext = os.path.splitext(outbasename)

        with gzip.open(outbasename + "_1.fastq.gz", 'rb') as f_in, \
                open(outbasename + "_1.fastq", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        with gzip.open(outbasename + "_2.fastq.gz", 'rb') as f_in, \
                open(outbasename + "_2.fastq", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Check that the output is correct with a checksum
        assert md5(outbasename + "_1.fastq") == md5(outfile)
        assert md5(outbasename + "_2.fastq") == md5(outfile2)
def _method_wiggletools(self, *args, **kwargs):
    """ """
    import os
    from easydev import TempFile

    # we need a unique name that does not exist, for the symlink.
    # Fixes #233
    fname = None
    with TempFile(suffix=".bb") as ftemp:
        fname = ftemp.name
        os.symlink(os.path.abspath(self.infile), ftemp.name)

        try:
            cmd = "wiggletools {} > {}".format(ftemp.name, self.outfile)
            self.execute(cmd)
        except Exception as err:
            raise(err)
        finally:
            # clean the symlink
            os.unlink(fname)
def test_af1():
    data = sequana_data("test_vcf_mpileup_4dot1.vcf")
    v = VCF(data)
    variant = next(v.vcf)
    variant.INFO['AF1'] = 1
    assert v.vcf.is_valid_af1(variant) is True
    variant.INFO['AF1'] = 0.5
    assert v.vcf.is_valid_af1(variant) is False

    # polymorphic case
    variant = next(v.vcf)
    variant.INFO['AF1'] = 1
    assert v.vcf.is_valid_af1(variant) is False
    #variant.INFO['AF1'] = 0.5
    #assert v.vcf.is_valid_af1(variant) is True

    v = VCF(data)
    v.vcf.apply_af1_filter = True
    with TempFile() as fh:
        res = v.vcf.filter_vcf(fh.name)
        assert res == {'N': 573, 'filtered': 391, 'unfiltered': 182}
def test_copy_requirements():
    # We need 4 cases:
    # 1- http
    # 2- a sequana file (phix)
    # 3- an existing file elsewhere (here just a temporary file)
    # 4- an existing file in the same directory as the target dir
    from easydev import TempFile
    fh = tempfile.TemporaryDirectory()
    targetdir = fh.name

    # Case 3: a temporary file
    temprequire = TempFile()

    # Case 4: a local file (copy of the temp file)
    # TODO
    #localfile = temprequire.name.split(os.sep)[-1]
    #shutil.copy(temprequire.name, targetdir)

    cfg = snaketools.SequanaConfig()
    cfg.config.requirements = [
        "phiX174.fa",
        temprequire.name,
        #localfile,
        "https://raw.githubusercontent.com/sequana/sequana/master/README.rst"]
    cfg._update_yaml()
    cfg.copy_requirements(target=fh.name)

    # error
    cfg.config.requirements = ['dummy']
    try:
        cfg.copy_requirements(target=fh.name)
        assert False
    except:
        assert True
def _method_chain(self, *args, **kwargs):
    """This method successively uses the default conversion method of
    each converter in the conversion path."""

    def conv_step(converter, infile, outfile):
        """Perform one conversion step."""
        converter(infile, outfile)(*args, **kwargs)

    # Contains the last temporary output file, if any
    pipe_files = deque()
    for (step_num, ((_, out_fmt), converter)) \
            in enumerate(self.converter_map, start=1):
        if step_num == 1:
            # May not be necessary:
            step_infile = None
            step_input = self.infile
            del_infile = False
        else:
            step_infile = pipe_files.popleft()
            step_input = step_infile.name
            del_infile = True

        if step_num == self.nb_steps:
            # May not be necessary:
            step_outfile = None
            step_output = self.outfile
        else:
            # FIXME: handle multiple-IO converters
            if len(out_fmt) == 1:
                step_outfile = TempFile(suffix=out_fmt[0].lower())
                step_output = step_outfile.name
                pipe_files.append(step_outfile)

        conv_step(converter, step_input, step_output)
        if del_infile:
            step_infile.delete()
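# Illustration only (hypothetical formats): with a conversion path
# A -> B -> C, _method_chain above runs two steps. Step 1 reads self.infile
# and writes into a TempFile(suffix="b") appended to pipe_files; step 2 pops
# that temporary file, reads it, writes the final self.outfile, and removes
# the intermediate via step_infile.delete(). At most one temporary file is
# alive at a time since each step consumes the output of the previous one.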
def test_stats_file():
    data = sequana_data("test_demultiplex_Stats.json")
    s = StatsFile(data)
    with TempFile() as fout:
        s.to_summary_reads(fout.name)
    with TempFile() as fout:
        s.barplot_summary(fout.name)
    with TempFile() as fout:
        s.barplot()
    for lane in s.get_data_reads().lane.unique():
        os.remove("lane{}_status.png".format(lane))

    data = sequana_data("test_demultiplex_Stats_undetermined.json")
    s = StatsFile(data)
    with TempFile() as fout:
        s.to_summary_reads(fout.name)
    with TempFile() as fout:
        s.barplot_summary(fout.name)
    with TempFile() as fout:
        s.barplot()
    for lane in s.get_data_reads().lane.unique():
        os.remove("lane{}_status.png".format(lane))
def test_xmfa2phy(method):
    infile = bioconvert_data("test_phylip2xmfa.xmfa")
    #outfile = bioconvert_data("test_phylip2xmfa.xmfa")
    with TempFile(suffix=".xmfa") as tempfile:
        converter = XMFA2PHYLIP(infile, tempfile.name)
        converter(method=method)
def create_graph(filename, layout="dot", use_singularity=False):
    """
    :param filename: should end in .png or .svg or .dot

    If the extension is .dot, only the dot file is created. This is useful
    if you have issues installing graphviz. If so, under Linux you could
    use our singularity container; see github.com/cokelaer/graphviz4all
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    try:
        if filename.endswith(".dot") or use_singularity is True:
            raise  # force the fallback (dot-file) code path below
        from pygraphviz import AGraph
        dg = AGraph(directed=True)

        for a, b in rr.get_conversions():
            dg.add_edge(a, b)

        dg.layout(layout)
        dg.draw(filename)
    except:
        dot = """
strict digraph{
    node [label="\\N"];

"""
        nodes = set([item for items in rr.get_conversions()
                     for item in items])

        for node in nodes:
            dot += "\"{}\";\n".format(node)
        for a, b in rr.get_conversions():
            dot += "\"{}\" -> \"{}\";\n".format(a, b)
        dot += "}\n"

        from easydev import TempFile
        from bioconvert import shell
        dotfile = TempFile(suffix=".dot")
        with open(dotfile.name, "w") as fout:
            fout.write(dot)

        dotpath = ""
        if use_singularity:
            from bioconvert.core.downloader import download_singularity_image
            singfile = download_singularity_image(
                "graphviz.simg",
                "shub://cokelaer/graphviz4all:v1",
                "4288088d91c848e5e3a327282a1ab3d1")

            dotpath = "singularity run {} ".format(singfile)
            on_rtd = environ.get('READTHEDOCS', None) == 'True'
            if on_rtd:
                dotpath = ""

        ext = filename.rsplit(".", 1)[1]
        cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name, filename)
        try:
            shell(cmd)
        except:
            import os
            os.system(cmd)
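# Usage sketch for create_graph, assuming graphviz/pygraphviz is installed
# (the output file names are arbitrary examples):
#
#     create_graph("conversions.png")                         # rendered image
#     create_graph("conversions.dot")                         # dot file only
#     create_graph("conversions.svg", use_singularity=True)   # via container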
def main(args=None):
    user_options = Options(prog="sequana")

    if args is None:
        args = sys.argv

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    if options.version:
        import sequana
        print(sequana.version)
        sys.exit()

    if options.jobs > 20 and options.bypass is False:
        raise ValueError('The number of jobs is limited to 20. You can ' +
                         'bypass this limit by using --bypass-job-limit')

    if misc.on_cluster("tars-") and options.unlock is False:
        if options.cluster is None:
            raise ValueError("You are on TARS (Institut Pasteur). You " +
                "must use the --cluster option to provide the scheduler " +
                "options (typically ' --cluster 'sbatch --qos normal' )")

    # valid codecs:
    valid_extensions = [("fastq." + ext2).rstrip(".")
                        for ext2 in ['', 'bz2', 'gz', 'dsrc']]

    valid_extensions += [("fq." + ext2).rstrip(".")
                         for ext2 in ['', 'bz2', 'gz', 'dsrc']]

    valid_combos = [(x, y) for x in valid_extensions
                    for y in valid_extensions
                    if x != y]

    if (options.source, options.target) not in valid_combos:
        raise ValueError("""--target and --source combo not valid.
Must be one of fastq, fastq.gz, fastq.bz2 or fastq.dsrc""")

    # Create the config file locally
    module = Module("compressor")

    with TempFile(suffix=".yaml", dir=".") as temp:
        cfg = SequanaConfig(module.config)
        cfg.config.compressor.source = options.source
        cfg.config.compressor.target = options.target
        cfg.config.compressor.recursive = options.recursive
        cfg.config.compressor.verbose = options.verbose
        cfg.config.compressor.threads = options.threads
        cfg._update_yaml()
        cfg.save(filename=temp.name)

        # The Snakefile can stay in its original place:
        rule = module.path + os.sep + "compressor.rules"

        # Run the snakemake command itself.
        cmd = 'snakemake -s %s --configfile %s -j %s ' % \
            (rule, temp.name, options.jobs)

        if options.dryrun:
            cmd += " --dryrun "

        if options.verbose is False:
            cmd += " --quiet "
        else:
            cmd += " -p "

        # for slurm only: --cores-per-socket
        if options.cluster:
            cluster = ' --cluster "%s" ' % options.cluster
            cmd += cluster

        if options.snakemake:
            if " -s " in options.snakemake or " -j " in options.snakemake:
                raise ValueError("-s or -j cannot be used in " +
                    "--snakemake-options (already used internally)")
            cmd += options.snakemake

        if options.unlock:
            cmd += " --unlock "

        if options.verbose:
            print(cmd)

        # On travis, the snakemake.shell command from snakemake fails,
        # most probably because travis itself uses a subprocess.
        # execute from easydev uses pexpect.spawn, which seems to work well.
        from easydev import execute
        execute(cmd, showcmd=False)
def test_converter1():
    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bed") as tempfile:
        import sys
        sys.argv = ["bioconvert", infile, tempfile.name, "--force"]
        converter.main()
def test_wget():
    from easydev import TempFile
    with TempFile() as fh:
        wget("https://github.com/sequana/sequana/raw/master/README.rst",
             fh.name)
def test_fasta():
    with TempFile(suffix=".fasta") as fout:
        f = fasta.FastaSim(fout.name)
        f.nreads = 1000
        f.simulate()
def test_savePathwayAs(wikipath):
    # Note that not all WP entries have the PDF format available.
    # WP4 did not (March 2018)
    with TempFile(suffix=".png") as fout:
        wikipath.savePathwayAs("WP232", fout.name, display=False)
def test_converter():
    infile = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".bed") as tempfile:
        cmd = "bioconvert %s %s --force" % (infile, tempfile.name)
        # use call() rather than Popen() so that the conversion finishes
        # before the TempFile context exits and removes the output file
        subprocess.call(cmd, shell=True)
def test_genomecov():
    filename = sequana_data('JB409847.bed')

    # wrong file
    try:
        bed = bedtools.GenomeCov("dummy.csv")
        assert False
    except:
        assert True

    # wrong genbank
    try:
        bed = bedtools.GenomeCov(filename, "dummy.gbk")
        assert False
    except:
        assert True

    # now let us read the good data sets
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    bed.compute_coverage(4001)

    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    bed2 = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'))
    assert bed == bed

    # test equality for the same chromosome but different data
    bed2.chr_list[0].df["cov"] += 100
    assert bed != bed2

    # test equality for the same data but different chromosome lists
    bed2.chr_list[0].df["cov"] -= 100
    bed2.chr_list.append("dummy")
    assert bed != bed2

    # setter must be bool
    try:
        bed.circular = 1
        assert False
    except:
        assert True

    # cannot use the setter
    try:
        bed.feature_dict = {}
        assert False
    except:
        assert True

    assert len(bed) == 1

    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4001
    bed.hist()

    # This requires other methods to be called first
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)
        chrom.compute_zscore()
        roi = chrom.get_roi()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)
        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.get_mean_cov()
        chrom.get_var_coef()

    with TempFile(suffix='.csv') as fh:
        bed.to_csv(fh.name)
        bed2 = bedtools.GenomeCov(fh.name, sequana_data('JB409847.gbk'))

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False, logy=False,
                                           filename=fh.name)
def test_genomecov():
    filename = sequana_data('JB409847.bed')

    # wrong file
    try:
        bed = bedtools.GenomeCov("dummy.csv")
        assert False
    except:
        assert True

    # wrong threshold
    try:
        bed = bedtools.GenomeCov(filename, high_threshold=2)
        assert False
    except:
        assert True

    # wrong threshold
    try:
        bed = bedtools.GenomeCov(filename, low_threshold=-2)
        assert False
    except:
        assert True

    # wrong genbank
    try:
        bed = bedtools.GenomeCov(filename, "dummy.gbk")
        assert False
    except:
        assert True

    # now let us read the good data sets by chunks
    bed = bedtools.GenomeCov(filename, sequana_data('JB409847.gbk'),
                             chunksize=5000)
    for c in bed.chr_list:
        c.run(1001, k=2)

    # setter must be bool
    try:
        bed.circular = 1
        assert False
    except:
        assert True

    # cannot use the setter
    try:
        bed.feature_dict = {}
        assert False
    except:
        assert True

    assert len(bed) == 1

    # a getter for the first chromosome
    bed[0]

    # setter available but not sure this is useful
    bed.window_size = 4000
    bed.window_size = 4001
    bed.hist()

    # This requires other methods to be called first
    for chrom in bed:
        chrom.moving_average(n=501)
        chrom.running_median(n=501, circular=True)
        chrom.running_median(n=501, circular=False)
        chrom.compute_zscore()
        roi = chrom.get_rois()
        with TempFile(suffix='.png') as fh:
            chrom.plot_coverage(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_zscore(filename=fh.name)
        with TempFile(suffix='.png') as fh:
            chrom.plot_hist_normalized_coverage(filename=fh.name)
        len(chrom)
        print(chrom)
        chrom.get_size()
        chrom.DOC
        chrom.CV

    with TempFile(suffix='.csv') as fh:
        bed.gc_window_size = 100
        bed.to_csv(fh.name)

    # plotting
    bed.chr_list[0].plot_hist_coverage()
    bed.chr_list[0].plot_hist_coverage(logx=False, logy=True)
    bed.chr_list[0].plot_hist_coverage(logx=True, logy=False)
    with TempFile(suffix=".png") as fh:
        bed.chr_list[0].plot_hist_coverage(logx=False, logy=False,
                                           filename=fh.name)