def create_summary_pages(self):
    """Build the HTML summary (``index.html``) pages.

    Call this once the main analysis (:meth:`analyse`) is done and the
    company packages exist (:meth:`create_data_packages_for_companies`).
    A summary page is written for the tissues, a similar one for the
    tissues of each company, and finally a summary page for the
    companies themselves.

    The resulting directory tree looks like::

        |-- index.html
        |-- company_packages
        |   |-- index.html
        |   |-- Company1
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |   |-- Company2
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |-- tissue_packages
        |   |-- index.html
        |   |-- Tissue1
        |   |-- Tissue2

    """
    # Tissue summary in the main directory (tissue_packages)
    print(purple("Creating summary index.html for the tissues"))
    self._create_summary_pages(self.main_directory, verbose=False)

    # One summary per company. A failure for one company is reported and
    # the loop carries on with the remaining companies.
    print(purple("Creating summary index.html for each company"))
    progress = Progress(len(self.companies))
    for done, company in enumerate(self.companies, start=1):
        try:
            self._create_summary_pages(
                self.company_directory + os.sep + company,
                verbose=False,
                company=company)
        except Exception as err:
            print(red("Issue with %s. Continue with other companies" % company))
            print(err)
        progress.animate(done)

    # Top-level index pointing towards each company
    self._create_main_index()
def create_summary_pages(self):
    """Create all summary pages.

    To be run after the main analysis (:meth:`analyse`) and after the
    company packages were created
    (:meth:`create_data_packages_for_companies`). It writes a summary
    HTML page (index.html) for the tissues, an equivalent page for the
    tissues of every company, and a last page listing the companies.

    Final directory tree::

        |-- index.html
        |-- company_packages
        |   |-- index.html
        |   |-- Company1
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |   |-- Company2
        |   |   |-- Tissue1
        |   |   |-- Tissue2
        |   |   |-- index.html
        |-- tissue_packages
        |   |-- index.html
        |   |-- Tissue1
        |   |-- Tissue2

    """
    # Step 1: main directory (tissue_packages)
    print(purple("Creating summary index.html for the tissues"))
    self._create_summary_pages(self.main_directory, verbose=False)

    # Step 2: each company; errors are printed, never propagated
    print(purple("Creating summary index.html for each company"))
    pb = Progress(len(self.companies))
    processed = 0
    for company in self.companies:
        try:
            self._create_summary_pages(
                self.company_directory + os.sep + company,
                verbose=False, company=company)
        except Exception as err:
            warning = "Issue with %s. Continue with other companies" % company
            print(red(warning))
            print(err)
        processed += 1
        pb.animate(processed)

    # Step 3: index towards each company
    self._create_main_index()
def _analyse_all(self, multicore=None):
    """Run the ANOVA and build the HTML report for each genomic-feature file.

    For every file in ``self.gf_filenames`` (processed in sorted order) the
    TCGA identifier is extracted from the file name (the text between the
    first ``_`` and the following ``.``), a sub-directory of
    ``self.main_directory`` is created for it, the ANOVA is computed and
    stored in ``self.results[tcga]``, and the HTML pages are created.

    :param multicore: forwarded unchanged to ``ANOVA.anova_all``.
    """
    for gf_filename in sorted(self.gf_filenames):
        tcga = gf_filename.split("_")[1].split('.')[0]
        print(purple('======================== Analysing %s data' % tcga))

        self.mkdir(self.main_directory + os.sep + tcga)

        # Computes the ANOVA. The IC50 file is first read with IC50 and,
        # if that fails, with IC50Cluster. Only regular errors trigger the
        # fallback: KeyboardInterrupt/SystemExit must propagate.
        try:
            self.ic50 = IC50(self.ic50_filename)
        except Exception:
            print("Clustering IC50 (v18 released data ?)")
            self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)
        an = ANOVA(self.ic50, gf_filename, self.drug_decode, verbose=False)

        # In test mode only the first 15 feature columns are kept
        if self.test is True:
            an.features.df = an.features.df[an.features.df.columns[0:15]]

        self.an = an
        an.settings = ANOVASettings(**self.settings)
        an.settings.analysis_type = tcga
        an.init()  # This reset the directory

        results = an.anova_all(multicore=multicore)
        # The output directory is assigned after init() (see note above)
        an.settings.directory = self.main_directory + os.sep + tcga

        # Store the results
        self.results[tcga] = results

        print('Analysing %s data and creating images' % tcga)
        self.report = ANOVAReport(an)
        self.report.settings.savefig = True
        self.report.create_html_pages(onweb=False)
def check_ipython_notebook():
    """Run every notebook matching ``*ipynb`` in the current directory.

    Each notebook is read as JSON, wrapped in a ``NotebookRunner`` and
    executed; a progress bar is updated after each notebook. Any exception
    raised while running a notebook propagates to the caller.
    """
    notebooks = glob.glob("*ipynb")
    pb = Progress(len(notebooks))
    for i, filename in enumerate(notebooks):
        print(purple(filename))
        # Close the file handle as soon as the notebook is parsed
        # (assumes read() consumes the stream eagerly -- TODO confirm).
        with open(filename) as stream:
            notebook = read(stream, 'json')
        r = NotebookRunner(notebook)
        r.run_notebook()
        pb.animate(i + 1)
def check_ipython_notebook():
    """Execute all ``*ipynb`` notebooks found in the current directory.

    For each notebook: print its name, parse it as JSON, run it through
    ``NotebookRunner`` and advance the progress bar. Errors raised by a
    notebook are not caught here.
    """
    notebooks = glob.glob("*ipynb")
    pb = Progress(len(notebooks))
    for count, filename in enumerate(notebooks, start=1):
        print(purple(filename))
        # Use a context manager so the file is not left open
        # (assumes read() parses the whole stream at once -- TODO confirm).
        with open(filename) as handle:
            notebook = read(handle, 'json')
        runner = NotebookRunner(notebook)
        runner.run_notebook()
        pb.animate(count)
def main(args=None):
    """Entry point of the ``sequana_substractor`` standalone.

    :param args: command-line arguments including the program name as
        first item; defaults to a copy of ``sys.argv``. The list given by
        the caller is not modified.

    With no argument besides the program name, ``--help`` is assumed.
    ``--version`` prints the sequana version and exits with status 0.
    Otherwise the references are collected (``--reference`` and
    ``--references``, with glob expansion) and the ``Substractor`` is run.
    """
    # Work on a private copy so that the caller's list is never mutated
    args = sys.argv[:] if args is None else list(args)

    print(purple("Welcome to sequana_substractor"))
    print(purple("WARNING. TESTED ON LONG READS ONLY. EXPERIMENTAL"))
    user_options = Options(prog="sequana_substractor")

    if len(args) == 1:
        args.append("--help")

    # Look at the arguments actually given to this function, not at the
    # process-wide sys.argv.
    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])

    logger.setLevel(options.level)

    # build the references list; keep --reference even when --references
    # is provided as well.
    references = []
    if options.reference:
        references.append(options.reference)
    if options.references:
        references.extend(options.references)
    options.references = references

    # expand globs if any (patterns without any match contribute nothing)
    references = []
    for ref in options.references:
        references.extend(glob.glob(ref))

    logger.info("{} references provided: {}".format(
        len(references), ",".join(references)))

    # call the entire machinery here
    sub = Substractor(options.input, references, options.outdir,
                      options.mapper, options.threads)
    sub.run(options.outfile)
def main(args=None):
    """Entry point of the ``sequana_bam_splitter`` standalone.

    :param args: command-line arguments including the program name as
        first item; defaults to a copy of ``sys.argv``. The list given by
        the caller is not modified.

    With no argument besides the program name, ``--help`` is assumed.
    ``--version`` prints the sequana version and exits with status 0.
    The output prefix is ``--prefix`` when given, otherwise the input
    file name without its ``.bam`` suffix. When ``--outdir`` is set, the
    directory is created if needed and prepended to the prefix.
    """
    # Work on a private copy so that the caller's list is never mutated
    args = sys.argv[:] if args is None else list(args)

    print(purple("Welcome to sequana_bam_splitter"))
    user_options = Options(prog="sequana_bam_splitter")

    if len(args) == 1:
        args.append("--help")

    # Look at the arguments actually given to this function
    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])

    # set the level
    logger.level = options.level

    logger.info("This SAM/BAM/CRAM splitter is used for paired or un-paired "
                "reads with perfectly mapped or unmapped reads (flags 0, 4, "
                "16). Others are dropped.")

    logger.info("Reading {}".format(options.input))

    # What prefix used for the output filename ?
    if options.prefix is None:
        prefix = options.input
        # Remove the ".bam" suffix only. str.rstrip(".bam") would strip any
        # trailing '.', 'b', 'a' or 'm' character (e.g. "lamb.bam" -> "l").
        if prefix.endswith(".bam"):
            prefix = prefix[:-len(".bam")]
        if options.outdir:
            # The input may live in another directory; only its file name
            # is meaningful inside the output directory.
            prefix = os.path.basename(prefix)
    else:
        prefix = options.prefix

    if options.outdir:
        prefix = options.outdir + os.sep + prefix
        if os.path.exists(options.outdir) is False:
            from easydev import mkdirs
            logger.info("Creating {} directory".format(options.outdir))
            mkdirs(options.outdir)

    match, unmatch, flags = _main(options.input, prefix,
                                  keep_unmapped=options.keep_unmapped)

    logger.info("Matched: {}".format(match))
    logger.info("Unmatched (flag 4 and 256): {}".format(unmatch))
    logger.info("All flags: {}".format(Counter(flags)))
def __init__(self, prog="sequana_coverage"):
    """Build the argument parser of the coverage standalone.

    :param str prog: program name forwarded to the underlying parser.

    The following argument groups are registered: required argument
    (``--input``), optional biological arguments, general options,
    annotation, GC content, running median and reference download.
    """
    usage = purple("""\nWelcome to SEQUANA -- Coverage standalone

    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
    file. In addition, running median used in conjunction with double thresholds
    extract regions of interests (low or high coverage). A reference may be
    provided to plot the coverage versus GC content.

    The input file should be one of the following:

    - a BED file that is a tabulated file at least 3 columns.
      The first column being the reference, the second is the position
      and the third column contains the coverage itself.
    - or a BAM file that is converted automatically
      into a BED file using the following command:

        samtools depth -aa input.bam > output.bed

    If the reference is provided, an additional plot showing the coverage versus
    GC content is also shown.

    Here are some examples

        sequana_coverage --input file.bed --window-median 1001
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>

    An other interesting option is to provide a BED file with 4 columns. The
    fourth column being another coverage data created with a filter. One can
    create such a file only from the BAM file using samtools as follows given
    the original unfiltered BAM file named input.bam:

        samtools view -q 35 -o data.filtered.bam input.bam
        samtools depth input.bam data.filtered.bam -aa > test.bed
        sequana_coverage --input test.bed --show-html

    Note that the first file is the filtered one, and the second file is the
    unfiltered one.

    Note for multi chromosome and genbank features: for now, you will need to call
    sequana_coverage for each chromosome individually since we accept only one
    genbank as input parameter:

        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1

        """)

    epilog = purple("""
----

AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)

    description = """DESCRIPTION: """

    super(Options, self).__init__(usage=usage, prog=prog,
            description=description, epilog=epilog,
            formatter_class=CustomFormatter)

    # options to fill the config file
    group = self.add_argument_group("Required argument")
    group.add_argument("-i", "--input", dest="input", type=str,
        help=("Input file in BED or BAM format. If a BAM file is "
              "provided, it will be converted locally to a BED file "
              "using genomecov, which must be installed."))

    group = self.add_argument_group("Optional biological arguments")
    group.add_argument(
        '-c', "--chromosome", dest="chromosome", type=int, default=0,
        help="Chromosome number (if only one, no need to use: the single"
             " chromosome is chosen automatically). Default is "
             " first chromosome found in the BED file. You may want to"
             " analyse all chromosomes at the same time. If so, set this"
             " parameter to 0")
    group.add_argument('-o', "--circular", dest="circular",
        default=False, action="store_true",
        help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""")

    group = self.add_argument_group("General")
    group.add_argument("--output-directory", dest="output_directory",
        default="report", help="name of the output (report) directory.")
    group.add_argument('--show-html', dest="show_html",
        default=False, action='store_true',
        help="""When report is created, you can open the main page
            automatically with this option (default is False)""")
    group.add_argument("-q", "--quiet", dest="verbose",
        default=True, action="store_false")
    group.add_argument('--no-report', dest="create_report",
        default=True, action='store_false',
        help="""Do not create any HTML report""")
    group.add_argument("--logging-level", dest="logging_level",
        default="INFO",
        help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")

    group = self.add_argument_group('Annotation')
    group.add_argument("-b", "--genbank", dest="genbank",
        type=str, default=None, help='a valid genbank annotation')

    # Group related to GC content
    group = self.add_argument_group("GC content related")
    group.add_argument('-r', "--reference", dest="reference", type=str,
        default=None,
        help="""If available, you can provide a reference (ENA/NCBI). It
            must have the same length as the one used to create the
            BAM or BED file. If provided, it is used to create the
            coverage versus GC content image""")
    group.add_argument(
        "-g", "--window-gc", dest="w_gc", type=int, default=201,
        help="""Length of the running window to compute the GC content""")
    group.add_argument('-n', "--nlevels", dest="levels", type=int,
        default=3, help="""Number of levels in the contour""")

    # group running median
    group = self.add_argument_group("Running Median related")
    group.add_argument("-w", "--window-median", dest="w_median", type=int,
        help="""Length of the running median window (default 4001,
            recommended for viruses).  For long genome, 20001 or 30001 is
            recommended but larger windows may be useful in the presence
            of long deleted regions.""",
        default=4001)
    group.add_argument("-k", "--mixture-models", dest="k", type=int,
        help="""Number of mixture models to use (default 2, although if
            sequencing depth is below 8, k is set to 1 automatically).
            To ignore that behaviour set k to the required value""",
        default=None)
    group.add_argument("-L", "--low-threshold", dest="low_threshold",
        default=None, type=float,
        help=("lower threshold (zscore) of the confidence interval. "
              "Overwrite value given by --threshold/-T"))
    group.add_argument("-H", "--high-threshold", dest="high_threshold",
        default=None, type=float,
        help=("higher threshold (zscore) of the confidence interval. "
              "Overwrite value given by --threshold/-T"))
    group.add_argument("-T", "--threshold", dest="threshold",
        default=4, type=float,
        help="""set lower and higher thresholds of the confidence
            interval.""")

    group = self.add_argument_group("Download reference")
    group.add_argument("--download-reference", dest="download_reference",
        default=None, type=str)
    group.add_argument("--download-genbank", dest="download_genbank",
        default=None, type=str)
    group.add_argument("--database", dest="database",
        default="ENA", type=str, choices=["ENA", "EUtils"],
        help="Download the reference from one of these database (default ENA)")
def __init__(self, prog="sequana_coverage"):
    """Build the argument parser of the coverage standalone.

    :param str prog: program name forwarded to the underlying parser.

    The following argument groups are registered: required argument
    (``--input``), optional biological arguments, general options,
    annotation, GC content, running median and clustering, large data /
    CNV detection, and reference download.

    ``--debug-level`` and ``--level`` both store into ``logging_level``.
    ``--chunk-size`` and ``--binning`` rely on the ``Min`` action and its
    ``min`` keyword, which are defined outside of this method.
    """
    # The usage text only mentions options that this parser defines.
    usage = purple("""\nWelcome to SEQUANA -- Coverage standalone

    Extract and plot coverage of one or more chromosomes/contigs in a BED or BAM
    file. In addition, running median used in conjunction with double thresholds
    extract regions of interests (low or high coverage). A reference may be
    provided to plot the coverage versus GC content.

    The input file should be one of the following:

    - a BED file that is a tabulated file at least 3 columns.
      The first column being the reference, the second is the position
      and the third column contains the coverage itself.
    - or a BAM file that is converted automatically
      into a BED file using the following command:

        samtools depth -aa input.bam > output.bed

    If the reference is provided, an additional plot showing the coverage versus
    GC content is also shown.

    Here are some examples

        sequana_coverage --input file.bed --window-median 1001
        sequana_coverage --input file.bam --window-median 1001 -r <REFERENCE.fa>

    An other interesting option is to provide a BED file with 4 columns. The
    fourth column being another coverage data created with a filter. One can
    create such a file only from the BAM file using samtools as follows given
    the original unfiltered BAM file named input.bam:

        samtools view -q 35 -o data.filtered.bam input.bam
        samtools depth input.bam data.filtered.bam -aa > test.bed
        sequana_coverage --input test.bed

    Note that the first file is the filtered one, and the second file is the
    unfiltered one.

    Note for multi chromosome and genbank features: for now, you will need to call
    sequana_coverage for each chromosome individually since we accept only one
    genbank as input parameter:

        sequana_coverage --input file.bed --genbank chrom1.gbk -c 1

    Large genomes:
    --------------

    If your input data is large and does not fit into memory, use the --binning BIN
    options to average data into bin of BIN values.

    CNV cases:
    --------------

    By default, sequana_coverage identify events as small as 1 bin. For the CNV
    detection case, you may want to cluster events. the --cnv-clustering DELTA
    option merges consecutives events whose distance is smaller than DELTA

        """)

    epilog = purple("""
----

AUTHORS: Thomas Cokelaer, Dimitri Desvillechabrol
Documentation: http://sequana.readthedocs.io
Issues: http://github.com/sequana/sequana
        """)

    description = """DESCRIPTION: """

    super(Options, self).__init__(usage=usage, prog=prog,
            description=description, epilog=epilog,
            formatter_class=CustomFormatter)

    # options to fill the config file
    group = self.add_argument_group("Required argument")
    group.add_argument("-i", "--input", dest="input", type=str,
        help=("Input file in BED or BAM format. If a BAM file is "
              "provided, it will be converted locally to a BED file "
              "using genomecov, which must be installed."))

    group = self.add_argument_group("Optional biological arguments")
    group.add_argument(
        '-c', "--chromosome", dest="chromosome", type=int, default=-1,
        help="Chromosome number (if only one chromosome found, the single"
             " chromosome is chosen automatically). Otherwise all "
             "chromosomes are analysed. You may want to analyse only one"
             " in which case, use this parameter (e.g., -c 1). "
             "!!START AT INDEX 0 !!")
    group.add_argument('-o', "--circular", dest="circular",
        default=False, action="store_true",
        help="""If the DNA of the organism is circular (typically
            viruses or bacteria), set to True""")

    group = self.add_argument_group("General")
    group.add_argument("--output-directory", dest="output_directory",
        default="report", help="name of the output (report) directory.")
    group.add_argument("-q", "--quiet", dest="verbose",
        default=True, action="store_false")
    group.add_argument('--no-html', dest="skip_html",
        default=False, action='store_true',
        help="""Do not create any HTML reports. Save ROIs and statistics
            only.""")
    group.add_argument('--no-multiqc', dest="skip_multiqc",
        default=False, action='store_true',
        help="""Do not create any multiqc HTML page.""")
    group.add_argument("--debug-level", dest="logging_level",
        default="INFO",
        help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")
    group.add_argument("--level", dest="logging_level",
        default="INFO",
        help="set to DEBUG, INFO, WARNING, CRITICAL, ERROR")

    group = self.add_argument_group('Annotation')
    group.add_argument("-b", "--genbank", dest="genbank",
        type=str, default=None, help='a valid genbank annotation')

    # Group related to GC content
    group = self.add_argument_group("GC content related")
    group.add_argument('-r', "--reference", dest="reference", type=str,
        default=None,
        help="""If available, you can provide a reference (ENA/NCBI). It
            must have the same length as the one used to create the
            BAM or BED file. If provided, it is used to create the
            coverage versus GC content image""")
    group.add_argument(
        "-g", "--window-gc", dest="w_gc", type=int, default=201,
        help="""Length of the running window to compute the GC content""")
    group.add_argument('-n', "--nlevels", dest="levels", type=int,
        default=3, help="""Number of levels in the contour""")

    # group running median
    group = self.add_argument_group("Running Median and clustering related")
    group.add_argument("-w", "--window-median", dest="w_median", type=int,
        help="""Length of the running median window (default 20001,
            recommended for bacteria).  For short genome (below 100000
            bases), we set this parameter to one fifth of the genome
            length .""",
        default=20001)
    group.add_argument("-k", "--mixture-models", dest="k", type=int,
        help="""Number of mixture models to use (default 2, although if
            sequencing depth is below 8, k is set to 1 automatically).
            To ignore that behaviour set k to the required value""",
        default=2)
    group.add_argument("-L", "--low-threshold", dest="low_threshold",
        default=None, type=float,
        help=("lower threshold (zscore) of the confidence interval. "
              "Overwrite value given by --threshold/-T"))
    group.add_argument("-H", "--high-threshold", dest="high_threshold",
        default=None, type=float,
        help=("higher threshold (zscore) of the confidence interval. "
              "Overwrite value given by --threshold/-T"))
    group.add_argument("-T", "--threshold", dest="threshold",
        default=4, type=float,
        help="""set lower and higher thresholds of the confidence
            interval.""")
    group.add_argument("-C", "--clustering-parameter",
        dest="double_threshold", default=0.5, type=float,
        help="""set lower and higher double threshold parameter (in
            [0,1]). Do not use value close to zero. Ideally, around 0.5.
            lower value will tend to cluster more than higher value""")

    group = self.add_argument_group("Large data related - CNV detection")
    group.add_argument("-s", "--chunk-size", dest="chunksize", type=int,
        default=5000000, min=1000000, action=Min,
        help="""Length of the chunk to be used for the analysis. """)
    group.add_argument("-B", "--binning", dest="binning", type=int,
        default=None, min=2, action=Min,
        help="""merge consecutive (non overlapping) data points, taking
            the mean. This is useful for large genome (e.g. human). This
            allows a faster computation, especially for CNV detection
            were only large windows are of interest. For instance, using
            a binning of 50 or 100 allows the human genome to be
            analysed.""")
    group.add_argument("--cnv-clustering", dest="cnv_clustering",
        default=-1, type=int,
        help="""Two consecutive ROIs are merged when their distance in
            bases is below this parameter. If set to -1, not used. """)

    # group facilities
    group = self.add_argument_group("Download reference")
    group.add_argument("--download-reference", dest="download_reference",
        default=None, type=str)
    group.add_argument("--download-genbank", dest="download_genbank",
        default=None, type=str)
    group.add_argument("--database", dest="database",
        default="ENA", type=str, choices=["ENA", "EUtils"],
        help="Download the reference from one of these database (default ENA)")
from sequana.scripts.tools import SequanaOptions from sequana import logger from easydev.console import purple class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass epilog = purple(""" ---- AUTHORS: Thomas Cokelaer Documentation: http://sequana.readthedocs.io Issues: http://github.com/sequana/sequana """) class Options(argparse.ArgumentParser, SequanaOptions): def __init__(self, prog="sequana_vcf_filter"): usage = """%s Only for VCF using mpileup version 4.1 for now\n""" % prog usage += """usage2: %s vcf_filter""" % prog usage += """Examples: sequana_vcf_filter --input test.vcf --quality 40 --filter "AF1>0.95&AF1<0.05" --filter "MQ<30"
def main(args=None):
    """Entry point of the ``bioconvert`` standalone.

    :param args: command-line arguments including the program name as
        first item; defaults to a copy of ``sys.argv``.

    The arguments are parsed, the logging level is set from
    ``--verbosity`` and ``analysis`` is called once per input file: once
    with the input as given, or, in batch mode (``-m``), once per file
    matching the input pattern.
    """
    if args is None:
        args = sys.argv[:]

    from easydev.console import purple

    if "-v" in args or "--verbosity" in args:
        print(purple("Welcome to bioconvert (bioconvert.readthedocs.io)"))

    arg_parser = argparse.ArgumentParser(prog="bioconvert",
                                         epilog=" ---- ",
                                         description="""Convertor infer the
                                         formats from the extension name. We do
                                         not scan the input file. Therefore
                                         users must ensure that their input
                                         format files are properly
                                         formatted.""",
                                         usage="""
    # convert fastq to fasta
    bioconvert test.fastq test.fasta

    # if input extension is not standard, use -i to specify it
    bioconvert test.FASTQ test.fasta -i fastq

    bioconvert test.fastq -o fasta

    # You may have several inputs, in which case wildcards are possible
    # Note, however, the quotes that are required
    bioconvert "test*.fastq" -o fasta

    # batch is also possible.
    bioconvert "test*.fastq" -o fasta -m

    Note the difference between the two previous commands !!

""")
    arg_parser.add_argument("input_file",
                            default=None,
                            help="The path to the file to convert.")
    arg_parser.add_argument("output_file", nargs="?",
                            default=None,
                            help="The path where the result will be stored.")
    arg_parser.add_argument("-f", "--formats",
                            action=ConvAction,
                            default=False,
                            help="Display available formats and exit.")
    arg_parser.add_argument(
        "-v", "--verbosity",
        default="INFO",
        help="Set the output verbosity. Should be one of DEBUG, INFO, "
             "WARNING, ERROR, CRITICAL")
    arg_parser.add_argument(
        "-i", "--input-format",
        default=None,
        help="Provide the input format. Check the --formats to see valid "
             "input name")
    arg_parser.add_argument(
        "-o", "--output-format",
        default=None,
        help="Provide the output format. Check the --formats to see valid "
             "output name")
    arg_parser.add_argument(
        "-x", "--threads",
        default=None,
        type=int,
        help="Number of threads. Depends on the underlying tool")
    arg_parser.add_argument("-m", "--batch",
                            default=False, action="store_true",
                            help="for batch effect")
    arg_parser.add_argument("-c", "--method",
                            default=None,
                            help="A converter may have several methods")
    arg_parser.add_argument(
        "-F", "--force",
        action="store_true",
        help="if outfile exists, it is overwritten with this option")
    arg_parser.add_argument("-s", "--show-methods",
                            default=False, action="store_true",
                            help="A converter may have several methods")
    arg_parser.add_argument("-b", "--benchmark",
                            default=False, action="store_true",
                            help="Running all available methods")
    arg_parser.add_argument("-N", "--benchmark-N",
                            default=5, type=int,
                            help="Number of trials for each methods")

    # Parse the arguments given to this function (program name skipped)
    # instead of silently falling back on sys.argv.
    args = arg_parser.parse_args(args[1:])

    # Set the logging level
    bioconvert.logger_set_level(args.verbosity)

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    import glob
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    for filename in filenames:
        args.input_file = filename
        analysis(args)
def purple(self, txt, force=False):
    """Print *txt* in purple if verbose mode is on or *force* is ``True``.

    :param txt: text handed to the module-level ``purple`` helper.
    :param force: print even when ``self.verbose`` is falsy; only the
        exact value ``True`` is honoured.
    """
    # Guard clause: nothing to do when silent and not forced
    if not self.verbose and force is not True:
        return
    print(purple(txt))
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    :param companies: a single company name (string), a list of company
        names, or None (default) to use ``self.companies``.
    :raises ValueError: if the list of companies is empty.

    For each company and each genomic-feature file, the results computed
    previously (``self.results``, or the ``results.csv`` file saved in
    the main directory) are restricted to the drugs that are public
    (``WEBRELEASE == 'Y'``) or owned by that company, and the HTML report
    is created in ``<company_directory>/<company>/<tcga>``.

    .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
        only WEBRELEASE=Y and owner.
    """
    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError(
            "Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving information built
    # in analyse() method, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(purple("\n=========== Analysing company %s out of %s (%s)" %
                     (ii + 1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown("  ------- building TCGA %s sub directory" % tcga))

            # Read the results previously computed, either from memory or,
            # failing that, from the file saved on disk. Only regular
            # errors trigger the fallback (no bare except).
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (
                    self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)

            # Filter the results to keep only public drugs and that
            # company. Make sure this is integers
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

            # .loc with a boolean mask replaces .ix (removed in pandas 1.0)
            mask = results.df["DRUG_ID"].isin(drug_decode_company.df.index)
            results.df = results.df.loc[mask]

            # We read the IC50 again
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analyse
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns whose label is a drug of this
            # company's DrugDecode. A boolean column mask replaces
            # DataFrame.select (removed in pandas 1.0).
            allowed = drug_decode_company.df.index
            to_keep = [drug in allowed for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, to_keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = (self.company_directory + os.sep +
                                     company + os.sep + tcga)
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
def anova_pipeline(args=None):
    """This function is used by the standalone application called
    **gdsctools_anova**

    Type::

        gdsctools_anova --help

    to get some help.

    :param args: command-line arguments including the program name as
        first item; defaults to a copy of ``sys.argv``. When a list with
        the program name only is given, ``--help`` is assumed. The list
        given by the caller is not modified.

    Informational options (version, testing, save-settings, license,
    summary, print-tissues, print-drugs, print-features) are handled first
    and return immediately. Otherwise the analysis is dispatched according
    to ``--drug`` and ``--feature``. The process exits with status 1 when
    the requested drug is not a column of the IC50 data.
    """
    msg = "Welcome to GDSCTools standalone"
    print_color(msg, purple, underline=True)

    # Keep the argument args as None by default to
    # allow testing e.g., in nosetests
    if args is None:
        args = sys.argv[:]
    elif len(args) == 1:
        # build a new list rather than extending the caller's one in place
        args = list(args) + ['--help']

    user_options = ANOVAOptions(prog="gdsctools_anova")
    try:
        options = user_options.parse_args(args[1:])
    except SystemExit:
        return

    # -----------------------------------------------------------------
    # ---------------------------------------- options without analysis
    # -----------------------------------------------------------------

    if options.version is True:
        print("This is version %s of gdsctools_anova" % gdsctools.version)
        return

    if options.testing is True:
        print('Testing mode:')
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test)
        df = an.anova_one_drug_one_feature('Drug_1047_IC50', 'TP53_mut')

        assert df.loc[1, 'N_FEATURE_pos'] == 554, \
            "N_feature_pos must be equal to 554"
        print(df.T)
        print(darkgreen("\nGDSCTools seems to be installed properly"))
        return

    if options.save_settings:
        from gdsctools import ANOVA, ic50_test
        an = ANOVA(ic50_test)
        an.settings.to_json(options.save_settings)
        print('Save a default parameter set in %s' % options.save_settings)
        return

    if options.license is True:
        print(gdsctools.license)
        return

    if options.summary is True:
        from gdsctools import anova
        an = anova.ANOVA(options.input_ic50, options.input_features)
        print(an)
        return

    if options.print_tissues is True:
        from gdsctools import anova
        an = anova.ANOVA(options.input_ic50, options.input_features)

        tissues = an.tissue_factor
        # Two sorting spellings are tried in turn; only regular errors
        # trigger the fallback (no bare except).
        try:
            tissues = tissues.sort_values('Tissue Factor').unique()
        except Exception:
            tissues = tissues.sort(inplace=False).unique()
        for name in tissues:
            print(name)
        return

    if options.print_drugs is True:
        from gdsctools import anova
        gdsc = anova.ANOVA(options.input_ic50, options.input_features)
        import textwrap
        print("\n".join(textwrap.wrap(" , ".join(gdsc.drugIds))))
        return

    if options.print_features is True:
        from gdsctools import anova
        gdsc = anova.ANOVA(options.input_ic50, options.input_features)
        import textwrap
        print("\n".join(textwrap.wrap(" , ".join(gdsc.feature_names))))
        return

    # -----------------------------------------------------------------
    # --------------------------------------------------- real analysis
    # -----------------------------------------------------------------
    # dispatcher to the functions according to the user parameters
    from gdsctools import ANOVA
    anova = ANOVA(options.input_ic50, options.input_features,
                  options.input_drug,
                  low_memory=not options.fast)
    anova = _set_settings(anova, options)

    if options.drug and options.drug not in anova.ic50.df.columns:
        print(red("Invalid Drug. Try --print-drug-names"))
        sys.exit(1)

    if options.drug is not None and options.feature is not None:
        print_color("ODOF mode", purple)
        anova_one_drug_one_feature(anova, options)
    elif options.drug is not None:
        print_color("ODAF mode", purple)
        anova_one_drug(anova, options)
    else:  # analyse everything
        if options.feature is None:
            print_color("ADAF mode", purple)
        else:
            print_color("ADOF mode", purple)
        anova_all(anova, options)

    if options.onweb is False and options.no_html is False:
        msg = "\nNote that a directory {} was created and files saved into it"
        print(purple(msg.format(options.directory)))

    return
def main(args=None):
    """Command-line entry point of bioconvert.

    The input and output formats are inferred from the file name
    extensions (the input one can be overridden with ``--input-format``),
    the matching converter class is looked up in a ``Registry`` instance,
    instantiated with the two file names and called.

    :param list args: command line laid out like ``sys.argv`` (program
        name first). If None, a copy of ``sys.argv`` is used.
    :raises RuntimeError: if the input or output extension is empty.

    The process exits with status 1 when no converter is registered for
    the (input extension, output extension) pair.
    """
    if args is None:
        args = sys.argv[:]

    from easydev.console import purple, underline
    print(purple("Welcome to bioconvert (bioconvert.readthedocs.io)"))

    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        epilog=" ---- ",
        description=("DESCRIPTION: Convertor infer the formats from the "
                     "extension name. We do not scan the input file. "
                     "Therefore users must ensure that their input format "
                     "files are properly formatted. "))
    arg_parser.add_argument("input_file",
                            help="The path to the file to convert.")
    arg_parser.add_argument("output_file",
                            help="The path where the result will be stored.")
    arg_parser.add_argument("-f", "--formats",
                            action=ConvAction,
                            default=False,
                            help="Display available formats and exit.")
    arg_parser.add_argument("-v", "--verbosity",
                            action="count",
                            default=0,
                            help="Set the outpout verbosity.")
    arg_parser.add_argument("-x", "--input-format",
                            default=None,
                            help="Provide the input format. Check the --formats to see valid input name")

    # Parse the list received (or defaulted) above. The former code called
    # parse_args() without argument, which silently ignored ``args``.
    args = arg_parser.parse_args(args[1:])

    # Set the logging level: each -v lowers the threshold by 10, from 30
    # down to a floor of 10
    args.verbosity = max(10, 30 - (10 * args.verbosity))
    bioconvert.logger_set_level(args.verbosity)
    _log = colorlog.getLogger('bioconvert')

    mapper = Registry()

    infile = args.input_file
    outfile = args.output_file

    # Users may provide information about the input file.
    # Indeed, the input may be a FastQ file but with an extension
    # that is not standard. For instance fq instead of fastq
    # If so, we can use the --input-format fastq to overwrite the
    # provided filename extension
    inext = os.path.splitext(infile)[-1]
    outext = os.path.splitext(outfile)[-1]

    if args.input_format:
        inext = args.input_format
        # Registry keys are compared with dotted extensions (as returned
        # by os.path.splitext), so add the leading dot if missing
        if not inext.startswith("."):
            inext = "." + inext

    if not inext:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the input file name or use --input-format option.")

    if not outext:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the output file name.")

    # From the input parameters 1 and 2, we get the converter class
    try:
        _log.info("Input: {}".format(inext))
        _log.info("Output: {}".format(outext))
        class_converter = mapper[(inext, outext)]
    except KeyError:
        print(mapper)
        print(inext)
        print(outext)

        # The requested conversion is not registered: tell the user
        msg = "Request input format ({}) to output format (({}) is not available in converters"
        _log.critical(msg.format(inext, outext))
        _log.critical("Use --formats to know the available formats")
        sys.exit(1)

    # The converter exists: create the instance and call it
    _log.info("Converting from {} to {}".format(inext, outext))
    convert = class_converter(infile, outfile)
    convert()
    _log.info("Done")
def main(args=None):
    """Command-line entry point of the biokit converter.

    Expected command line: program name, input file, output file, then
    the options understood by ``Options``. The converter module name is
    built as ``<input extension>2<output extension>`` and must be a key of
    the ``MapperRegistry`` instance; the corresponding module is imported
    from ``biokit.converters`` and the class named by the registry is
    instantiated with the two file names and called.

    :param list args: command line laid out like ``sys.argv``; a copy of
        ``sys.argv`` is used when None.

    The process exits with status 0 after listing the available mappings
    and with status 1 when the requested conversion is unknown.
    """
    from easydev.console import purple, underline

    print(purple("Welcome to biokit converter (biokit.readthedocs.io)"))
    mapper = MapperRegistry()

    argv = sys.argv[:] if args is None else args
    parser = Options(prog="converter")

    # Listing of the available conversions
    # NOTE(review): sys.exit(0) is assumed to sit inside the
    # ``if options.format`` branch -- confirm against the original layout
    if "-f" in argv or "--formats" in argv:
        options = parser.parse_args(argv[1:])
        if options.format:
            print("Available mapping:")
            print("==================")
            for name in sorted(mapper):
                print("{}: {}".format(name, mapper[name]))
            sys.exit(0)

    # With fewer than two file names, fall back on the help message
    if len(argv) >= 3:
        infile, outfile = argv[1], argv[2]
        options = parser.parse_args(argv[3:])
    else:
        parser.parse_args(["prog", "--help"])

    # Set the logging level
    biokit_debug_level(options.logging_level)

    # The extensions (without the leading dot) give the formats, unless
    # the user overrides the input one, e.g. for a FastQ file named *.fq
    outext = os.path.splitext(outfile)[-1][1:]
    if options.input_format:
        inext = options.input_format
    else:
        inext = os.path.splitext(infile)[-1][1:]

    # Name of the converter module, e.g. <input>2<output>
    module_name = inext + "2" + outext

    # Unknown conversion: report and stop
    if module_name not in mapper.keys():
        msg = "Request input format ({}) to output format (({}) is not available in converters"
        logger.critical(msg.format(inext, outext))
        logger.critical("Use --formats to know the available formats")
        sys.exit(1)

    # Known conversion: import the module dynamically, fetch the class
    # registered under that name, build the instance and call it
    logger.info("Converting from {} to {}".format(inext, outext))
    module = importlib.import_module(
        "biokit.converters.{}".format(module_name))
    converter_class = getattr(module, mapper[module_name])
    converter = converter_class(infile, outfile)
    converter()
    logger.info("Done")
def create_data_packages_for_companies(self, companies=None):
    """Creates a data package for each company found in the DrugDecode file

    For every company and every genomic feature file of
    ``self.gf_filenames``, the results computed previously are restricted
    to the drugs that are public (``WEBRELEASE=='Y'``) or owned by that
    company, and an HTML report is written in
    ``<company_directory>/<company>/<tcga>``.

    :param companies: one company name (string), a list of names, or None
        to use ``self.companies``.
    :raises ValueError: if the list of companies is empty.

    .. warning:: DRUG_DECODE and IC50 inputs must be filtered to keep
        only WEBRELEASE=Y and owner.

    Side effects visible here: ``self.ic50`` is re-read from
    ``self.ic50_filename`` and ``self.report`` is replaced at each
    iteration.
    """
    # companies must be just one name (one string) or a list of strings
    # By default, takes all companies found in DrugDecode
    if isinstance(companies, str):
        companies = [companies]

    if companies is None:
        companies = self.companies

    if len(companies) == 0:
        raise ValueError("Could not find any companies in the DrugDecode file")

    # The main directory
    self.mkdir(self.company_directory)

    # Loop over all companies, retrieving information built
    # in analyse() method, selecting for each TCGA all information
    # for that company only (and public drugs)
    Ncomp = len(companies)
    for ii, company in enumerate(companies):
        print(purple("\n=========== Analysing company %s out of %s (%s)" %
                     (ii+1, Ncomp, company)))
        self.mkdir(self.company_directory + os.sep + company)

        # Handle each TCGA case separately
        for gf_filename in sorted(self.gf_filenames):
            # The TCGA name is the second "_"-separated field of the
            # file name, without what follows the first dot
            tcga = gf_filename.split("_")[1].split('.')[0]
            print(brown("  ------- building TCGA %s sub directory" % tcga))

            # Read the results previously computed, either from memory
            # or, failing that, from the results file saved on disk
            try:
                results_df = self.results[tcga].df.copy()
            except Exception:
                results_path = "%s/%s/OUTPUT/results.csv" % (self.main_directory, tcga)
                results_df = ANOVAResults(results_path)

            # Make sure the results are formatted correctly
            results = ANOVAResults(results_df)

            # Get the DrugDecode information for that company only
            drug_decode_company = self.drug_decode.df.query(
                "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)

            # Transform into a proper DrugDecode class for safety
            drug_decode_company = DrugDecode(drug_decode_company)
            allowed_drugs = drug_decode_company.df.index

            # Filter the results to keep only public drugs and that
            # company. Make sure this is integers
            results.df["DRUG_ID"] = results.df["DRUG_ID"].astype(int)

            # Boolean selection through .loc (DataFrame.ix was removed
            # from pandas)
            mask = [x in allowed_drugs for x in results.df.DRUG_ID]
            results.df = results.df.loc[mask]

            # We read the IC50 again
            try:
                self.ic50 = IC50(self.ic50_filename)
            except Exception:
                self.ic50 = IC50Cluster(self.ic50_filename, verbose=False)

            # And create an ANOVA instance. This is not to do the analyse
            # again but to hold various information
            an = ANOVA(self.ic50, gf_filename, drug_decode_company,
                       verbose=False)

            # Keep only the IC50 columns of the allowed drugs
            # (DataFrame.select was removed from pandas)
            to_keep = [drug in allowed_drugs for drug in an.ic50.df.columns]
            an.ic50.df = an.ic50.df.loc[:, to_keep]

            an.settings = ANOVASettings(**self.settings)
            an.init()
            an.settings.directory = self.company_directory + os.sep + company + os.sep + tcga
            an.settings.analysis_type = tcga

            # Now we create the report
            self.report = ANOVAReport(an, results,
                                      drug_decode=drug_decode_company,
                                      verbose=self.verbose)
            self.report.company = company
            self.report.settings.analysis_type = tcga
            self.report.create_html_main(False)
            self.report.create_html_manova(False)
            self.report.create_html_features()
            self.report.create_html_drugs()
            self.report.create_html_associations()
verbo_nb = sum([1 for opt in sys.argv if opt.startswith('--verb')]) verbosity = v_nb + verbo_nb bioconvert.logger_set_level(max(10, 30 - (10 * verbosity))) mapper = Registry() print("Available mapping:") print("==================") for k in sorted(mapper.get_conversions()): print("{} -> {}".format(k[0], k[1])) sys.exit(0) if __name__ == "__main__": from easydev.console import purple, underline print(purple("Welcome to bioconvert (bioconvert.readthedocs.io)")) arg_parser = argparse.ArgumentParser(prog="converter", epilog=" ---- ", description="""DESCRIPTION: Convertor infer the formats from the extension name. We do not scan the input file. Therefore users must ensure that their input format files are properly formatted. """) arg_parser.add_argument("input_file", help="The path to the file to convert.") arg_parser.add_argument("output_file", help="The path where the result will be stored.")
def run(self, color=True):
    """Executes 'python setup.py' with the user commands on all packages.

    Each directory of ``self.packages`` is entered in turn and every
    command of ``self.commands`` is run as ``<python> setup.py <command>``
    through a shell, using the current interpreter (``sys.executable``).
    When ``self.verbose`` is set the command output is not captured;
    otherwise it is captured and only selected lines are echoed for the
    ``pylint`` and ``nosetests`` commands. A failing command stops the
    whole run (``sys.exit()``) unless ``self.force`` is set. The working
    directory is set back to ``self.curdir`` after each package and at
    the end.

    :param bool color: if True, try to colourise the messages with the
        console helpers; plain text is used if False or if the helpers
        cannot be imported.
    """
    use_color = bool(color)
    if use_color:
        try:
            from easydev.console import bold, red, green, \
                color_terminal, nocolor, underline, purple
        except Exception:
            try:
                sys.path.insert(0, os.path.join('deploy', 'src', 'deploy'))
                from console import bold, red, green, \
                    color_terminal, nocolor, underline, purple
            except Exception:
                # No console helper available: degrade to plain text
                # instead of failing later on undefined names
                use_color = False

    if use_color:
        if not color_terminal():
            # Windows' poor cmd box doesn't understand ANSI sequences
            nocolor()
    else:
        bold = purple = red = green = underline = str

    print(bold("Running multisetup version %s" % __revision__.split()[2]))

    directories = [package for package in self.packages]

    print('Will process the following directories: ')
    for directory in directories:
        print(bold(directory))
    print('')

    try:
        for directory in directories:
            try:
                os.chdir(directory)
                print(underline('Entering %s package'
                                % os.path.basename(directory)))
            except OSError as err:
                # NOTE(review): the commands below are still executed from
                # the current directory when chdir fails -- confirm that
                # this is intended
                print(underline('Entering %s package'
                                % os.path.basename(directory)))
                print(red("cannot find this directory (%s)"
                          % os.path.basename(directory)))
                print(err)

            print('Python exec : ', sys.executable)

            for cmd in self.commands:
                setup_command = '%s setup.py %s ' % (sys.executable, cmd)
                print("\tExecuting " + setup_command + '...processing')

                # Run setup.py with user commands
                outputs = None
                errors = None
                if self.verbose:
                    process = Popen(setup_command, shell=True)
                    process.wait()
                else:
                    # Text mode so that the captured streams are str and
                    # can be split on '\n' below (bytes otherwise on
                    # Python 3)
                    process = Popen(setup_command, stdout=PIPE,
                                    stderr=PIPE, shell=True,
                                    universal_newlines=True)
                    outputs, errors = process.communicate()

                if process.returncode == 0:
                    print(green('done'))
                else:
                    if not self.verbose:
                        print(red('\tFailed. ( error code %s) ' %
                                  (process.returncode)))
                        os.chdir(self.curdir)

                    if not self.force:
                        raise RuntimeError()

                if 'pylint' in cmd:
                    if outputs is not None:
                        for x in outputs.split('\n'):
                            if x.startswith('Your code has been'):
                                print(purple('\t%s' % x))

                if 'nosetests' in cmd:
                    # NOTE(review): the ``else`` is assumed to pair with
                    # ``if errors is not None`` -- confirm against the
                    # original layout
                    if errors is not None:
                        for x in errors.split('\n'):
                            if x.startswith('TOTAL'):
                                res = x.replace('TOTAL', 'Total coverage')
                                res = " ".join(res.split())
                                print(purple('\t%s' % res))
                            if x.startswith('Ran'):
                                print(purple('\t%s' % x))
                            if x.startswith('FAILED'):
                                print(purple('\t%s' % x))
                    else:
                        print(purple('all right'))

            os.chdir(self.curdir)

    except RuntimeError:
        # Raised above when a command failed and self.force is not set
        sys.exit()

    os.chdir(self.curdir)
from sequana import FastQ from sequana import logger import colorlog logger = colorlob.getLogger(__name__) class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass epilog = purple(""" ---- AUTHORS: Thomas Cokelaer Documentation: http://sequana.readthedocs.io Issues: http://github.com/sequana/sequana """) class Options(argparse.ArgumentParser, SequanaOptions): def __init__(self, prog="sequana_substractor"): usage = """%s reads (flag 256+4) saving the mapped reads in a file, and the unmapped in another file\n""" % prog usage += """usage2: %s --input test.fastq --reference Phix174.fa\n""" % prog usage += """ """ super(Options, self).__init__(usage=usage, prog=prog,
def sequana_init(options):
    """Initialise a pipeline working directory from the parsed *options*.

    In ``options.target_dir`` this function writes:

    - the snakefile of the pipeline, as ``<pipeline>.rules``;
    - a ``README`` recalling the user command and how to run snakemake;
    - ``config.yaml`` (copied from ``options.config`` if given, taken
      from the pipeline module otherwise), then updated with the input
      options, the pipeline-specific options and ``--config-params``;
    - ``multiqc_config.yaml`` and ``cluster_config.json`` through
      ``copy_config_from_sequana``;
    - an executable ``runme.sh`` script calling snakemake.

    :param options: parsed command-line options (attributes are read
        directly, e.g. ``pipeline``, ``target_dir``, ``force``).
    :raises IOError: if ``options.config`` is set but does not exist.
    :raises ValueError: if an item of ``options.config_params`` does not
        contain between 1 and 3 ``:`` signs.

    If the target directory exists and ``options.force`` is not True, a
    confirmation is requested; answering ``n`` exits with status 0.
    """
    import sequana
    from sequana import SequanaConfig

    sa = Tools(verbose=options.verbose)

    # Check that the pipeline is well defined
    module = Module(options.pipeline)

    if os.path.exists(options.target_dir):
        txt = "Will override the following files if present: %s.rules " +\
              "config.yaml, runme.sh, ..."
        sa.blue(txt % options.pipeline)

        if options.force is True:
            choice = "y"
        else:
            choice = input(
                red("Do you want to proceed (to avoid this " +
                    " message, use --force)? [y]/n:"))
        if choice == "n":
            sys.exit(0)

    # Copying snakefile
    logger.info("Copying snakefile")
    sa.mkdir(options.target_dir)
    shutil.copy(module.snakefile,
                options.target_dir + os.sep + options.pipeline + ".rules")

    # Creating README to print on the screen and in a file
    txt = "User command::\n\n"
    txt += " %s \n\n" % " ".join(sys.argv)
    txt += "You can now run snakemake yourself or type::"
    txt += purple(" snakemake -s %s.rules --stats stats.txt -p -j 4 "
                  % options.pipeline)
    txt += (" # -j 4 means you will use 4 cores"
            " # -p prints the commands used"
            " # --stats stats.txt must be used since stats.txt is"
            " expected to be found."
            " or just run the bash script::"
            " sh runme.sh"
            " EDIT THE config.yaml if needed"
            " Once finished with success, the report/ directory contains"
            " a summary.html and relevant files (depends on the"
            " pipeline). ")

    logger.info("Creating README")
    with open(options.target_dir + os.sep + "README", "w") as fh:
        # The two escape sequences removed here are presumably the ones
        # added by purple() above -- TODO confirm
        fh.write(txt.replace("\x1b[35m", "").replace("\x1b[39;49;00m", ""))

    # Creating Config file
    logger.info("Creating the config file")

    # Create (if needed) and update the config file
    config_filename = options.target_dir + os.sep + "config.yaml"

    if options.config:
        # full existing path
        if os.path.exists(options.config):
            shutil.copy(options.config, config_filename)
        else:
            raise IOError("Config file %s not found locally" % options.config)
    else:
        copy_config_from_sequana(module, "config.yaml", config_filename)

    # Copy multiqc if it is available
    multiqc_filename = options.target_dir + os.sep + "multiqc_config.yaml"
    copy_config_from_sequana(module, "multiqc_config.yaml", multiqc_filename)
    cluster_cfg_filename = options.target_dir + os.sep + "cluster_config.json"
    copy_config_from_sequana(module, "cluster_config.json",
                             cluster_cfg_filename)

    # The input
    cfg = SequanaConfig(config_filename)
    cfg.config.input_directory = options.input_directory
    cfg.config.input_pattern = options.pattern
    cfg.config.input_extension = options.extension
    cfg.config.input_samples.file1 = options.file1
    cfg.config.input_samples.file2 = options.file2
    cfg.config.input_readtag = options.input_readtag

    # Dedicated section for quality control section
    if options.pipeline == "quality_control":
        if options.design:
            shutil.copy(options.design, options.target_dir + os.sep)
            cfg.config['cutadapt'].design_file = os.path.basename(
                options.design)

        if options.kraken:
            cfg.config.kraken.database_directory = os.path.abspath(
                options.kraken)
            cfg.config.kraken.do = True
        else:
            cfg.config.kraken.do = False

        cfg.config['cutadapt'].fwd = options.adapter_fwd
        cfg.config['cutadapt'].rev = options.adapter_rev
        cfg.config['cutadapt'].adapter_type = options.adapters

        # For all pipeline using BWA
        # NOTE(review): assumed to belong to the quality_control branch --
        # confirm against the original layout
        if options.reference:
            cfg.config.bwa_mem.reference = os.path.abspath(options.reference)

    if options.pipeline == "variant_calling":
        if options.reference:
            cfg.config.bwa_mem_ref.reference = os.path.abspath(
                options.reference)

    if options.pipeline in ["rnaseq", "smallrnaseq"]:
        if options.design:
            shutil.copy(options.design, options.target_dir + os.sep)
            cfg.config['cutadapt'].design_file = os.path.basename(
                options.design)
        cfg.config['cutadapt'].fwd = options.adapter_fwd
        cfg.config['cutadapt'].rev = options.adapter_rev
        cfg.config['cutadapt'].adapter_choice = options.adapters

    cfg.copy_requirements(target=options.target_dir)

    # Items of --config-params are comma separated; each one is a path of
    # 1 to 3 keys followed by the value, all separated by ":"
    if options.config_params:
        params = [this.strip() for this in options.config_params.split(",")]
        for param in params:
            fields = param.split(":")
            if len(fields) not in [2, 3, 4]:
                txt = "incorrect format following --config-params. "
                txt += "Expected at least one : sign and at most 3 of them. "
                txt += "Config file section such as :\n"
                txt += "project: tutorial\n"
                txt += "should be encoded project:tutorial"
                raise ValueError(txt)

            # Walk down the nested sections, then set the last key
            section = cfg.config
            for key in fields[:-2]:
                section = section[key]
            section[fields[-2]] = fields[-1]

    # important to update yaml with content of config
    cfg._update_yaml()
    cfg.save(config_filename)

    # Creating a unique runme.sh file
    runme_filename = options.target_dir + os.sep + "runme.sh"
    with open(runme_filename, "w") as fout:
        cmd = "#!/bin/sh\n"
        cmd += "# generated with sequana version %s with this command:\n" % sequana.version
        cmd += "# %s\n" % " ".join(sys.argv)
        # Substitute the placeholders of this line only. Formatting the
        # whole script at the end would fail on any '%' present in the
        # user command line or in the cluster command
        cmd += "snakemake -s %(project)s.rules --stats stats.txt -p -j %(jobs)s --nolock" % {
            'project': options.pipeline,
            'jobs': options.jobs,
        }

        if options.forceall:
            cmd += " --forceall "

        if options.cluster:
            # Do we want to include the cluster config option ?
            cluster_config = Module(options.pipeline).cluster_config
            if options.ignore_cluster_config is True:
                cluster_config = None

            if cluster_config is None:
                cmd += ' --cluster "%s"' % options.cluster
            else:
                cmd += ' --cluster "%s" --cluster-config %s' %\
                    (options.cluster, os.path.basename(cluster_config))

        if options.redirection:
            cmd += " 1>run.out 2>run.err"

        fout.write(cmd)

    # Make runme.sh executable (mode 755). A single call replaces the two
    # successive chmod calls of the former code, whose net effect was 755
    os.chmod(runme_filename, 0o755)

    sa.green("Initialisation of %s succeeded" % options.target_dir)
    sa.green("Please, go to the project directory ")
    sa.purple("\n cd %s\n" % options.target_dir)
    sa.green("Check out the README and config.yaml files")
    sa.green("A basic script to run the analysis is named runme.sh ")
    sa.purple("\n sh runme.sh\n")
    sa.purple("On a slurm cluster, you may type:")
    sa.purple("\n srun --qos normal runme.sh\n")
    sa.green("In case of trouble, please post an issue on https://github.com/sequana/sequana/issue ")
    sa.green("or type sequana --issue and fill a post with the error and the config file (NO DATA PLEASE)")
def run(self, color=True):
    """Executes 'python setup.py' with the user commands on all packages.

    Each directory of ``self.packages`` is entered in turn and every
    command of ``self.commands`` is run as ``<python> setup.py <command>``
    through a shell, using the current interpreter (``sys.executable``).
    When ``self.verbose`` is set the command output is not captured;
    otherwise it is captured and only selected lines are echoed for the
    ``pylint`` and ``nosetests`` commands. A failing command stops the
    whole run (``sys.exit()``) unless ``self.force`` is set. The working
    directory is set back to ``self.curdir`` after each package and at
    the end.

    :param bool color: if True, try to colourise the messages with the
        console helpers; plain text is used if False or if the helpers
        cannot be imported.
    """
    use_color = bool(color)
    if use_color:
        try:
            from easydev.console import bold, red, green, \
                color_terminal, nocolor, underline, purple
        except Exception:
            try:
                sys.path.insert(0, os.path.join('deploy', 'src', 'deploy'))
                from console import bold, red, green, \
                    color_terminal, nocolor, underline, purple
            except Exception:
                # No console helper available: degrade to plain text
                # instead of failing later on undefined names
                use_color = False

    if use_color:
        if not color_terminal():
            # Windows' poor cmd box doesn't understand ANSI sequences
            nocolor()
    else:
        bold = purple = red = green = underline = str

    print(bold("Running multisetup version %s" % __revision__.split()[2]))

    directories = [package for package in self.packages]

    print('Will process the following directories: ')
    for directory in directories:
        print(bold(directory))
    print('')

    try:
        for directory in directories:
            try:
                os.chdir(directory)
                print(underline('Entering %s package'
                                % os.path.basename(directory)))
            except OSError as err:
                # NOTE(review): the commands below are still executed from
                # the current directory when chdir fails -- confirm that
                # this is intended
                print(underline('Entering %s package'
                                % os.path.basename(directory)))
                print(red("cannot find this directory (%s)"
                          % os.path.basename(directory)))
                print(err)

            print('Python exec : ', sys.executable)

            for cmd in self.commands:
                setup_command = '%s setup.py %s ' % (sys.executable, cmd)
                print("\tExecuting " + setup_command + '...processing')

                # Run setup.py with user commands
                outputs = None
                errors = None
                if self.verbose:
                    process = Popen(setup_command, shell=True)
                    process.wait()
                else:
                    # Text mode so that the captured streams are str and
                    # can be split on '\n' below (bytes otherwise on
                    # Python 3)
                    process = Popen(setup_command, stdout=PIPE,
                                    stderr=PIPE, shell=True,
                                    universal_newlines=True)
                    outputs, errors = process.communicate()

                if process.returncode == 0:
                    print(green('done'))
                else:
                    if not self.verbose:
                        print(red('\tFailed. ( error code %s) ' %
                                  (process.returncode)))
                        os.chdir(self.curdir)

                    if not self.force:
                        raise RuntimeError()

                if 'pylint' in cmd:
                    if outputs is not None:
                        for x in outputs.split('\n'):
                            if x.startswith('Your code has been'):
                                print(purple('\t%s' % x))

                if 'nosetests' in cmd:
                    # NOTE(review): the ``else`` is assumed to pair with
                    # ``if errors is not None`` -- confirm against the
                    # original layout
                    if errors is not None:
                        for x in errors.split('\n'):
                            if x.startswith('TOTAL'):
                                res = x.replace('TOTAL', 'Total coverage')
                                res = " ".join(res.split())
                                print(purple('\t%s' % res))
                            if x.startswith('Ran'):
                                print(purple('\t%s' % x))
                            if x.startswith('FAILED'):
                                print(purple('\t%s' % x))
                    else:
                        print(purple('all right'))

            os.chdir(self.curdir)

    except RuntimeError:
        # Raised above when a command failed and self.force is not set
        sys.exit()

    os.chdir(self.curdir)