Exemplo n.º 1
0
 def _set_config(self):
     """Create the BUSCO configuration for this sample and save it.

     Writes ``config.ini`` into ``self.outpath`` (created if missing) and
     records the file path in ``self.config_filename``.
     """
     from easydev import mkdirs  # imported lazily, only when this method runs

     cfg = BuscoConfig(
         self.species,
         sample_name=self.sample_name,
         outpath=self.outpath,
         conda_bin_path=self.conda_bin_path,
         Rscript_bin_path="",  # not required by our analysis
         tmp_path="./tmp_{}".format(self.sample_name),
     )
     mkdirs(self.outpath)
     self.config_filename = self.outpath + "/config.ini"
     cfg.save_config_file(self.config_filename)
Exemplo n.º 2
0
    def fetch_ic50s(self):
        """Download the IC50 table (TableS4A.xlsx) and format it.

        The file is stored as ``ic50s.xlsx`` inside
        ``self.data_folder_name`` (created if it does not exist), then
        ``self._format_data()`` is called.
        """
        # os.makedirs with exist_ok=True replaces the former
        # "if exists: pass / else: easydev.mkdirs(...)" construct.
        os.makedirs(self.data_folder_name, exist_ok=True)

        # NOTE(review): data_folder_name is concatenated without an explicit
        # separator -- presumably it already ends with "/"; verify upstream.
        urllib.request.urlretrieve(self.url_base + "TableS4A.xlsx",
                                   self.data_folder_name + "ic50s.xlsx")

        self._format_data()
Exemplo n.º 3
0
    def __init__(self):
        """Set up the DAMONA directory tree rooted at ``$DAMONA_PATH``.

        Exits with status 1 if the ``DAMONA_PATH`` environment variable
        is not defined.
        """
        if "DAMONA_PATH" not in os.environ:
            logger.error("DAMONA_PATH not found in your environment. You must define "
                "it. In this shell, type 'export DAMONA_PATH=PATH_WHERE_TO_PLACE_DAMONA'")
            sys.exit(1)

        self.damona_path = pathlib.Path(os.environ["DAMONA_PATH"])
        # Create the root directory plus the standard sub-directories.
        easydev.mkdirs(self.damona_path)
        for parts in (("envs",), ("images",), ("images", "damona_buffer"), ("bin",)):
            easydev.mkdirs(self.damona_path.joinpath(*parts))
Exemplo n.º 4
0
 def _set_config(self):
     """Build a :class:`BuscoConfig` for this sample and write it out.

     The output directory is created if needed and the path of the saved
     ``config.ini`` is stored in ``self.config_filename``.
     """
     config = BuscoConfig(
         self.species,
         sample_name=self.sample_name,
         outpath=self.outpath,
         conda_bin_path=self.conda_bin_path,
         Rscript_bin_path="", # not required by our analysis
         tmp_path="./tmp_{}".format(self.sample_name)
     )
     # imported lazily, only when this method actually runs
     from easydev import mkdirs
     mkdirs(self.outpath)
     self.config_filename = self.outpath + "/config.ini"
     config.save_config_file(self.config_filename)
Exemplo n.º 5
0
def main(args=None):
    """Entry point of the sequana_bam_splitter standalone.

    :param args: command-line arguments; defaults to a copy of ``sys.argv``.

    Splits the input SAM/BAM/CRAM into matched/unmatched reads and logs
    the resulting counts.
    """
    if args is None:
        args = sys.argv[:]

    print(purple("Welcome to sequana_bam_splitter"))
    user_options = Options(prog="sequana_bam_splitter")
    if len(args) == 1:
        args.append("--help")

    if "--version" in sys.argv:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])

    # set the logging level
    logger.level = options.level
    logger.info("This SAM/BAM/CRAM splitter is used for paired or un-paired "
                "reads with perfectly mapped or unmapped reads (flags 0, 4, "
                "16). Others are dropped.")

    logger.info("Reading {}".format(options.input))

    # What prefix used for the output filename ?
    if options.prefix is None:
        # Strip a trailing ".bam" extension. The previous code used
        # str.rstrip(".bam"), which removes any of the characters
        # '.', 'b', 'a', 'm' from the right end (e.g. "samba.bam" ->
        # "s"), and then clobbered the result with a leftover debug
        # assignment (prefix = "test"); both are fixed here.
        prefix = options.input
        if prefix.endswith(".bam"):
            prefix = prefix[:-len(".bam")]
    else:
        prefix = options.prefix

    if options.outdir:
        prefix = options.outdir + os.sep + prefix
        if not os.path.exists(options.outdir):
            from easydev import mkdirs
            logger.info("Creating {} directory".format(options.outdir))
            mkdirs(options.outdir)

    match, unmatch, flags = _main(options.input,
                                  prefix,
                                  keep_unmapped=options.keep_unmapped)

    logger.info("Matched: {}".format(match))
    logger.info("Unmatched (flag 4 and 256): {}".format(unmatch))
    logger.info("All flags: {}".format(Counter(flags)))
Exemplo n.º 6
0
def krakendb():
    """Download the kraken toy database.

    Tries the sequana taxonomy entry point first. On travis, appdirs
    returns None instead of the expected user config path, raising a
    TypeError; in that case the database files are fetched manually
    into ``$HOME/database``. A SystemExit from the entry point is
    deliberately ignored.
    """
    try:
        taxonomy.main([prog, '--download', 'toydb'])
    except TypeError:  # Fails on travis so we download manually (appdirs returns
        # none instead of the expected user config path
        from sequana.misc import wget
        from easydev import mkdirs

        HOME = os.getenv('HOME')
        baseurl = "https://github.com/sequana/data/raw/master/kraken_toydb/"
        # Create the target directory once, instead of on every loop
        # iteration as before (the call and imports were loop-invariant).
        mkdirs(HOME + os.sep + "database/taxonomy")
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]
        for filename in filenames:
            wget(baseurl + os.sep + filename,
                 os.sep.join([HOME, "database", filename]))
    except SystemExit:
        pass
Exemplo n.º 7
0
def run_analysis(chrom, options, feature_dict):
    """Run the coverage analysis of a single chromosome.

    Computes the running median and z-scores, extracts the regions of
    interest (ROIs) outside the double thresholds and saves them to
    ``<output_directory>/coverage_reports/<chrom_name>/rois.csv``, then
    prints evenness/centralness metrics when verbose.

    :param chrom: chromosome coverage object (provides running_median,
        compute_zscore, get_centralness, ...)
    :param options: parsed command-line options
    :param feature_dict: optional mapping of chromosome name to a
        feature list used to annotate the ROIs
    """
    if options.verbose:
        print(chrom)

    if options.verbose:
        logger.info('Computing running median (w=%s)' % options.w_median)

    # compute running median
    chrom.running_median(n=options.w_median, circular=options.circular)

    stats = chrom.get_stats(output="dataframe")
    stats.set_index("name", inplace=True)

    # .loc replaces the deprecated .ix accessor (removed in pandas 1.0)
    DOC = stats.loc['DOC'].Value
    # Low-coverage data gets a single mixture model by default
    if options.k is None and DOC < 8:
        options.k = 1
    elif options.k is None:
        options.k = 2

    if options.verbose:
        print("Number of mixture model %s " % options.k)
        print('Computing zscore')

    # Compute zscore
    chrom.compute_zscore(k=options.k, verbose=options.verbose)

    # Save the CSV file of the ROIs (positions beyond the double thresholds)
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    query = "zscore > @high or zscore < @low"
    if feature_dict and chrom.chrom_name in feature_dict:
        f = FilteredGenomeCov(chrom.df.query(query),
                        chrom.thresholds,
                        feature_list=feature_dict[chrom.chrom_name])
    else:
        f = FilteredGenomeCov(chrom.df.query(query), chrom.thresholds)
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    f.df.to_csv("{}/rois.csv".format(directory))

    if options.verbose:
        logger.info("Computing centralness")

    # Let us save the thresholds first and then change it to compute centralness
    thresholds = chrom.thresholds.copy()

    chrom.thresholds.low = -3
    chrom.thresholds.high = 3
    c3 = chrom.get_centralness()

    chrom.thresholds.low = -4
    chrom.thresholds.high = 4
    c4 = chrom.get_centralness()
    chrom.thresholds = thresholds.copy()   # Get back to the original values

    if options.verbose and chrom.thresholds:
        print(chrom.thresholds)

    if options.verbose:
        res = chrom._get_best_gaussian()
        print("sigma and mu of the central distribution: mu=%s, sigma=%s" %
            (round(res["mu"], 3), round(res['sigma'], 3)))
        print("Evenness: %8.3f" % chrom.get_evenness())
        print("Centralness (3 sigma): %f" % round(c3, 3))
        print("Centralness (4 sigma): %f" % round(c4, 4))

    if options.verbose:
        print("\n\n")
Exemplo n.º 8
0
def run_analysis(chrom, options, feature_dict):
    """Run the chunked coverage analysis of a chromosome and report it.

    Runs the running-median/zscore computation chunk by chunk, extracts
    the ROIs, writes ``rois.csv`` plus a JSON summary into
    ``<output_directory>/coverage_reports/<chrom_name>`` and, unless
    ``options.skip_html`` is set, builds the HTML sub-report.
    """
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                        " not optimised for such depth. You may want to "
                        " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    # Clamp the window to a quarter of the chromosome and keep it odd.
    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        # message fixed: the code divides by 4, not 5
        logger.warning("median window length is too long. \n"
            "    Setting the window length automatically to a quarter of\n"
            "    the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)

    # Print some info related to the fitted mixture models. Best effort:
    # the fit keys may be missing if the fit failed.
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
              (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except Exception:  # narrowed from a bare except
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()

    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. "
            "Plots in the HTML reports are skipped"))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                options={"W": options.w_median,
                         "k": options.k,
                         "ROIs": ROIs,
                         "circular": options.circular},
                command=" ".join(["sequana_coverage"] + sys.argv[1:]))
Exemplo n.º 9
0
def run_analysis(chrom, options, feature_dict):
    """Run the chunked coverage analysis of a chromosome and report it.

    Results (``rois.csv``, JSON summary, coverage plot and the HTML
    sub-report) are written into ``<output_directory>/<chrom_name>``.
    Also sets the report ``config`` output dir and sample name.
    """
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                        " not optimised for such depth. You may want to "
                        " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    # Clamp the window to a quarter of the chromosome and keep it odd.
    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        # message fixed: the code divides by 4, not 5
        logger.warning("median window length is too long. \n"
            "    Setting the window length automatically to a quarter of\n"
            "    the chromosome length ({})".format(NW))
    else:
        NW = options.w_median

    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME  ###########
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]
    #########################################################################


    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % NW)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(NW, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)

    # Print some info related to the fitted mixture models. Best effort:
    # the fit keys may be missing if the fit failed.
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
              (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except Exception:  # narrowed from a bare except
        pass


    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs (a redundant assignment of
    # ``directory`` that was immediately overwritten has been removed)
    directory = "{}/{}".format(options.output_directory,
             chrom.chrom_name)
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary(caller="sequana_coverage")

    summary.to_json("{}/sequana_summary_coverage.json".format(directory))
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    chrom.plot_coverage("{}/coverage.png".format(directory))
    logger.info("Creating report in %s. Please wait" % options.output_directory)

    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. "
            "Plots in the HTML reports are skipped"))

    datatable = CoverageModule.init_roi_datatable(ROIs)

    # sample name not important for the standalone
    config.sample_name = "subreports"

    ChromosomeCoverageModule(chrom, datatable,
                options={"W": NW,
                         "k": options.k,
                         "ROIs": ROIs,
                         "circular": options.circular},
                command=" ".join(["sequana_coverage"] + sys.argv[1:]))
Exemplo n.º 10
0
 def __init__(self, dbname):
     """Initialise the builder and create the kraken2 taxonomy directory.

     :param dbname: name of the database, forwarded to the parent class.
     """
     from easydev import mkdirs  # imported lazily, only at construction time

     super(Kraken2Builder, self).__init__(dbname)
     self.path_to_taxonomy = sequana_config_path + os.sep + "kraken2_taxonomy"
     mkdirs(self.path_to_taxonomy)
Exemplo n.º 11
0
def run_analysis(chrom, options, feature_dict):
    """Run the chunked coverage analysis of a chromosome and report it.

    Writes ``rois.csv`` and a JSON summary into
    ``<output_directory>/coverage_reports/<chrom_name>`` and, unless
    ``options.skip_html`` is set, builds the HTML sub-report.
    """
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning(
            "The depth of coverage is below 8. sequana_coverage is"
            " not optimised for such depth. You may want to "
            " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    # Clamp the window to a fifth of the chromosome and keep it odd.
    if options.w_median > len(chrom.df) / 5:
        NW = int(len(chrom.df) / 5)
        if NW % 2 == 0:
            NW += 1
        logger.warning(
            "median window length is too long. \n"
            "    Setting the window length automatically to a fifth of\n"
            "    the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk summarizing the
    # results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k, circular=options.circular)

    # Print some info related to the fitted mixture models. Best effort:
    # the fit keys may be missing if the fit failed.
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info(
            "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
            % (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except Exception:  # narrowed from a bare except
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info(
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
            chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()

    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning(
            ("This chromosome is large (more than {0}). Producing "
             "plots and HTML sub coverage plots only for data from 0 to "
             "{0} bases. Neccesitate to recompute some metrics. Please wait"
             ).format(options.chunksize))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom,
                             datatable,
                             options={
                                 "W": options.w_median,
                                 "k": options.k,
                                 "ROIs": ROIs,
                                 "circular": options.circular
                             })
Exemplo n.º 12
0
def rnadiff(**kwargs):
    """Perform RNA-seq differential analysis.

    This command performs the differential analysis of gene expression. The
    analysis is performed on feature counts generated by a RNA-seq analysis
    (see e.g. https://github.com/sequana/rnaseq pipeline). The analysis is
    performed by DESeq2. A HTML report is created as well as a set of output
    files, including summary table of the analysis.

    To perform this analysis, you will need the GFF file used during the RNA-seq
    analysis, the feature stored altogether in a single file, an experimental
    design file, and the feature and attribute used during the feature count.

    Here is an example:

\b
        sequana rnadiff --annotation Lepto.gff
            --design design.csv --features all_features.out
             --feature-name gene --attribute-name ID


    """
    import pandas as pd
    from sequana.featurecounts import FeatureCount
    from sequana.rnadiff import RNADiffAnalysis, RNADesign
    from sequana.modules_report.rnadiff import RNAdiffModule

    logger.setLevel(kwargs['logger'])

    outdir = kwargs['output_directory']
    feature = kwargs['feature_name']
    attribute = kwargs['attribute_name']
    design = kwargs['design']
    reference = kwargs['reference']

    # Validate feature/attribute names against the GFF when one is given.
    if kwargs['annotation']:
        gff = kwargs['annotation']
        logger.info("Checking annotation file")
        from sequana import GFF3
        g = GFF3(gff)  #.save_annotation_to_csv()
        if feature not in g.features:
            # NOTE(review): unlike the attribute check below, this branch
            # does not sys.exit -- confirm whether a wrong feature name
            # should be fatal here too.
            logger.critical(
                f"{feature} not found in the GFF. Most probably a wrong feature name"
            )
        attributes = g.get_attributes(feature)
        if attribute not in attributes:
            logger.critical(
                f"{attribute} not found in the GFF for the provided feature. Most probably a wrong feature name. Please change --attribute-name option or do not provide any GFF"
            )
            sys.exit(1)
    else:
        gff = None

    design_check = RNADesign(design, reference=reference)

    # Comparisons come either from a user CSV (columns "alternative" and
    # "reference") or from the design itself.
    compa_csv = kwargs['comparisons']
    if compa_csv:
        compa_df = pd.read_csv(compa_csv)
        comparisons = list(zip(compa_df["alternative"], compa_df["reference"]))
    else:
        comparisons = design_check.comparisons

    if kwargs['report_only'] is False:
        logger.info(
            f"Processing features counts and saving into {outdir}/light_counts.csv"
        )
        fc = FeatureCount(kwargs['features'])
        # stdlib replacement for the former easydev.mkdirs(f"{outdir}")
        os.makedirs(outdir, exist_ok=True)
        fc.rnadiff_df.to_csv(f"{outdir}/light_counts.csv")

        logger.info(f"Differential analysis to be saved into ./{outdir}")
        for k in sorted([
                "independent_filtering", "beta_prior", "cooks_cutoff",
                "fit_type", "reference"
        ]):
            logger.info(f"  Parameter {k} set to : {kwargs[k]}")
        r = RNADiffAnalysis(
            f"{outdir}/light_counts.csv",
            design,
            condition=kwargs["condition"],
            comparisons=comparisons,
            fc_feature=feature,
            fc_attribute=attribute,
            outdir=outdir,
            gff=gff,
            cooks_cutoff=kwargs.get("cooks_cutoff"),
            independent_filtering=kwargs.get("independent_filtering"),
            beta_prior=kwargs.get("beta_prior"),
            fit_type=kwargs.get('fit_type'))

        logger.info(f"Saving output files into {outdir}/rnadiff.csv")
        try:
            results = r.run()
            results.to_csv(f"{outdir}/rnadiff.csv")
        except Exception as err:
            logger.error(err)
            sys.exit(1)
        else:
            logger.info("DGE done.")
            # cleanup if successful
            os.remove(f"{outdir}/rnadiff.err")
            os.remove(f"{outdir}/rnadiff.out")
            os.remove(f"{outdir}/rnadiff_light.R")

    logger.info("Reporting. Saving in rnadiff.html")
    report = RNAdiffModule(outdir,
                           kwargs['design'],
                           gff=gff,
                           fc_attribute=attribute,
                           fc_feature=feature,
                           alpha=0.05,
                           log2_fc=0,
                           condition=kwargs["condition"],
                           annot_cols=None,
                           pattern="*vs*_degs_DESeq2.csv")