Exemplo n.º 1
0
    def run(self):
        """Set up and start one analysis job per sample configuration file.

        Steps, in order:

        1. Require the ``project`` argument; make ``post_process`` absolute.
        2. Optionally convert every ``SampleSheet.csv`` found to yaml.
        3. Collect the sample configuration files, filter them with
           ``self._filter_fn`` and optionally replace them with merged
           samples.
        4. Ask for confirmation, validate the sample directories and run
           ``setup_sample`` in each sample directory.
        5. Unless ``only_setup`` is set, launch the pipeline command for
           every (optionally only failed) sample.

        The process working directory is always restored to its original
        value, also when a step raises or a job is skipped.
        """
        if not self._check_pargs(["project"]):
            return
        if self.pargs.post_process:
            self.pargs.post_process = os.path.abspath(self.pargs.post_process)
        basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
        if self.pargs.from_ssheet:
            # Plain loop: the conversion is called for its side effects only
            for fn in find_samples(basedir, pattern="SampleSheet.csv$", **vars(self.pargs)):
                samplesheet_csv_to_yaml(fn)
        flist = find_samples(basedir, **vars(self.pargs))
        # Add filtering on flowcell if necessary
        self._meta.pattern = ".*"
        flist = [x for x in flist if self._filter_fn(x)]
        if self.pargs.merged:
            ##  Setup merged samples and append to flist if new list longer
            flist = setup_merged_samples(flist, **vars(self.pargs))
        if not flist:
            self.log.info("No sample configuration files found")
            return
        if not query_yes_no(
            "Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force
        ):
            return
        # Make absolutely sure analysis directory is a *subdirectory* of the working directory
        validate_sample_directories(flist, basedir)
        orig_dir = os.path.abspath(os.getcwd())

        for run_info in flist:
            os.chdir(os.path.abspath(os.path.dirname(run_info)))
            try:
                setup_sample(run_info, **vars(self.pargs))
            finally:
                # Never leave the process inside a sample directory
                os.chdir(orig_dir)
        if self.pargs.only_setup:
            return
        if self.pargs.only_failed:
            flist = [x for x in flist if self._sample_status(x) == "FAIL"]
        ## Here process files again, removing if requested, and running the pipeline
        for run_info in flist:
            work_dir = os.path.dirname(run_info)
            self.app.log.info("Running analysis defined by config file {}".format(run_info))
            os.chdir(os.path.abspath(work_dir))
            try:
                if self.app.cmd.monitor(work_dir=work_dir):
                    self.app.log.warn("Not running job")
                    continue
                if self.pargs.restart:
                    self.app.log.info("Removing old analysis files in {}".format(work_dir))
                    remove_files(run_info, **vars(self.pargs))
                (cl, platform_args) = run_bcbb_command(run_info, **vars(self.pargs))
                self.app.cmd.command(
                    cl, **{"platform_args": platform_args, "saveJobId": True, "workingDirectory": work_dir}
                )
            finally:
                # Also reached on ``continue`` so a skipped job restores the cwd
                os.chdir(orig_dir)
Exemplo n.º 2
0
 def vcf_summary(self):
     """Compress, index and merge the project's vcf files.

     Every vcf file reported by ``get_vcf_files`` is bgzipped (unless it
     already ends in ``.gz``) and indexed with tabix; files that already
     have a ``.tbi`` next to them are left untouched. The resulting files
     are merged with ``vcf-merge`` into ``all-variants.vcf`` in the
     project's ``intermediate/results/vcf`` directory unless that file
     already exists, and the merged file is then bgzipped and indexed.
     """
     if not self._check_pargs(["project"]):
         return
     project_dir = os.path.abspath(os.path.join(self.app.controller._meta.project_root, self.app.controller._meta.path_id))
     flist = find_samples(project_dir, **vars(self.pargs))
     vcf_d = get_vcf_files(flist, **vars(self.pargs))
     ## Traverse files, copy to result directory, run bgzip and tabix, and merge vcfs to one file
     outdir = os.path.join(project_dir, "intermediate", "results", "vcf")
     if not os.path.exists(outdir):
         self.app.cmd.safe_makedir(outdir)
     # Names of the files as they exist after compression; these, not the
     # pre-bgzip names, are what vcf-merge has to read.
     merge_input = []
     for v in vcf_d.values():
         # FIXME: this should be memoized
         if os.path.exists("{}.tbi".format(v)):
             self.app.log.info("{}.tbi exists; skipping bgzip and tabix operations".format(v))
             merge_input.append(v)
             continue
         if v.endswith(".gz"):
             # Already compressed: index the file itself, not "<v>.gz"
             compressed = v
         else:
             ## bgzip
             self.app.log.info("Running bgzip on {}".format(v))
             self.app.cmd.command(["bgzip", v])
             compressed = "{}.gz".format(v)
         # tabix
         self.app.log.info("Running tabix on {}".format(compressed))
         self.app.cmd.command(["tabix", "-f", "-p", "vcf", compressed])
         merge_input.append(compressed)
     # Make all-variants file
     all_variants = os.path.join(outdir, "all-variants.vcf")
     if not os.path.exists(all_variants):
         self.app.log.info("Merging vcf files {} to {}".format(merge_input, all_variants))
         output = self.app.cmd.command(['vcf-merge'] + merge_input)
         with open(all_variants, "w") as fh:
             fh.write(output)
     self.app.cmd.command(['bgzip', all_variants])
     self.app.cmd.command(['tabix', "-f", "-p", "vcf", "{}.gz".format(all_variants)])
Exemplo n.º 3
0
 def test_sample_table(self):
     """The sample table groups the j_doe_00_01 runs by sample name"""
     table = sample_table(find_samples(j_doe_00_01))
     by_sample = table.groupby("sample")
     # (sample name, number of rows expected in its group)
     expected_rows = (("P001_101_index3", 2), ("P001_102_index6", 1))
     for sample_name, n_rows in expected_rows:
         self.assertEqual(len(by_sample.groups[sample_name]), n_rows)
Exemplo n.º 4
0
 def test_setup_merged_samples(self):
     """Merged sample setup writes a TOTAL config whose first file is the merged fastq"""
     sample_files = find_samples(j_doe_00_05)
     setup_merged_samples(sample_files, dry_run=False)
     merged_dir = os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL")
     with open(os.path.join(merged_dir, "P001_101_index3-bcbb-config.yaml")) as fh:
         conf = yaml.load(fh)
     first_file = conf["details"][0]["files"][0]
     expected = os.path.join(merged_dir, "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz")
     self.assertEqual(first_file, expected)
Exemplo n.º 5
0
def compile_qc(path, application="seqcap", **kw):
    """Perform qc on data without access to statusdb.

    One SampleRunMetrics object is built per multiplexed sample, or per
    ``details`` entry when the entry has no multiplex section; picard
    metrics are read only for the non-multiplexed entries. Each object is
    converted with ``_srm_to_qc`` and assessed with ``assess_qc``, and the
    assessment is written as one line to the ``stdout`` buffer.

    :param path: directory handed to find_samples
    :param application: application name passed to the header and qc assessment
    :param **kw: keyword arguments forwarded to find_samples; ``project`` is
        also used as the sample project name

    :returns: the output data object returned by ``_qc_info_header``
    """
    project = kw.get("project")
    buffers = {'stdout':StringIO(), 'stderr':StringIO()}
    srm_l = []
    ### find_samples excrutiatingly slow for multi-sample projects where we can have > 100k files...
    for config_file in find_samples(path, **kw):
        LOG.debug("Opening config file {}".format(config_file))
        # NOTE(review): yaml.load without an explicit Loader; assumes the
        # config files are trusted - confirm before reading external data
        with open(config_file) as fh:
            runinfo_yaml = yaml.load(fh)
        for info in runinfo_yaml['details']:
            multiplex = info.get("multiplex", None)
            if multiplex:
                for mp in multiplex:
                    srm_l.append(SampleRunMetrics(
                        path=os.path.dirname(config_file),
                        flowcell=runinfo_yaml.get("fc_name", None),
                        date=runinfo_yaml.get("fc_date", None),
                        lane=info.get("lane", None),
                        barcode_name=mp.get("name", None),
                        sample_prj=project,
                        barcode_id=mp.get('barcode_id', None),
                        sequence=mp.get('sequence', None)))
                continue
            metrics = SampleRunMetrics(
                path=os.path.dirname(config_file),
                flowcell=runinfo_yaml.get("fc_name", None),
                date=runinfo_yaml.get("fc_date", None),
                lane=info.get("lane", None),
                barcode_name=info.get("description", None),
                sample_prj=project,
                barcode_id=None,
                sequence=None)
            metrics.read_picard_metrics()
            srm_l.append(metrics)
    output_data = _qc_info_header(project, application, buffers)
    qcdata = [_srm_to_qc(srm) for srm in srm_l]
    for qc in qcdata:
        fields = [str(x) for x in assess_qc(qc, application)]
        output_data["stdout"].write("".join(fields) + "\n")
    return output_data
Exemplo n.º 6
0
    def test_setup_samples(self):
        """Test setting up samples, changing genome to rn4"""
        def read_yaml(path):
            # Load one yaml fixture/config file and return its content
            with open(path) as fh:
                return yaml.load(fh)

        def assert_genome_is_rn4(config):
            # genome_build lives on the first multiplex entry when the
            # first details entry is multiplexed, else on the entry itself
            first = config["details"][0]
            if first.get("multiplex", None):
                self.assertEqual(first["multiplex"][0]["genome_build"], "rn4")
            else:
                self.assertEqual(first["genome_build"], "rn4")

        flist = find_samples(j_doe_00_05)
        for f in flist:
            setup_sample(f, analysis='Align_standard_seqcap', genome_build='rn4', dry_run=False,
                         baits='rat_baits.interval_list', targets='rat_targets.interval_list',
                         num_cores=8, distributed=False)
        for f in flist:
            assert_genome_is_rn4(read_yaml(f))

            config = read_yaml(f.replace("-bcbb-config.yaml", "-post_process.yaml"))
            self.assertEqual(config["custom_algorithms"][ANALYSIS_TYPE]["hybrid_bait"], 'rat_baits.interval_list')
            self.assertEqual(config["custom_algorithms"][ANALYSIS_TYPE]["hybrid_target"], 'rat_targets.interval_list')
            self.assertEqual(config["algorithm"]["num_cores"], 8)

        for f in flist:
            # 'dry_run' was listed twice in the original keyword dict; it is
            # passed once here with the same value
            setup_sample(f, analysis=ANALYSIS_TYPE, genome_build='rn4', dry_run=False,
                         no_only_run=True, google_report=True,
                         baits='rat_baits.interval_list', targets='rat_targets.interval_list',
                         amplicon=True, num_cores=8, distributed=False)
            assert_genome_is_rn4(read_yaml(f))
            config = read_yaml(f.replace("-bcbb-config.yaml", "-post_process.yaml"))
            self.assertEqual(config["algorithm"]["mark_duplicates"], False)
            self.assertEqual(config["custom_algorithms"][ANALYSIS_TYPE]["mark_duplicates"], False)
Exemplo n.º 7
0
 def test_sample_table(self):
     """Check the per-sample row counts of the table built from j_doe_00_01"""
     config_files = find_samples(j_doe_00_01)
     groups = sample_table(config_files).groupby("sample")
     rows_index3 = groups.groups["P001_101_index3"]
     self.assertEqual(len(rows_index3), 2)
     rows_index6 = groups.groups["P001_102_index6"]
     self.assertEqual(len(rows_index6), 1)
Exemplo n.º 8
0
 def bpreport(self):
     """Write a best practice report for the project's samples.

     Requires the ``project`` argument; ``statusdb_project_name`` defaults
     to the project name. Unless ``no_statusdb`` is set, the sample name
     map is looked up in statusdb; a failed lookup (ValueError) is logged
     and the report is generated without a name map. The buffers returned
     by ``best_practice_note`` are appended to the application output.
     """
     if not self._check_pargs(["project"]):
         return
     if not self.pargs.statusdb_project_name:
         self.pargs.statusdb_project_name = self.pargs.project
     basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     if self.pargs.no_statusdb:
         sample_name_map = None
     else:
         p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
         s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
         try:
             sample_name_map = get_scilife_to_customer_name(self.pargs.statusdb_project_name, p_con, s_con, get_barcode_seq=True)
         except ValueError as e:
             self.log.warn(str(e))
             # Report the name that was actually looked up, which may
             # differ from the project name
             self.log.warn("No such project {} defined in statusdb; try using option --statusdb_project_name".format(self.pargs.statusdb_project_name))
             sample_name_map = None
     # Copy the parsed arguments: vars() returns the namespace's own dict,
     # so updating it in place would add attributes to self.pargs
     kw = dict(vars(self.pargs))
     kw.update(project_name=self.pargs.project, flist=flist, basedir=basedir, sample_name_map=sample_name_map)
     out_data = best_practice_note(**kw)
     self.log.info("Wrote report to directory {}; use Makefile to generate pdf report".format(basedir))
     for stream in ('stdout', 'stderr', 'debug'):
         self.app._output_data[stream].write(out_data[stream].getvalue())
Exemplo n.º 9
0
 def hs_metrics(self):
     """Run picard CalculateHsMetrics on the project's bam files.

     Requires the ``project`` and ``targets`` arguments; ``baits``
     defaults to ``targets``. Bam files whose name matches
     ``<hs_file_type>.bam$`` are collected from the sample directories
     below the flowcell (or project) directory and, after confirmation,
     each one is processed. The metrics are written next to the input
     with the final extension replaced by ``.hs_metrics``.
     """
     if not self._check_pargs(["project", "targets"]):
         return
     if not self.pargs.baits:
         self.pargs.baits = self.pargs.targets
     self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
     pattern = "{}.bam$".format(self.pargs.hs_file_type)
     def filter_fn(f):
         return re.search(pattern, f) is not None
     ### FIX ME: this isn't caught by _process_args
     path =  self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
     basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
     samples = find_samples(basedir, **vars(self.pargs))
     inc_dirs = [os.path.dirname(x) for x in samples]
     flist = filtered_walk(os.path.join(self.config.get(self.app.controller._meta.label, "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'], include_dirs=inc_dirs)
     if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     picard_home = os.getenv("PICARD_HOME")
     if not picard_home:
         # The command is still built as before; the warning explains why
         # the jar path will not resolve
         self.log.warn("PICARD_HOME is not set; CalculateHsMetrics.jar will not be found")
     # Loop-invariant parts of the command line
     picard_jar = "{}/CalculateHsMetrics.jar".format(picard_home)
     targets = os.path.abspath(self.pargs.targets)
     baits = os.path.abspath(self.pargs.baits)
     for f in flist:
         self.log.info("running CalculateHsMetrics on {}".format(f))
         ### Issue with calling java from
         ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
         ### Actually not an issue: command line arguments have to be done the right way
         # Swap only the final extension: str.replace would also rewrite
         # ".bam" inside directory names and could leave OUTPUT == INPUT
         output = "{}.hs_metrics".format(os.path.splitext(f)[0])
         cl = [
             "java", "-{}".format(self.pargs.java_opts),
             "-jar", picard_jar,
             "INPUT={}".format(f),
             "TARGET_INTERVALS={}".format(targets),
             "BAIT_INTERVALS={}".format(baits),
             "OUTPUT={}".format(output),
             "VALIDATION_STRINGENCY=SILENT",
         ]
         out = self.app.cmd.command(cl)
         if out:
             self.app._output_data["stdout"].write(out.rstrip())
Exemplo n.º 10
0
 def bpreport(self):
     """Write a best practice report for the project's samples.

     Requires the ``project`` argument and, unless ``no_statusdb`` is set,
     the ``statusdb_project_name`` argument used to look up the sample
     name map in statusdb. The buffers returned by ``best_practice_note``
     are appended to the application output.
     """
     if not self._check_pargs(["project"]):
         return
     basedir = os.path.abspath(
         os.path.join(self.app.controller._meta.root_path,
                      self.app.controller._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     if self.pargs.no_statusdb:
         sample_name_map = None
     else:
         if not self._check_pargs(["statusdb_project_name"]):
             return
         p_con = ProjectSummaryConnection(
             dbname=self.app.config.get("db", "projects"),
             **vars(self.app.pargs))
         s_con = SampleRunMetricsConnection(
             dbname=self.app.config.get("db", "samples"),
             **vars(self.app.pargs))
         sample_name_map = get_scilife_to_customer_name(
             self.pargs.statusdb_project_name, p_con, s_con)
     # Copy the parsed arguments: vars() returns the namespace's own dict,
     # so updating it in place would add attributes to self.pargs
     kw = dict(vars(self.pargs))
     kw.update(project_name=self.pargs.project,
               flist=flist,
               basedir=basedir,
               sample_name_map=sample_name_map)
     out_data = best_practice_note(**kw)
     self.log.info(
         "Wrote report to directory {}; use Makefile to generate pdf report"
         .format(basedir))
     for stream in ('stdout', 'stderr', 'debug'):
         self.app._output_data[stream].write(out_data[stream].getvalue())
Exemplo n.º 11
0
    def best_practice(self):
        """Deliver best practice analysis results to a project INBOX.

        Requires the ``project`` and ``uppmax_project`` arguments. The
        destination is ``/proj/<uppmax_project>/INBOX/<name>`` where
        ``<name>`` is ``outdir``, else ``statusdb_project_name``, else
        ``project``. After confirmation, files matching the delivery
        patterns (yaml and metrics files, plus bam and vcf related files
        unless ``no_bam`` / ``no_vcf`` is set) are transferred from every
        sample directory. With ``size`` set, the summed size of the source
        files is written to the stderr buffer.
        """
        if not self._check_pargs(["project", "uppmax_project"]):
            return
        project_path = os.path.normpath(
            os.path.join("/proj", self.pargs.uppmax_project))
        if not os.path.exists(project_path):
            self.log.warn("No such project {}; skipping".format(
                self.pargs.uppmax_project))
            return
        # Destination precedence: outdir, statusdb project name, project
        delivery_name = (self.pargs.outdir
                         or self.pargs.statusdb_project_name
                         or self.pargs.project)
        outpath = os.path.join(project_path, "INBOX", delivery_name)
        if not query_yes_no(
                "Going to deliver data to {}; continue?".format(outpath)):
            return
        if not os.path.exists(outpath):
            self.app.cmd.safe_makedir(outpath)
        basedir = os.path.abspath(
            os.path.join(self._meta.root_path, self._meta.path_id))
        flist = find_samples(basedir, **vars(self.pargs))
        if not len(flist) > 0:
            self.log.info("No samples/sample configuration files found")
            return

        # Setup pattern; defined before the filter that closes over it
        plist = [".*.yaml$", ".*.metrics$"]
        if not self.pargs.no_bam:
            plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
            plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
        if not self.pargs.no_vcf:
            plist.append(".*.vcf$")
            plist.append(".*.vcf.gz$")
            plist.append(".*.tbi$")
            plist.append(".*.tsv$")
        pattern = "|".join(plist)

        def filter_fn(f):
            # pattern is never empty: plist always holds the yaml and
            # metrics entries
            return re.search(pattern, f) is not None

        size = 0
        for f in flist:
            path = os.path.dirname(f)
            sources = filtered_walk(path,
                                    filter_fn=filter_fn,
                                    exclude_dirs=BCBIO_EXCLUDE_DIRS)
            # Swap only the first occurrence of basedir (the path prefix)
            targets = [src.replace(basedir, outpath, 1) for src in sources]
            self._transfer_files(sources, targets)
            if self.pargs.size:
                size += sum(os.stat(src).st_size for src in sources)
        if self.pargs.size:
            self.app._output_data['stderr'].write(
                "\n********************************\nEstimated delivery size: {:.1f}G\n********************************"
                .format(size / 1e9))
Exemplo n.º 12
0
 def test_find_samples_from_file(self):
     """Find samples defined in file with empty lines and erroneous names"""
     stray_config = os.path.join(j_doe_00_05, "P001_101_index3-bcbb-config.yaml")
     with open(stray_config, "w") as fh:
         fh.write("\n")
     try:
         flist = find_samples(j_doe_00_05, sample=os.path.join(j_doe_00_05, "samples.txt"))
         validate_sample_directories(flist, j_doe_00_05)
         self.assertEqual(len(flist),2)
     finally:
         # Remove the temporary config even when a check above fails, so
         # the fixture directory stays clean for other tests
         os.unlink(stray_config)
Exemplo n.º 13
0
 def test_merge_sample_config(self):
     """Merging the P001_101_index3 configs puts both flowcell fastq files in TOTAL"""
     grouped = _group_samples(find_samples(j_doe_00_05))
     out_d = os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL")
     if not os.path.exists(out_d):
         os.makedirs(out_d)
     merge_sample_config(grouped["P001_101_index3"].values(), "P001_101_index3", out_d=out_d, dry_run=False)
     merged_fastq = (
         "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz",
         "P001_101_index3_C003CCCXX_TGACCA_L001_R1_001.fastq.gz",
     )
     for fastq in merged_fastq:
         self.assertTrue(os.path.exists(os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL", fastq)))
Exemplo n.º 14
0
 def test_remove_dirs(self):
     """The fastqc directory is among the directories selected for removal"""
     keep_files = [
         "-post_process.yaml$", "-post_process.yaml.bak$",
         "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
         "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
         "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
     ]
     keep_regex = "|".join(keep_files)
     def remove_filter_fn(f):
         # Selected for removal when no keep pattern matches
         return re.search(keep_regex, f) is None
     for config_file in find_samples(j_doe_00_05):
         candidates = filtered_walk(os.path.dirname(config_file), remove_filter_fn, get_dirs=True)
         dir_names = [os.path.basename(d) for d in candidates]
         self.assertIn("fastqc", dir_names)
Exemplo n.º 15
0
 def test_remove_files(self):
     """01_analysis_start.txt is never among the files selected for removal"""
     keep_files = [
         "-post_process.yaml$", "-post_process.yaml.bak$",
         "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
         "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
         "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
         "^[0-9][0-9]_.*.txt$",
     ]
     keep_regex = "|".join(keep_files)
     def remove_filter_fn(f):
         # Selected for removal when no keep pattern matches
         return re.search(keep_regex, f) is None
     for config_file in find_samples(j_doe_00_05):
         candidates = filtered_walk(os.path.dirname(config_file), remove_filter_fn)
         file_names = [os.path.basename(p) for p in candidates]
         self.assertNotIn("01_analysis_start.txt", file_names)
Exemplo n.º 16
0
 def vcf_summary(self):
     """Compress, index and merge the project's vcf files.

     Every vcf file reported by ``get_vcf_files`` is bgzipped (unless it
     already ends in ``.gz``) and indexed with tabix; files that already
     have a ``.tbi`` next to them are left untouched. Unless
     ``all-variants.vcf`` already exists in the project's
     ``intermediate/results/vcf`` directory, the resulting files are
     merged into it with ``vcf-merge`` and the merged file is bgzipped
     and indexed.
     """
     if not self._check_pargs(["project"]):
         return
     project_dir = os.path.abspath(
         os.path.join(self.app.controller._meta.project_root,
                      self.app.controller._meta.path_id))
     flist = find_samples(project_dir, **vars(self.pargs))
     vcf_d = get_vcf_files(flist, **vars(self.pargs))
     ## Traverse files, copy to result directory, run bgzip and tabix, and merge vcfs to one file
     outdir = os.path.join(project_dir, "intermediate", "results", "vcf")
     vcf_out = []
     if not os.path.exists(outdir):
         self.app.cmd.safe_makedir(outdir)
     for v in vcf_d.values():
         # FIXME: this should be memoized
         if os.path.exists("{}.tbi".format(v)):
             self.app.log.info(
                 "{}.tbi exists; skipping bgzip and tabix operations".
                 format(v))
             vcf_out.append(v)
             continue
         if v.endswith(".gz"):
             # Already compressed: index the file itself, not "<v>.gz"
             compressed = v
         else:
             ## bgzip
             self.app.log.info("Running bgzip on {}".format(v))
             self.app.cmd.command(["bgzip", v])
             compressed = "{}.gz".format(v)
         vcf_out.append(compressed)
         # tabix
         self.app.log.info("Running tabix on {}".format(compressed))
         self.app.cmd.command(["tabix", "-f", "-p", "vcf", compressed])
     # Make all-variants file
     all_variants = os.path.join(outdir, "all-variants.vcf")
     if not os.path.exists(all_variants):
         self.app.log.debug("Merging vcf files {} to {}".format(
             vcf_out, all_variants))
         self.app.log.info("Merging {} vcf files to {}".format(
             len(vcf_out), all_variants))
         output = self.app.cmd.command(['vcf-merge'] + vcf_out)
         with open(all_variants, "w") as fh:
             fh.write(output)
         self.app.cmd.command(['bgzip', all_variants])
         self.app.cmd.command(
             ['tabix', "-f", "-p", "vcf", "{}.gz".format(all_variants)])
Exemplo n.º 17
0
 def test_global_post_process(self):
     """With a "global" post_process file the --error platform argument
     points at the -bcbb.err file next to each sample configuration.
     """
     config_files = find_samples(j_doe_00_05)
     pp_name = "{}-post_process.yaml".format(SAMPLES[1])
     pp = os.path.join(j_doe_00_01, SAMPLES[1], FLOWCELL, pp_name)
     # Loaded only to verify that the global post_process file parses
     with open(pp) as fh:
         postprocess = yaml.load(fh)
     for config_file in config_files:
         cl, platform_args = run_bcbb_command(config_file, pp)
         self.assertIn("--error", platform_args)
         error_value = platform_args[platform_args.index("--error") + 1]
         self.assertEqual(error_value, config_file.replace("-bcbb-config.yaml", "-bcbb.err"))
Exemplo n.º 18
0
 def test_find_samples_from_file(self):
     """Find samples defined in file with empty lines and erroneous names"""
     stray_config = os.path.join(j_doe_00_05,
                                 "P001_101_index3-bcbb-config.yaml")
     with open(stray_config, "w") as fh:
         fh.write("\n")
     try:
         flist = find_samples(j_doe_00_05,
                              sample=os.path.join(j_doe_00_05, "samples.txt"))
         validate_sample_directories(flist, j_doe_00_05)
         self.assertEqual(len(flist), 2)
     finally:
         # Remove the temporary config even when a check above fails, so
         # the fixture directory stays clean for other tests
         os.unlink(stray_config)
Exemplo n.º 19
0
 def test_setup_merged_samples(self):
     """The merged TOTAL config lists the merged B002BBBXX fastq as its first file"""
     setup_merged_samples(find_samples(j_doe_00_05), dry_run=False)
     total_dir = os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL")
     merged_config = os.path.join(total_dir,
                                  "P001_101_index3-bcbb-config.yaml")
     with open(merged_config) as fh:
         conf = yaml.load(fh)
     self.assertEqual(
         conf["details"][0]["files"][0],
         os.path.join(
             total_dir,
             "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz"))
Exemplo n.º 20
0
    def hs_metrics(self):
        """Run picard CalculateHsMetrics on the project's bam files.

        Requires the ``project`` and ``targets`` arguments; ``baits``
        defaults to ``targets``. Bam files whose name matches
        ``<hs_file_type>.bam$`` are collected from the sample directories
        below the flowcell (or project) directory and, after confirmation,
        each one is processed. The metrics are written next to the input
        with the final extension replaced by ``.hs_metrics``.
        """
        if not self._check_pargs(["project", "targets"]):
            return
        if not self.pargs.baits:
            self.pargs.baits = self.pargs.targets
        self.log.info(
            "hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools"
        )
        pattern = "{}.bam$".format(self.pargs.hs_file_type)

        def filter_fn(f):
            return re.search(pattern, f) is not None

        ### FIX ME: this isn't caught by _process_args
        path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
        basedir = os.path.abspath(
            os.path.join(self.app.controller._meta.root_path,
                         self.app.controller._meta.path_id))
        samples = find_samples(basedir, **vars(self.pargs))
        inc_dirs = [os.path.dirname(x) for x in samples]
        flist = filtered_walk(
            os.path.join(
                self.config.get(self.app.controller._meta.label, "root"),
                path),
            filter_fn=filter_fn,
            exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'],
            include_dirs=inc_dirs)
        if not query_yes_no(
                "Going to run hs_metrics on {} files. Are you sure you want to continue?"
                .format(len(flist)),
                force=self.pargs.force):
            return
        picard_home = os.getenv("PICARD_HOME")
        if not picard_home:
            # The command is still built as before; the warning explains
            # why the jar path will not resolve
            self.log.warn(
                "PICARD_HOME is not set; CalculateHsMetrics.jar will not be found"
            )
        # Loop-invariant parts of the command line
        picard_jar = "{}/CalculateHsMetrics.jar".format(picard_home)
        targets = os.path.abspath(self.pargs.targets)
        baits = os.path.abspath(self.pargs.baits)
        for f in flist:
            self.log.info("running CalculateHsMetrics on {}".format(f))
            ### Issue with calling java from
            ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
            ### Actually not an issue: command line arguments have to be done the right way
            # Swap only the final extension: str.replace would also rewrite
            # ".bam" inside directory names and could leave OUTPUT == INPUT
            output = "{}.hs_metrics".format(os.path.splitext(f)[0])
            cl = [
                "java",
                "-{}".format(self.pargs.java_opts),
                "-jar",
                picard_jar,
                "INPUT={}".format(f),
                "TARGET_INTERVALS={}".format(targets),
                "BAIT_INTERVALS={}".format(baits),
                "OUTPUT={}".format(output),
                "VALIDATION_STRINGENCY=SILENT",
            ]
            out = self.app.cmd.command(cl)
            if out:
                self.app._output_data["stdout"].write(out.rstrip())
Exemplo n.º 21
0
 def test_global_post_process(self):
     """Running with a "global" post_process file must set the --error
     platform argument to the sample's own -bcbb.err file.
     """
     sample_configs = find_samples(j_doe_00_05)
     pp = os.path.join(j_doe_00_01, SAMPLES[1], FLOWCELL,
                       "{}-post_process.yaml".format(SAMPLES[1]))
     # Loaded only to verify that the global post_process file parses
     with open(pp) as fh:
         postprocess = yaml.load(fh)
     for sample_config in sample_configs:
         expected_err = sample_config.replace("-bcbb-config.yaml",
                                              "-bcbb.err")
         cl, platform_args = run_bcbb_command(sample_config, pp)
         self.assertIn("--error", platform_args)
         flag_pos = platform_args.index("--error")
         self.assertEqual(platform_args[flag_pos + 1], expected_err)
Exemplo n.º 22
0
 def best_practice(self):
     """Deliver best practice analysis results to a project INBOX.

     Requires the ``project`` and ``uppmax_project`` arguments. The
     destination is ``/proj/<uppmax_project>/INBOX/<name>`` where
     ``<name>`` is ``outdir``, else ``statusdb_project_name``, else
     ``project``. After confirmation, files matching the delivery patterns
     (yaml and metrics files, plus bam and vcf related files unless
     ``no_bam`` / ``no_vcf`` is set) are transferred from every sample
     directory; with ``flowcell`` set only configurations located in a
     directory of that name are used. With ``size`` set, the summed size
     of the source files is written to the stderr buffer.
     """
     if not self._check_pargs(["project", "uppmax_project"]):
         return
     project_path = os.path.normpath(os.path.join("/proj", self.pargs.uppmax_project))
     if not os.path.exists(project_path):
         self.log.warn("No such project {}; skipping".format(self.pargs.uppmax_project))
         return
     # Destination precedence: outdir, statusdb project name, project
     delivery_name = self.pargs.outdir or self.pargs.statusdb_project_name or self.pargs.project
     outpath = os.path.join(project_path, "INBOX", delivery_name)
     if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
         return
     if not os.path.exists(outpath):
         self.app.cmd.safe_makedir(outpath)
     basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if self.pargs.flowcell:
         flist = [ fl for fl in flist if os.path.basename(os.path.dirname(fl)) == self.pargs.flowcell ]
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     # Setup pattern; defined before the filter that closes over it
     plist = [".*.yaml$", ".*.metrics$"]
     if not self.pargs.no_bam:
         plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
         plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
     if not self.pargs.no_vcf:
         plist.append(".*.vcf$")
         plist.append(".*.vcf.gz$")
         plist.append(".*.tbi$")
         plist.append(".*.tsv$")
     pattern = "|".join(plist)
     def filter_fn(f):
         # pattern is never empty: plist always holds the yaml and
         # metrics entries
         return re.search(pattern, f) is not None
     size = 0
     for f in flist:
         path = os.path.dirname(f)
         sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
         # Swap only the first occurrence of basedir (the path prefix)
         targets = [src.replace(basedir, outpath, 1) for src in sources]
         self._transfer_files(sources, targets)
         if self.pargs.size:
             size += sum(os.stat(src).st_size for src in sources)
     if self.pargs.size:
         self.app._output_data['stderr'].write("\n********************************\nEstimated delivery size: {:.1f}G\n********************************".format(size/1e9))
Exemplo n.º 23
0
def compile_qc(path, application="seqcap", **kw):
    """Compile QC output for the samples under *path* without using statusdb.

    Each sample configuration file returned by ``find_samples(path, **kw)``
    is parsed and turned into ``SampleRunMetrics`` objects: one per entry of
    a lane's ``multiplex`` list, or one per lane when there is no such list.
    The objects are converted with ``_srm_to_qc``, assessed with
    ``assess_qc`` and written, one line per object, to the ``stdout`` buffer.

    :param path: directory handed to ``find_samples``
    :param application: application name forwarded to ``_qc_info_header``
        and ``assess_qc``
    :param **kw: keyword arguments forwarded to ``find_samples``; the
        ``project`` entry is also used as the sample project name

    :returns: the output mapping (created with ``stdout`` and ``stderr``
        ``StringIO`` buffers) after it has passed through ``_qc_info_header``
    """
    buffers = {'stdout': StringIO(), 'stderr': StringIO()}
    project = kw.get("project")
    metrics = []
    ### find_samples is excruciatingly slow for multi-sample projects where we can have > 100k files...
    for config_file in find_samples(path, **kw):
        LOG.debug("Opening config file {}".format(config_file))
        with open(config_file) as handle:
            # NOTE(review): yaml.load is called without an explicit Loader;
            # confirm the config files are trusted input.
            run_info = yaml.load(handle)
        sample_dir = os.path.dirname(config_file)
        for lane_info in run_info['details']:
            # Fields shared by every sample on this lane
            shared = dict(path=sample_dir,
                          flowcell=run_info.get("fc_name", None),
                          date=run_info.get("fc_date", None),
                          lane=lane_info.get("lane", None),
                          sample_prj=project)
            multiplex = lane_info.get("multiplex", None)
            if not multiplex:
                srm = SampleRunMetrics(
                    barcode_name=lane_info.get("description", None),
                    barcode_id=None,
                    sequence=None,
                    **shared)
                srm.read_picard_metrics()
                metrics.append(srm)
                continue
            # NOTE(review): picard metrics are only read for non-multiplexed
            # lanes; confirm that skipping them here is intentional.
            for mp in multiplex:
                metrics.append(SampleRunMetrics(
                    barcode_name=mp.get("name", None),
                    barcode_id=mp.get('barcode_id', None),
                    sequence=mp.get('sequence', None),
                    **shared))
    buffers = _qc_info_header(project, application, buffers)
    qc_rows = [_srm_to_qc(srm) for srm in metrics]
    for row in qc_rows:
        fields = [str(value) for value in assess_qc(row, application)]
        buffers["stdout"].write("".join(fields) + "\n")
    return buffers
Exemplo n.º 24
0
 def test_bcbb_command(self):
     """Test output from command, changing analysis to amplicon and
     setting targets and baits.

     Every sample is set up twice, first with ``distributed=False`` and
     then with ``distributed=True``; the command line returned by
     ``run_bcbb_command`` must contain the matching pipeline script.
     """
     # (distributed flag, script expected on the resulting command line)
     cases = [
         (False, "automated_initial_analysis.py"),
         (True, "distributed_nextgen_pipeline.py"),
     ]
     flist = find_samples(j_doe_00_05)
     for f in flist:
         command_file = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")
         for distributed, script in cases:
             setup_sample(
                 f,
                 analysis=ANALYSIS_TYPE,
                 genome_build='rn4',
                 dry_run=False,
                 no_only_run=False,
                 google_report=False,
                 baits='rat_baits.interval_list',
                 targets='rat_targets.interval_list',
                 amplicon=True,
                 num_cores=8,
                 distributed=distributed)
             # The file content is not inspected; opening it only makes the
             # test fail when the command file is missing.
             with open(command_file):
                 pass
             cl, _ = run_bcbb_command(f)
             self.assertIn(script, cl)
Exemplo n.º 25
0
    def test_remove_files(self):
        """Test removing files.

        Files matching any of the keep patterns must never be reported as
        removal candidates; ``01_analysis_start.txt`` is used as the probe.
        """
        keep_files = [
            "-post_process.yaml$", "-post_process.yaml.bak$",
            "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
            "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$",
            "_[0-9]+.fastq.gz$", "^[0-9][0-9]_.*.txt$"
        ]
        # Compile once; the filter is called for every file that is walked
        keep_re = re.compile("|".join(keep_files))

        def remove_filter_fn(f):
            # A file is a removal candidate when no keep pattern matches it
            return keep_re.search(f) is None

        for f in find_samples(j_doe_00_05):
            workdir = os.path.dirname(f)
            remove_files = filtered_walk(workdir, remove_filter_fn)
            self.assertNotIn("01_analysis_start.txt",
                             [os.path.basename(x) for x in remove_files])
Exemplo n.º 26
0
 def test_bcbb_command(self):
     """Test output from command, changing analysis to amplicon and
     setting targets and baits.

     The sample is first set up for a non-distributed run and then for a
     distributed one; in each case the command line returned by
     ``run_bcbb_command`` must name the corresponding pipeline script.
     """
     # Options shared by both setups; only 'distributed' differs
     base_kw = {
         'analysis': ANALYSIS_TYPE,
         'genome_build': 'rn4',
         'dry_run': False,
         'no_only_run': False,
         'google_report': False,
         'baits': 'rat_baits.interval_list',
         'targets': 'rat_targets.interval_list',
         'amplicon': True,
         'num_cores': 8,
     }
     flist = find_samples(j_doe_00_05)
     for f in flist:
         command_file = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")

         setup_sample(f, **dict(base_kw, distributed=False))
         # Reading the file only verifies that it exists and is readable
         with open(command_file) as fh:
             fh.read()
         cl, _ = run_bcbb_command(f)
         self.assertIn("automated_initial_analysis.py", cl)

         setup_sample(f, **dict(base_kw, distributed=True))
         with open(command_file) as fh:
             fh.read()
         cl, _ = run_bcbb_command(f)
         self.assertIn("distributed_nextgen_pipeline.py", cl)
Exemplo n.º 27
0
    def test_remove_dirs(self):
        """Test removing directories before rerunning pipeline.

        Walks each sample's working directory with ``get_dirs=True`` and
        checks that a ``fastqc`` directory is among the removal candidates.
        """
        keep_files = [
            "-post_process.yaml$", "-post_process.yaml.bak$",
            "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
            "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$",
            "_[0-9]+.fastq.gz$"
        ]
        # Compiled once up front instead of on every filter call
        keep_re = re.compile("|".join(keep_files))

        def remove_filter_fn(f):
            # Keep only paths that match none of the keep patterns
            return keep_re.search(f) is None

        flist = find_samples(j_doe_00_05)
        for f in flist:
            workdir = os.path.dirname(f)
            remove_dirs = filtered_walk(workdir,
                                        remove_filter_fn,
                                        get_dirs=True)
            dir_names = [os.path.basename(x) for x in remove_dirs]
            self.assertIn("fastqc", dir_names)
Exemplo n.º 28
0
 def test_merge_sample_config(self):
     """Test merging sample configuration files.

     Merges the grouped configurations of sample ``P001_101_index3`` into
     its ``TOTAL`` directory and checks that the fastq files of both
     flowcells exist there afterwards.
     """
     grouped = _group_samples(find_samples(j_doe_00_05))
     out_d = os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL")
     if not os.path.exists(out_d):
         os.makedirs(out_d)
     merge_sample_config(grouped["P001_101_index3"].values(),
                         "P001_101_index3",
                         out_d=out_d,
                         dry_run=False)
     expected_fastq = (
         "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz",
         "P001_101_index3_C003CCCXX_TGACCA_L001_R1_001.fastq.gz",
     )
     for fastq in expected_fastq:
         merged_path = os.path.join(
             j_doe_00_05, "P001_101_index3", "TOTAL", fastq)
         self.assertTrue(os.path.exists(merged_path))
Exemplo n.º 29
0
 def vcf_summary(self):
     """Compress, index and link the project's vcf files.

     For every file returned by ``get_vcf_files``: a file that is not yet
     gzip-suffixed is run through ``bgzip``, the compressed file is indexed
     with ``tabix``, and both the ``.gz`` file and its ``.tbi`` index are
     linked into ``<project>/intermediate/results/vcf``. In the link name
     ``TOTAL`` is replaced by ``TOTAL_<key>``, where ``<key>`` is the file's
     key in the mapping returned by ``get_vcf_files``.

     Returns ``None``; does nothing if the ``project`` argument is missing.
     """
     if not self._check_pargs(["project"]):
         return
     project_dir = os.path.abspath(
         os.path.join(self.app.controller._meta.project_root,
                      self.app.controller._meta.path_id))
     flist = find_samples(project_dir, **vars(self.pargs))
     vcf_d = get_vcf_files(flist)
     outdir = os.path.join(project_dir, "intermediate", "results", "vcf")
     if not os.path.exists(outdir):
         self.app.cmd.safe_makedir(outdir)
     gz_ext = ".gz"
     for k, v in vcf_d.items():
         LOG.debug("Processing vcf file {}".format(v))
         if v.endswith(gz_ext):
             # Already compressed: strip only the trailing extension so that
             # v names the uncompressed file in both branches.
             v = v[:-len(gz_ext)]
         else:
             ## bgzip
             LOG.info("Running bgzip on {}".format(v))
             self.app.cmd.command(["bgzip", v])
         # Computed for both branches; previously the link target was only
         # set for files that were already compressed.
         tgt = os.path.join(
             outdir,
             os.path.basename(v).replace("TOTAL", "TOTAL_{}".format(k)))
         ## tabix
         LOG.info("Running tabix on {}.gz".format(v))
         self.app.cmd.command(["tabix", "-f", "-p", "vcf", "{}.gz".format(v)])
         self.app.cmd.link("{}.gz".format(v), "{}.gz".format(tgt))
         self.app.cmd.link("{}.gz.tbi".format(v), "{}.gz.tbi".format(tgt))
Exemplo n.º 30
0
 def test_find_samples(self):
     """Check how many samples are found, with and without ``only_failed``."""
     all_samples = find_samples(j_doe_00_05)
     self.assertIn(len(all_samples), [3, 4])
     failed_samples = find_samples(j_doe_00_05, only_failed=True)
     self.assertIn(len(failed_samples), [0, 1])
Exemplo n.º 31
0
 def test_find_samples_from_file_with_yaml(self):
     """Find samples defined in a file with empty lines, with a
     bcbb-config.yaml file lying directly under the root directory;
     validating the resulting sample directories must raise."""
     sample_file = os.path.join(j_doe_00_05, "samples2.txt")
     found = find_samples(j_doe_00_05, sample=sample_file)
     self.assertRaises(Exception, validate_sample_directories,
                       found, j_doe_00_05)
Exemplo n.º 32
0
    def run(self):
        """Set up and launch the pipeline for every sample of a project.

        Steps, in order:

        1. Require the ``project`` argument; make ``post_process`` absolute.
        2. Optionally convert sample sheets to yaml (``from_ssheet``).
        3. Collect sample configuration files, filter them with
           ``self._filter_fn`` and optionally set up merged samples.
        4. Ask for confirmation (skipped questions depend on ``force``).
        5. Validate the sample directories and run ``setup_sample`` in each.
        6. Unless ``only_setup`` is given, run the command returned by
           ``run_bcbb_command`` for each sample (only those whose status is
           ``"FAIL"`` when ``only_failed`` is given), skipping samples for
           which ``self.app.cmd.monitor`` returns a true value.

        The working directory is always restored after each sample.
        """
        if not self._check_pargs(["project"]):
            return
        if self.pargs.post_process:
            self.pargs.post_process = os.path.abspath(self.pargs.post_process)
        basedir = os.path.abspath(
            os.path.join(self.app.controller._meta.root_path,
                         self.app.controller._meta.path_id))
        if self.pargs.from_ssheet:
            for fn in find_samples(basedir, pattern="SampleSheet.csv$",
                                   **vars(self.pargs)):
                samplesheet_csv_to_yaml(fn)
        flist = find_samples(basedir, **vars(self.pargs))
        # Add filtering on flowcell if necessary
        self._meta.pattern = ".*"
        flist = [x for x in flist if self._filter_fn(x)]
        if self.pargs.merged:
            ##  Setup merged samples and append to flist if new list longer
            flist = setup_merged_samples(flist, **vars(self.pargs))
        if not flist:
            self.log.info("No sample configuration files found")
            return
        if not query_yes_no(
                "Going to start {} jobs... Are you sure you want to continue?".
                format(len(flist)),
                force=self.pargs.force):
            return
        # Make absolutely sure analysis directory is a *subdirectory* of the working directory
        validate_sample_directories(flist, basedir)
        orig_dir = os.path.abspath(os.getcwd())

        for run_info in flist:
            os.chdir(os.path.abspath(os.path.dirname(run_info)))
            try:
                setup_sample(run_info, **vars(self.pargs))
            finally:
                # Restore the working directory even if setup fails
                os.chdir(orig_dir)
        if self.pargs.only_setup:
            return
        if self.pargs.only_failed:
            flist = [x for x in flist if self._sample_status(x) == "FAIL"]
        ## Here process files again, removing if requested, and running the pipeline
        for run_info in flist:
            work_dir = os.path.dirname(run_info)
            self.app.log.info(
                "Running analysis defined by config file {}".format(run_info))
            os.chdir(os.path.abspath(work_dir))
            try:
                if self.app.cmd.monitor(work_dir=work_dir):
                    self.app.log.warn("Not running job")
                    continue
                if self.pargs.restart:
                    self.app.log.info(
                        "Removing old analysis files in {}".format(work_dir))
                    remove_files(run_info, **vars(self.pargs))
                (cl, platform_args) = run_bcbb_command(run_info,
                                                       **vars(self.pargs))
                self.app.cmd.command(
                    cl, **{
                        'platform_args': platform_args,
                        'saveJobId': True,
                        'workingDirectory': work_dir
                    })
            finally:
                # Also runs on `continue`, which previously left the process
                # inside the sample directory.
                os.chdir(orig_dir)
Exemplo n.º 33
0
 def test_summarize_variants(self):
     """Smoke test: collect the vcf files of the samples found."""
     get_vcf_files(find_samples(j_doe_00_01))
Exemplo n.º 34
0
 def test_summarize_variants(self):
     """Run ``get_vcf_files`` on the samples found; nothing is asserted."""
     sample_configs = find_samples(j_doe_00_01)
     get_vcf_files(sample_configs)
Exemplo n.º 35
0
    def test_setup_samples(self):
        """Test setting up samples, changing genome to rn4.

        First pass: set up every sample, then check the genome build in the
        sample configuration and the bait/target/core settings in the
        post-process configuration. Second pass: set up again with the
        amplicon options and check that duplicate marking is switched off.
        """
        def load_yaml(path):
            # NOTE(review): yaml.load is called without an explicit Loader;
            # confirm these test fixtures are the only input.
            with open(path, "r") as fh:
                return yaml.load(fh)

        def assert_genome_build(config_file):
            # The genome build sits on the first multiplex entry when the
            # lane is multiplexed, otherwise on the lane itself.
            details = load_yaml(config_file)["details"][0]
            if details.get("multiplex", None):
                self.assertEqual(details["multiplex"][0]["genome_build"],
                                 "rn4")
            else:
                self.assertEqual(details["genome_build"], "rn4")

        def post_process_file(config_file):
            return config_file.replace("-bcbb-config.yaml",
                                       "-post_process.yaml")

        # Options shared by both setup passes
        common_kw = {
            'genome_build': 'rn4',
            'dry_run': False,
            'baits': 'rat_baits.interval_list',
            'targets': 'rat_targets.interval_list',
            'num_cores': 8,
            'distributed': False,
        }
        flist = find_samples(j_doe_00_05)
        for f in flist:
            setup_sample(f, analysis='Align_standard_seqcap', **common_kw)
        for f in flist:
            assert_genome_build(f)
            config = load_yaml(post_process_file(f))
            custom = config["custom_algorithms"][ANALYSIS_TYPE]
            self.assertEqual(custom["hybrid_bait"],
                             'rat_baits.interval_list')
            self.assertEqual(custom["hybrid_target"],
                             'rat_targets.interval_list')
            self.assertEqual(config["algorithm"]["num_cores"], 8)

        for f in flist:
            setup_sample(f,
                         analysis=ANALYSIS_TYPE,
                         no_only_run=True,
                         google_report=True,
                         amplicon=True,
                         **common_kw)
            assert_genome_build(f)
            config = load_yaml(post_process_file(f))
            self.assertEqual(config["algorithm"]["mark_duplicates"], False)
            self.assertEqual(
                config["custom_algorithms"][ANALYSIS_TYPE]["mark_duplicates"],
                False)
Exemplo n.º 36
0
 def test_find_samples(self):
     """Test the number of samples found, overall and failed-only."""
     # (keyword arguments for find_samples, accepted result sizes)
     expectations = (
         ({}, [3,4]),
         ({'only_failed':True}, [0,1]),
     )
     for options, allowed_sizes in expectations:
         found = find_samples(j_doe_00_05, **options)
         self.assertIn(len(found), allowed_sizes)
Exemplo n.º 37
0
 def test_find_samples_from_file_with_yaml(self):
     """Samples listed in a file with empty lines, plus a bcbb-config.yaml
     file directly under the root directory, must fail directory validation."""
     samples_txt = os.path.join(j_doe_00_05, "samples2.txt")
     found = find_samples(j_doe_00_05, sample=samples_txt)
     with self.assertRaises(Exception):
         validate_sample_directories(found, j_doe_00_05)
Exemplo n.º 38
0
 def test_setup_merged_samples(self):
     """Test setting up merged samples.

     Calls ``setup_merged_samples`` with ``dry_run=False`` on the sample
     configuration files found under ``j_doe_00_05``.
     """
     flist = find_samples(j_doe_00_05)
     setup_merged_samples(flist, **{'dry_run':False})