Example #1
    def update(self):
        if not self._check_pargs(["sample_prj"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

        if self.pargs.project_id:
            self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
            for s in samples:
                if not s.get("project_id", None) is None:
                    if not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                        continue
                s["project_id"] = self.pargs.project_id
                s_con.save(s)
        if self.pargs.names:
            self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
            if os.path.exists(self.pargs.names):
                with open(self.pargs.names) as fh:
                    names_d = json.load(fh)
            else:
                names_d = ast.literal_eval(self.pargs.names)
            samples_sort = sorted(samples, key=lambda s: s["barcode_name"])
            groups = {}
            for k, g in itertools.groupby(samples_sort, key=lambda x: x["barcode_name"]):
                groups[k] = list(g)
            for barcode_name in names_d:
                sample_list = groups.get(barcode_name, None)
                if not sample_list:
                    continue
                for s in sample_list:
                    if not s.get("project_sample_name", None) is None:
                        if not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                            continue
                    s["project_sample_name"] = names_d[barcode_name]
                    s_con.save(s)
        else:
            self.app.log.info("Trying to use extensive matching...")
            p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
            project_name = self.pargs.sample_prj
            if self.pargs.project_alias:
                project_name = self.pargs.project_alias
            for s in samples:
                project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
                if project_sample:
                    self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
                    s["project_sample_name"] = project_sample["sample_name"]
                    s_con.save(s)
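
All of these examples gate destructive or bulk actions on query_yes_no, whose implementation is not shown on this page. A minimal sketch consistent with the call sites above (a question string plus optional default and force keyword arguments, returning a boolean) could look like the following; the body is an assumption, not the project's actual helper.

def query_yes_no(question, default="yes", force=False):
    """Ask a yes/no question and return the answer as a boolean (sketch only).

    force skips the prompt entirely, which is what the force=self.pargs.force
    call sites above rely on for non-interactive runs.
    """
    valid = {"yes": True, "y": True, "no": False, "n": False}
    if force:
        return True
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        # Python 3; under Python 2 this would be raw_input
        choice = input(question + prompt).strip().lower() or default
        if choice in valid:
            return valid[choice]
        print("Please respond with 'yes' or 'no'.")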
Example #2
    def purge_alignments(self):
        """Cleanup sam and bam files. In some cases, sam files
        persist. If the corresponding bam file exists, replace the sam
        file contents with a message that the file has been removed to
        save space.
        """
        pattern = ".sam$"
        def purge_filter(f):
            if not pattern:
                return
            return re.search(pattern, f) is not None

        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter)
        if len(flist) == 0:
            self.app.log.info("No sam files found")
            return
        if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} sam files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
            return
        for f in flist:
            self.app.log.info("Purging sam file {}".format(f))
            self.app.cmd.safe_unlink(f)
            if os.path.exists(f.replace(".sam", ".bam")):
                self.app.cmd.write(f, "File removed to save disk space: SAM converted to BAM")

        ## Find bam files in alignments subfolders. Note that purge_filter
        ## reads 'pattern' at call time (closure late binding), so reassigning
        ## it here makes the same filter match bam files.
        pattern = ".bam$"
        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter, include_dirs=["alignments"])
        for f in flist:
            f_tgt = [f.replace(".bam", "-sort.bam"), os.path.join(os.path.dirname(os.path.dirname(f)), os.path.basename(f))]
            for tgt in f_tgt:
                if os.path.exists(tgt):
                    self.app.log.info("Purging bam file {}".format(f))
                    self.app.cmd.safe_unlink(f)
                    self.app.cmd.write(f, "File removed to save disk space: Moved to {}".format(os.path.abspath(tgt)))
Example #3
 def touch_finished(self):
     if not self._check_pargs(["project", "sample"]):
         return
     if os.path.exists(self.pargs.sample) and os.path.isfile(self.pargs.sample):
         with open(self.pargs.sample) as fh:
             slist = [x.rstrip() for x in fh.readlines()]
     else:
         slist = [self.pargs.sample]
     for s in slist:
         spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
         if not os.path.exists(spath):
             self.app.log.warn("No such path {}; skipping".format(spath))
             continue
         rsync_src = spath + os.sep
         rsync_tgt = os.path.join(self.app.config.get("runqc", "root"), self.pargs.project, s) + os.sep
         cl = ["rsync {} {} {}".format(self.app.config.get("runqc", "rsync_sample_opts"), rsync_src, rsync_tgt)]
         self.app.log.info("Checking if runqc uptodate with command '{}'".format(" ".join(cl)))
         out = self.app.cmd.command(cl, **{'shell':True})
         if not self.pargs.dry_run and "total size is 0" not in out:
             self.app.log.info("Some files need to be updated. Rsync output:")
             print("********")
             print(out)
             print("********")
             continue
         if not query_yes_no("Going to touch file {} for sample {}; continue?".format(FINISHED_FILE, s), force=self.pargs.force):
             continue
         self.app.log.info("Touching file {} for sample {}".format(FINISHED_FILE, s))
         with open(os.path.join(spath, FINISHED_FILE), "w") as fh:
             t_utc = utc_time()
             fh.write(t_utc)
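
The utc_time() helper, whose return value is written verbatim into the flag file, is also not shown on this page. A plausible sketch, assuming it simply returns an ISO 8601 timestamp string (the real helper may use a different format):

from datetime import datetime

def utc_time():
    """Current UTC time as an ISO 8601 string (sketch only)."""
    return datetime.utcnow().isoformat() + "Z"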
Example #4
 def remove_finished(self):
     if not self._check_pargs(["project"]):
         return
     # Don't filter out files
     def filter_fn(f):
         return True
     slist = os.listdir(os.path.join(self._meta.root_path, self._meta.path_id))
     for s in slist:
         spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
         if not os.path.isdir(spath):
             continue
         if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
             self.app.log.info("Sample {} not finished; skipping".format(s))
             continue
         flist = filtered_walk(spath, filter_fn)
         dlist = filtered_walk(spath, filter_fn, get_dirs=True)
         if os.path.exists(os.path.join(spath, REMOVED_FILE)):
             self.app.log.info("Sample {} already removed; skipping".format(s))
             continue
         if len(flist) > 0 and not query_yes_no("Will remove directory {} containing {} files; continue?".format(s, len(flist)), force=self.pargs.force):
             continue
         self.app.log.info("Removing {} files from {}".format(len(flist), spath))            
         for f in flist:
             if f == os.path.join(spath, FINISHED_FILE):
                 continue
             self.app.cmd.safe_unlink(f)
         self.app.log.info("Removing {} directories from {}".format(len(dlist), spath))
         for d in sorted(dlist, reverse=True):
             self.app.cmd.safe_rmdir(d)
         if not self.pargs.dry_run:
             with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                 t_utc = utc_time()
                 fh.write(t_utc)
Example #5
def remove_files(f, **kw):
    ## Remove old files if requested
    keep_files = [
        "-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$",
        "-bcbb-config.yaml.bak$", "-bcbb-command.txt$",
        "-bcbb-command.txt.bak$", "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
        "_[0-9]+_fastq.txt.gz$", "_[0-9]+_fastq.txt$", "^[0-9][0-9]_.*.txt$",
        "JOBID", "PID"
    ]
    pattern = "|".join(keep_files)

    def remove_filter_fn(f):
        return re.search(pattern, f) is None

    workdir = os.path.dirname(f)
    remove_files = filtered_walk(workdir, remove_filter_fn)
    remove_dirs = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if len(remove_files) > 0 and query_yes_no(
            "Going to remove {} files and {} directories... Are you sure you want to continue?"
            .format(len(remove_files), len(remove_dirs)),
            force=kw['force']):
        for x in remove_files:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories by path length so we don't try to remove a non-empty dir
        for x in sorted(remove_dirs, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
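
The keep-list above inverts the usual filter semantics: anything matching the joined pattern is kept, everything else is removed. A standalone illustration of that logic (the file names here are hypothetical):

import re

keep_files = ["-bcbb-config.yaml$", "_[0-9]+.fastq.gz$", "JOBID"]
pattern = "|".join(keep_files)

candidates = ["7_120101_AB001AAXX_1.fastq.gz", "sample-bcbb-config.yaml",
              "alignments/sample.sam", "tmp/JOBID"]
for f in candidates:
    print("{}: {}".format(f, "keep" if re.search(pattern, f) else "remove"))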
Example #6
    def best_practice(self):
        if not self._check_pargs(["project", "uppmax_project"]):
            return
        project_path = os.path.normpath(
            os.path.join("/proj", self.pargs.uppmax_project))
        if not os.path.exists(project_path):
            self.log.warn("No such project {}; skipping".format(
                self.pargs.uppmax_project))
            return
        if self.pargs.outdir:
            outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
        else:
            outpath = os.path.join(
                project_path, "INBOX", self.pargs.statusdb_project_name
            ) if self.pargs.statusdb_project_name else os.path.join(
                project_path, "INBOX", self.pargs.project)
        if not query_yes_no(
                "Going to deliver data to {}; continue?".format(outpath)):
            return
        if not os.path.exists(outpath):
            self.app.cmd.safe_makedir(outpath)
        kw = vars(self.pargs)
        basedir = os.path.abspath(
            os.path.join(self._meta.root_path, self._meta.path_id))
        flist = find_samples(basedir, **vars(self.pargs))
        if not len(flist) > 0:
            self.log.info("No samples/sample configuration files found")
            return

        def filter_fn(f):
            # 'pattern' is assigned below; the closure reads it at call time
            # (late binding), after the pattern list has been set up.
            if not pattern:
                return
            return re.search(pattern, f) is not None

        # Setup pattern
        plist = [".*.yaml$", ".*.metrics$"]
        if not self.pargs.no_bam:
            plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
            plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
        if not self.pargs.no_vcf:
            plist.append(".*.vcf$")
            plist.append(".*.vcf.gz$")
            plist.append(".*.tbi$")
            plist.append(".*.tsv$")
        pattern = "|".join(plist)
        size = 0
        for f in flist:
            path = os.path.dirname(f)
            sources = filtered_walk(path,
                                    filter_fn=filter_fn,
                                    exclude_dirs=BCBIO_EXCLUDE_DIRS)
            targets = [src.replace(basedir, outpath) for src in sources]
            self._transfer_files(sources, targets)
            if self.pargs.size:
                statinfo = [os.stat(src).st_size for src in sources]
                size = size + sum(statinfo)
        if self.pargs.size:
            self.app._output_data['stderr'].write(
                "\n********************************\nEstimated delivery size: {:.1f}G\n********************************"
                .format(size / 1e9))
Example #7
 def hs_metrics(self):
     if not self._check_pargs(["project", "region_file"]):
         return
     if not self.pargs.bait_file:
         self.pargs.bait_file = self.pargs.region_file
     self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
     pattern = "{}.bam$".format(self.pargs.hs_file_type)
     def filter_fn(f):
         return re.search(pattern, f) is not None
     ### FIX ME: this isn't caught by _process_args
     path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
     flist = filtered_walk(os.path.join(self.config.get("production", "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'])
     if self.pargs.input_file:
         flist = [os.path.abspath(self.pargs.input_file)]
     if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     for f in flist:
         self.log.info("running CalculateHsMetrics on {}".format(f))
         ### Issue with calling java from
         ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
         ### Actually not an issue: command line arguments have to be done the right way
         cl = ["java"] + ["-{}".format(self.pargs.java_opts)] +  ["-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))] + ["INPUT={}".format(f)] + ["TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.region_file))] + ["BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.bait_file))] +  ["OUTPUT={}".format(f.replace(".bam", ".hs_metrics"))] + ["VALIDATION_STRINGENCY=SILENT"]
         out = self.app.cmd.command(cl)
         if out:
             self.app._output_data["stdout"].write(out.rstrip())
Example #8
 def hs_metrics(self):
     if not self._check_pargs(["project", "targets"]):
         return
     if not self.pargs.baits:
         self.pargs.baits = self.pargs.targets
     self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
     pattern = "{}.bam$".format(self.pargs.hs_file_type)
     def filter_fn(f):
         return re.search(pattern, f) is not None
     ### FIX ME: this isn't caught by _process_args
     path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
     basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
     samples = find_samples(basedir, **vars(self.pargs))
     inc_dirs = [os.path.dirname(x) for x in samples]
     flist = filtered_walk(os.path.join(self.config.get(self.app.controller._meta.label, "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'], include_dirs=inc_dirs)
     if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     for f in flist:
         self.log.info("running CalculateHsMetrics on {}".format(f))
         ### Issue with calling java from
         ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
         ### Actually not an issue: command line arguments have to be done the right way
         cl = ["java"] + ["-{}".format(self.pargs.java_opts)] +  ["-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))] + ["INPUT={}".format(f)] + ["TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.targets))] + ["BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.baits))] +  ["OUTPUT={}".format(f.replace(".bam", ".hs_metrics"))] + ["VALIDATION_STRINGENCY=SILENT"]
         out = self.app.cmd.command(cl)
         if out:
             self.app._output_data["stdout"].write(out.rstrip())
Example #9
def rm_tarball(arch, tarball):
    """Remove a tarball
    """    
    if not query_yes_no("Going to remove tarball {}. This action can not be undone. Are you sure you want to continue?".format(tarball), 
                        force=arch.pargs.force):
        return
    arch.log.info("removing {}".format(tarball))
    arch.app.cmd.safe_unlink(tarball)
Example #10
def rm_run(arch, root, flowcell=None):
    """Remove a flowcell folder from the root folder
    """    
    path = os.path.join(root, flowcell)
    if not query_yes_no("Going to remove flowcell folder {}. This action can not be undone. Are you sure you want to continue?".format(path), 
                        force=arch.pargs.force):
        return
    arch.log.info("removing {}".format(path))
    arch.app.cmd.rmtree(path)
Example #11
def _return_extensive_match_result(name_map, barcode_name, force=False):
    """Wrap return value for extensive matching"""
    if query_yes_no(
            "found mapping '{} : {}' (barcode_name:project_sample_name); do you want to use this project_sample_name?"
            .format(barcode_name, name_map["sample_name"]),
            default="no",
            force=force):
        return name_map
    else:
        return None
Example #12
    def run(self):
        if not self._check_pargs(["project"]):
            return
        if self.pargs.post_process:
            self.pargs.post_process = os.path.abspath(self.pargs.post_process)
        basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
        if self.pargs.from_ssheet:
            for fn in find_samples(basedir, pattern="SampleSheet.csv$", **vars(self.pargs)):
                samplesheet_csv_to_yaml(fn)
        flist = find_samples(basedir, **vars(self.pargs))
        # Add filtering on flowcell if necessary
        self._meta.pattern = ".*"
        flist = [x for x in flist if self._filter_fn(x)]
        if self.pargs.merged:
            ##  Setup merged samples and append to flist if new list longer
            flist = setup_merged_samples(flist, **vars(self.pargs))
        if not len(flist) > 0:
            self.log.info("No sample configuration files found")
            return
        if len(flist) > 0 and not query_yes_no(
            "Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force
        ):
            return
        # Make absolutely sure analysis directory is a *subdirectory* of the working directory
        validate_sample_directories(flist, basedir)
        orig_dir = os.path.abspath(os.getcwd())

        for run_info in flist:
            os.chdir(os.path.abspath(os.path.dirname(run_info)))
            setup_sample(run_info, **vars(self.pargs))
            os.chdir(orig_dir)
        if self.pargs.only_setup:
            return
        if self.pargs.only_failed:
            status = {x: self._sample_status(x) for x in flist}
            flist = [x for x in flist if status[x] == "FAIL"]
        ## Here process files again, removing if requested, and running the pipeline
        for run_info in flist:
            self.app.log.info("Running analysis defined by config file {}".format(run_info))
            os.chdir(os.path.abspath(os.path.dirname(run_info)))
            if self.app.cmd.monitor(work_dir=os.path.dirname(run_info)):
                self.app.log.warn("Not running job")
                continue
            if self.pargs.restart:
                self.app.log.info("Removing old analysis files in {}".format(os.path.dirname(run_info)))
                remove_files(run_info, **vars(self.pargs))
            (cl, platform_args) = run_bcbb_command(run_info, **vars(self.pargs))
            self.app.cmd.command(
                cl, **{"platform_args": platform_args, "saveJobId": True, "workingDirectory": os.path.dirname(run_info)}
            )
            os.chdir(orig_dir)
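
validate_sample_directories is described only by the comment above it. A sketch of what such a check might look like, assuming it merely verifies that every sample config file lives below basedir (the real helper may raise a more specific error):

import os

def validate_sample_directories(flist, basedir):
    """Raise if any sample config file lies outside basedir (sketch only)."""
    root = os.path.realpath(basedir)
    for run_info in flist:
        path = os.path.realpath(os.path.dirname(run_info))
        if path != root and not path.startswith(root + os.sep):
            raise ValueError("{} is not a subdirectory of {}".format(path, basedir))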
Example #13
 def clean(self):
     if not self._check_pargs(["project"]):
         return
     self._meta.pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_ext])
     flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn, include_dirs=self._meta.include_dirs)
     if len(flist) == 0:
         self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
         return
     if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
         return
     for f in flist:
         self.app.log.info("removing {}".format(f))
         self.app.cmd.safe_unlink(f)
Example #14
    def _compress(self, label="compress"):
        if self.pargs.input_file:
            flist = [self.pargs.input_file]
        else:
            flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn)

        if len(flist) == 0:
            self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
            return
        if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
            sys.exit()
        for f in flist:
            self.log.info("{}ing {}".format(label, f))
            self.app.cmd.command(
                [self._meta.compress_prog, self._meta.compress_opt, "%s" % f],
                label, ignore_error=True,
                workingDirectory=os.path.dirname(f),
                outputPath=os.path.join(os.path.dirname(f),
                                        "{}-{}-drmaa.log".format(label, os.path.basename(f))))
Example #15
 def best_practice(self):
     if not self._check_pargs(["project", "uppmax_project"]):
         return
     project_path = os.path.normpath(os.path.join("/proj", self.pargs.uppmax_project))
     if not os.path.exists(project_path):
         self.log.warn("No such project {}; skipping".format(self.pargs.uppmax_project))
         return
     if self.pargs.outdir:
         outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
     else:
         outpath = os.path.join(project_path, "INBOX", self.pargs.statusdb_project_name) if self.pargs.statusdb_project_name else os.path.join(project_path, "INBOX", self.pargs.project)
     if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
         return
     if not os.path.exists(outpath):
         self.app.cmd.safe_makedir(outpath)
     kw = vars(self.pargs)
     basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if self.pargs.flowcell:
         flist = [fl for fl in flist if os.path.basename(os.path.dirname(fl)) == self.pargs.flowcell]
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     def filter_fn(f):
         # 'pattern' is assigned below; the closure reads it at call time.
         if not pattern:
             return
         return re.search(pattern, f) is not None
     # Setup pattern
     plist = [".*.yaml$", ".*.metrics$"]
     if not self.pargs.no_bam:
         plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
         plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
     if not self.pargs.no_vcf:
         plist.append(".*.vcf$")
         plist.append(".*.vcf.gz$")
         plist.append(".*.tbi$")
         plist.append(".*.tsv$")
     pattern = "|".join(plist)
     size = 0
     for f in flist:
         path = os.path.dirname(f)
         sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
         targets = [src.replace(basedir, outpath) for src in sources]
         self._transfer_files(sources, targets)
         if self.pargs.size:
             statinfo = [os.stat(src).st_size for src in sources]
             size = size + sum(statinfo)
     if self.pargs.size:
         self.app._output_data['stderr'].write("\n********************************\nEstimated delivery size: {:.1f}G\n********************************".format(size/1e9))
Example #16
 def rm(self):
     if not self._check_pargs(["project",  "analysis_id"]):
         return
     indir = os.path.join(self.app.controller._meta.project_root, self.app.controller._meta.path_id, self.pargs.analysis_id)
     assert os.path.exists(indir), "No such analysis {} for project {}".format(self.pargs.analysis_id, self.pargs.project)
     try:
         flist = walk(indir)
     except IOError as e:
         self.app.log.warn(str(e))
         raise
     if len(flist) > 0 and not query_yes_no("Going to remove all contents ({} files) of analysis {} for project {}... Are you sure you want to continue?".format(len(flist), self.pargs.analysis_id, self.pargs.project), force=self.pargs.force):
         return
     for f in flist:
         self.app.cmd.safe_unlink(f)
     self.app.log.info("removing {}".format(indir))
     self.app.cmd.safe_rmdir(indir)
Example #17
    def clean(self):
        pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_pat])
        def clean_filter(f):
            if not pattern:
                return
            return re.search(pattern, f) is not None

        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), clean_filter, include_dirs=self._meta.include_dirs)
        if len(flist) == 0:
            self.app.log.info("No files matching pattern {} found".format(pattern))
            return
        if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
            return
        for f in flist:
            self.app.log.info("removing {}".format(f))
            self.app.cmd.safe_unlink(f)
Example #18
 def run(self):
     if not self._check_pargs(["project", "post_process", "analysis_type"]):
         return
     ## Gather sample yaml files
     pattern = "-bcbb-config.yaml$"
     flist = []
     if self.pargs.sample:
         if os.path.exists(self.pargs.sample):
             with open(self.pargs.sample) as fh:
                 flist = [x.rstrip() for x in fh.readlines()]
         else:
             pattern = "{}{}".format(self.pargs.sample, pattern)
     def bcbb_yaml_filter(f):
         return re.search(pattern, f) is not None
     if not flist:
         flist = filtered_walk(os.path.join(self.app.controller._meta.project_root, self.pargs.project, "data"), bcbb_yaml_filter)
     if self.pargs.only_failed:
         status = {x: self._sample_status(x) for x in flist}
         flist = [x for x in flist if status[x] == "FAIL"]
     if len(flist) == 0 and self.pargs.sample:
         self.app.log.info("No such sample {}".format(self.pargs.sample))
     if len(flist) > 0 and not query_yes_no("Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     for f in flist:
         with open(f) as fh:
             config = yaml.safe_load(fh)
         details = config["details"][0]
         multiplex = details["multiplex"][0]
         if self.pargs.analysis_type:
             multiplex["analysis"] = self.pargs.analysis_type
             details["analysis"] = self.pargs.analysis_type
         if details["genome_build"] == 'unknown':
             details["genome_build"] = self.pargs.genome_build
         ## Check if files exist: if they don't, then change the suffix
         multiplex["files"].sort()
         if not os.path.exists(multiplex["files"][0]):
             if os.path.splitext(multiplex["files"][0])[1] == ".gz":
                 multiplex["files"] = [x.replace(".gz", "") for x in multiplex["files"]]
             else:
                 multiplex["files"] = ["{}.gz".format(x) for x in multiplex["files"]]
         config_file = f.replace("-bcbb-config.yaml", "-pm-bcbb-analysis-config.yaml")
         self.app.cmd.write(config_file, yaml.dump(config))
         ## Run automated_initial_analysis.py
         cur_dir = os.getcwd()
         new_dir = os.path.abspath(os.path.dirname(f))
         os.chdir(new_dir)
         self.app.cmd.command(['automated_initial_analysis.py', os.path.abspath(self.pargs.post_process), new_dir, config_file])
         os.chdir(cur_dir)
Example #19
 def run_halo(self):
     if self.app.pargs.setup:
         if not self._check_pargs(["project", "baits", "targets", "target_region"]):
             return
     else:
         if not self._check_pargs(["project"]):
             return
     basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
     self.app.log.info("Going to look for samples in {}".format(basedir))
     param_list = run_halo(path=basedir, **vars(self.pargs))
     if self.app.pargs.setup:
         self.app.log.info("Setup configuration files. Rerun command without '--setup' option to run analysis")
         return
     if not len(param_list) > 0:
         self.log.info("No samples found in {}; perhaps you need to add the '--data' option to look in the {} directory".format(self.app.pargs.project, os.path.join(self.app.pargs.project, "data")))
     if len(param_list) > 0 and not query_yes_no("Going to start {} jobs... Are you sure you want to continue?".format(len(param_list)), force=self.pargs.force):
         return
     for param in param_list:
         self.app.cmd.command(param['cl'], **param)
Example #20
    def _compress(self, pattern, label="compress"):
        def compress_filter(f):
            if not pattern:
                return
            return re.search(pattern, f) is not None

        if self.pargs.input_file:
            flist = [self.pargs.input_file]
        else:
            flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), compress_filter)

        if len(flist) == 0:
            self.app.log.info("No files matching pattern {} found".format(pattern))
            return
        if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
            sys.exit()
        for f in flist:
            self.log.info("{}ing {}".format(label, f))
            self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % f], label, ignore_error=True)
Example #21
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project database"
        self.log.debug("Connecting to flowcell database")
        f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
        assert f_con, "Could not get connection to flowcell database"
        self.log.debug("Connecting to x_flowcell database")
        x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
        assert x_con, "Could not get connection to x_flowcell database"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Setup paths and verify parameters
        self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get(
            "production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir=proj_base_dir,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)
        if len(uncompressed) > 0:
            self.log.error(
                "There are uncompressed fastq file for project, kindly check all files are compressed properly before delivery"
            )
            return

        # Extract the list of samples and runs associated with the project and sort them
        samples = self.samples_to_copy(
            pid=p_con.get_entry(self.pargs.project, "project_id"),
            pod=p_con.get_entry(self.pargs.project, "open_date"),
            fc_dict={
                'HiSeq2500': f_con.proj_list,
                'HiSeqX': x_con.proj_list
            },
            proj_base_dir=proj_base_dir,
            destination_root=destination_root,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)

        # If interactively select, build a list of samples to skip
        if self.pargs.interactive:
            to_process = {}
            for sample in samples:
                if query_yes_no("Deliver sample {} ?".format(sample),
                                default="no"):
                    to_process[sample] = samples[sample]
            samples = to_process

        if self.pargs.sample:
            sample = samples.get(self.pargs.sample)
            if not sample:
                self.log.error(
                    "There is no such sample {} for project {}".format(
                        self.pargs.sample, self.pargs.project))
                return
            samples = {self.pargs.sample: sample}

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample
        for sample, flowcells in samples.items():
            for fc, files in flowcells.items():
                self.log.info("Processing sample {} and flowcell {}".format(
                    sample, fc))

                # transfer files
                self.log.debug("Transferring {} fastq files".format(
                    len(files['src'])))
                self._transfer_files(sources=files['src'],
                                     targets=files['dst'])

                passed = True
                if self.pargs.link or self.pargs.dry_run:
                    passed = False
                else:
                    # calculate md5sums on the source side and write it on the destination
                    md5 = []
                    for s, d in zip(files['src'], files['dst']):
                        m = md5sum(s)
                        mfile = "{}.md5".format(d)
                        md5.append([m, mfile, s])
                        self.log.debug("md5sum for source file {}: {}".format(
                            s, m))

                    # write the md5sum to a file at the destination and verify the transfer
                    for m, mfile, srcpath in md5:
                        dstfile = os.path.splitext(mfile)[0]
                        self.log.debug(
                            "Writing md5sum to file {}".format(mfile))
                        self.app.cmd.write(
                            mfile, "{}  {}".format(m,
                                                   os.path.basename(dstfile)),
                            True)
                        self.log.debug(
                            "Verifying md5sum for file {}".format(dstfile))
                        dm = md5sum(dstfile)
                        self.log.debug(
                            "md5sum for destination file {}: {}".format(
                                dstfile, dm))
                        if m != dm:
                            self.log.warn(
                                "md5sum verification FAILED for {}. Source: {}, Target: {}"
                                .format(dstfile, m, dm))
                            self.log.warn(
                                "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                                .format(dstfile))
                            self.app.cmd.safe_unlink(dstfile)
                            self.app.cmd.safe_unlink(mfile)
                            passed = False
                            continue

                        # Modify the permissions to ug+rw
                        for f in [dstfile, mfile]:
                            self.app.cmd.chmod(
                                f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                                | stat.S_IWGRP)

                # touch the flag to trigger uppmax inbox permission fix
                self.app.cmd.safe_touchfile(
                    os.path.join("/sw", "uppmax", "var", "inboxfix",
                                 "schedule", self.pargs.uppmax_project))

                # log the transfer to statusdb if verification passed
                if passed:
                    data = {
                        'raw_data_delivery': {
                            'timestamp': utc_time(),
                            'files': {
                                os.path.splitext(os.path.basename(srcpath))[0]: {
                                    'md5': m,
                                    'path': os.path.splitext(mfile)[0],
                                    'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                    'source_location': srcpath,
                                }
                                for m, mfile, srcpath in md5
                            }
                        }
                    }
                    jsonstr = json.dumps(data)
                    jsonfile = os.path.join(
                        proj_base_dir, sample, fc,
                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                    self.log.debug(
                        "Writing delivery to json file {}".format(jsonfile))
                    self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                    if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                        fc_con = x_con
                    else:
                        fc_con = f_con
                    fc_obj = fc_con.get_entry(fc)
                    self.log.info(
                        "Logging delivery to StatusDB document {}".format(
                            fc_obj.get('_id')))
                    fc_raw_data = fc_obj.get('raw_data_delivery', {})
                    fc_raw_data.update(data['raw_data_delivery'])
                    fc_obj['raw_data_delivery'] = fc_raw_data
                    self._save(fc_con, fc_obj)
                    self.log.debug(jsonstr)
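
The md5sum helper used for transfer verification above is not defined on this page. A minimal sketch with hashlib, assuming it simply returns the hex digest of the file contents (the real helper may differ):

import hashlib

def md5sum(path, chunk_size=2 ** 20):
    """Return the hex md5 digest of a file, reading it in chunks (sketch only)."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()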
Example #22
def main():
    parser = argparse.ArgumentParser(description="A script to help doing the deliveries, now using the Casava directory structure. " \
                                     "The user is asked to provide a project ID, a run name, and an UPPMAX project")

    parser.add_argument('-c',
                        '--casava-path',
                        action="store",
                        dest="caspath",
                        default='/proj/a2010002/nobackup/illumina/',
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l',
                        '--log-path',
                        action="store",
                        dest="logpath",
                        default='/proj/a2010002/private/delivery_logs',
                        help="Specify a path to a log file")
    parser.add_argument('-i',
                        '--interactive',
                        action="store_true",
                        dest="interactive",
                        default=False,
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d',
                        '--dry-run',
                        action="store_true",
                        dest="dry",
                        default=False,
                        help="Dry run: nothing will be done")
    parser.add_argument('-a',
                        '--deliver-all-fcs',
                        action="store_true",
                        dest="deliver_all_fcs",
                        default=False,
                        help="rsync samples from all flow cells. Default is to deliver only from the specified flowcell")
    parser.add_argument('-p',
                        '--nophix',
                        action="store_true",
                        dest="deliver_nophix",
                        default=False,
                        help="Deliver fastq files from the nophix subdirectory. Default is to deliver from the run directory")
    parser.add_argument('-g',
                        '--group',
                        action="store",
                        dest="group",
                        default="uppmax",
                        help="Group membership to set on copied files")
    parser.add_argument('project_name',
                        action='store',
                        help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id',
                        action='store',
                        help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id',
                        action='store',
                        help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()

    if args.project_name not in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath):
            print(f)
        clean_exit(0, None, args.dry)

    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))

    dt = datetime.now()
    time_str = "_".join([
        str(dt.year),
        str(dt.month),
        str(dt.day),
        str(dt.hour),
        str(dt.minute),
        str(dt.second)
    ])
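    # NOTE: joining the raw datetime components yields unpadded values such
    # as "2012_8_24_9_5_3"; dt.strftime("%Y_%m_%d_%H_%M_%S") would give a
    # zero-padded, lexicographically sortable log file name instead.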

    logfilename = os.path.join(os.path.normpath(args.logpath),
                               "{:s}.log".format(time_str))
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout

    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(
        utc_time(), args.project_name))
    logfile.flush()

    proj_base_dir = os.path.join(args.caspath, args.project_name)
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir, sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir),
                                default="no"):
                skip_list.append(sample_dir)

    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' + args.uppmax_id + "/INBOX/" + created_proj_dir_name

    to_copy = get_file_copy_list(proj_base_dir, del_path_top, fcid,
                                 args.deliver_all_fcs, args.deliver_nophix,
                                 skip_list)

    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. " \
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering " \
                        "uncompressed fastq files?", default="yes"):
            break
        clean_exit(1, logfile, args.dry)

    rsync_files(to_copy, logfile, args.group, args.dry)

    clean_exit(0, logfile, args.dry)
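
The clean_exit helper used above is not shown in this listing; a plausible sketch, assuming it closes the log (when one was opened) and exits with the given status:

import sys

def clean_exit(status, logfile, dry_run):
    """Hypothetical helper: close the log file and exit with status.

    logfile may be None (nothing opened yet) or sys.stdout (dry run); in
    both cases there is nothing to close. dry_run is accepted only to
    match the call sites above.
    """
    if logfile is not None and logfile is not sys.stdout:
        logfile.close()
    sys.exit(status)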
Example No. 32
def purge_alignments(path,
                     ftype="sam",
                     keep="last",
                     dry_run=False,
                     force=False,
                     fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.
    
    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug(
        "running purge_alignments in path {} with pattern {} keep rule {}".
        format(path, pattern, keep))

    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if len(flist) > 0 and not query_yes_no(
            "Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?"
            .format(len(flist), ftype, ",".join(
                [os.path.basename(x) for x in flist[0:10]])),
            force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(
                    f, "File removed to save disk space: SAM converted to BAM",
                    dry_run)
        return
    elif ftype == "bam":
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]

            if sid not in samples:
                samples[sid] = {}
            dname = os.path.dirname(f)
            if dname not in samples[sid]:
                samples[sid][dname] = []
            samples[sid][dname].append(f)

        saved_size = 0
        for k in samples:
            for d, files in samples[k].items():
                if not files or len(files) == 1:
                    continue
                files.sort(key=len)
                if keep == "last":
                    LOG.info(
                        "Keeping file {} and removing all files with common prefix: {}"
                        .format(
                            os.path.basename(files[-1]), ", ".join(
                                [os.path.basename(x) for x in files[0:-1]])))
                saved_size += _purge_by_sample(files, dry_run, int(fsize))
        LOG.info("Will save approximately {:.1f}G space".format(saved_size /
                                                                1e9))
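
A minimal invocation, assuming LOG, MINFILESIZE and the helpers (filtered_walk, dry_unlink, dry_write, _purge_by_sample) come from the surrounding module, and that force=True auto-confirms the prompt:

# Report which BAM files would be cleaned up, keeping the most recent
# file per sample prefix, without prompting or touching anything on disk.
purge_alignments("/path/to/project/intermediate", ftype="bam", keep="last",
                 dry_run=True, force=True)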
Example No. 33
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell,
                                           sample_prj=self.pargs.project),
                         key=lambda k:
                         (k.get('project_sample_name', 'NA'),
                          k.get('flowcell', 'NA'), k.get('lane', 'NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnex_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # If running interactively, build the list of samples to deliver
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info(
                    "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                    .format(sname, index, fcid, lane, date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir, samples)
        if len(uncompressed) > 0:
            self.log.warn(
                "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
                .format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(
                sample.get("project_sample_name", "NA"),
                sample.get("flowcell", "NA")))

            # calculate md5sums on the source side; they are written at the destination below
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            # write the md5sum to a file at the destination and verify the transfer
            passed = True
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(
                    mfile, "{}  {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))

                # if dry-run, make sure verification passes
                if self.pargs.dry_run:
                    dm = m
                else:
                    dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))
                if m != dm:
                    self.log.warn(
                        "md5sum verification FAILED for {}. Source: {}, Target: {}"
                        .format(dstfile, m, dm))
                    self.log.warn(
                        "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                        .format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(
                        f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(id))
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            'R{}'.format(read): {
                                'md5': m,
                                'path': os.path.splitext(mfile)[0],
                                'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                'source_location': srcpath
                            }
                            for m, mfile, read, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(
                    os.path.dirname(md5[0][3]),
                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                        sample.get("date"), sample.get("flowcell"),
                        sample.get("project_sample_name"),
                        sample.get("sequence"), sample.get("lane")))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con, sample)
                self.log.debug(jsonstr)
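
The md5sum helper called on both the source and the destination file is assumed to hash in fixed-size blocks so that multi-gigabyte fastq files never have to fit in memory; a sketch:

import hashlib

def md5sum(path, blocksize=2 ** 20):
    """Compute the hex md5 digest of a file, reading it block by block."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for block in iter(lambda: fh.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()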
Example No. 34
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project databse"
        self.log.debug("Connecting to samples database")
        s_con = SampleRunMetricsConnection(**vars(self.pargs))
        assert s_con, "Could not get connection to samples databse"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error("Uppmax project was not specified and could not be fetched from project database")
                return

        # Extract the list of samples and runs associated with the project and sort them
        samples = sorted(s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project), key=lambda k: (k.get('project_sample_name','NA'), k.get('flowcell','NA'), k.get('lane','NA')))

        # Setup paths and verify parameters
        self._meta.production_root = self.app.config.get("production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
        assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project,self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,self.pargs.uppmax_project,self._meta.uppnex_delivery_dir)
        assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
        destination_root = os.path.join(destination_root,self.pargs.project)

        # If running interactively, build the list of samples to deliver
        if self.pargs.interactive:
            to_process = []
            for sample in samples:
                sname = sample.get("project_sample_name")
                index = sample.get("sequence")
                fcid = sample.get("flowcell")
                lane = sample.get("lane")
                date = sample.get("date")
                self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname,
                                                                                                           index,
                                                                                                           fcid,
                                                                                                           lane,
                                                                                                           date))
                if query_yes_no("Deliver sample?", default="no"):
                    to_process.append(sample)
            samples = to_process

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(proj_base_dir,samples)
        if len(uncompressed) > 0:
            self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
            if not query_yes_no("Continue anyway?", default="no"):
                return

        self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples),self.pargs.project,destination_root))
        if not query_yes_no("Continue?"):
            return

        # Get the list of files to transfer and the destination
        self.log.debug("Gathering list of files to copy")
        to_copy = self.get_file_copy_list(proj_base_dir,
                                          destination_root,
                                          samples)

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample run
        for id, files in to_copy.items():
            # get the sample database object
            [sample] = [s for s in samples if s.get('_id') == id]
            self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name","NA"),sample.get("flowcell","NA")))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(len(files)))
            self._transfer_files([f[0] for f in files], [f[1] for f in files])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # calculate md5sums on the source side; they are written at the destination below
                md5 = []
                for f in files:
                    m = md5sum(f[0])
                    mfile = "{}.md5".format(f[1])
                    md5.append([m,mfile,f[2],f[0]])
                    self.log.debug("md5sum for source file {}: {}".format(f[0],m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, read, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile,"{}  {}".format(m,os.path.basename(dstfile)),True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile,dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile,m,dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f,stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw","uppmax","var","inboxfix","schedule",self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                self.log.info("Logging delivery to StatusDB document {}".format(id))
                data = {'raw_data_delivery': {'timestamp': utc_time(),
                                              'files': {'R{}'.format(read):{'md5': m,
                                                                            'path': os.path.splitext(mfile)[0],
                                                                            'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                            'source_location': srcpath} for m, mfile, read, srcpath in md5},
                                              }
                        }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                        "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                       sample.get("flowcell"),
                                                                                       sample.get("project_sample_name"),
                                                                                       sample.get("sequence"),
                                                                                       sample.get("lane")))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile,data=jsonstr,overwrite=True)
                self.log.debug("Saving delivery in StatusDB document {}".format(id))
                sample.update(data)
                self._save(s_con,sample)
                self.log.debug(jsonstr)
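
query_yes_no appears in nearly every example above; a minimal sketch, assuming the force keyword auto-confirms (which is how the --force flags are wired through):

import sys

def query_yes_no(question, default="yes", force=False):
    """Ask a yes/no question on stdin and return the answer as a bool."""
    valid = {"yes": True, "y": True, "no": False, "n": False}
    if force:
        # Non-interactive confirmation, used by the --force flags above.
        return True
    prompt = " [Y/n] " if default == "yes" else " [y/N] "
    while True:
        sys.stdout.write(question + prompt)
        choice = input().strip().lower()
        if not choice and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        sys.stdout.write("Please answer 'yes' or 'no'.\n")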
Example No. 35
def main():
    parser = argparse.ArgumentParser(description="A script to help with deliveries, now using the Casava directory structure. " \
                                     "The user is asked to provide a project ID, a run name, and an UPPMAX project")

    parser.add_argument('-c', '--casava-path', action="store", dest="caspath", default='/proj/a2010002/nobackup/illumina/', 
                        help="Specify a path to a Casava directory manually")
    parser.add_argument('-l', '--log-path', action="store", dest="logpath", default='/proj/a2010002/private/delivery_logs', 
                        help="Specify a path to a log file")
    parser.add_argument('-i', '--interactive', action="store_true", dest="interactive", default=False, 
                        help="Interactively select samples to be delivered")
    parser.add_argument('-d', '--dry-run', action="store_true", dest="dry", default=False, 
                        help="Dry run: nothing will be done")
    parser.add_argument('-a', '--deliver-all-fcs', action="store_true", dest="deliver_all_fcs", default=False, 
                        help="rsync samples from all flow cells. Default is to only deliver from specified flowcell")
    parser.add_argument('-p', '--nophix', action="store_true", dest="deliver_nophix", default=False, 
                        help="Deliver fastq files from nophix subdirectory. Default is to deliver from run directory")
    parser.add_argument('project_name', action='store', help="Project name to deliver, e.g. J.Doe_10_01")
    parser.add_argument('flowcell_id', action='store', help="Flowcell id to deliver, e.g. 120824_BD1915ACXX")
    parser.add_argument('uppmax_id', action='store', help="UPPMAX project id to deliver to, e.g. b2012001")
    args = parser.parse_args()

    if args.project_name not in os.listdir(args.caspath):
        print("Could not find project. Check directory listing:")
        for f in os.listdir(args.caspath): 
            print(f)
        clean_exit(0,None,args.dry)

    fcid = args.flowcell_id
    fcid_comp = fcid.split('_')
    if len(fcid_comp) > 2:
        fcid = fcid_comp[0] + '_' + fcid_comp[-1]
        print("FCID format too long, trying {:s}".format(fcid))

    dt = datetime.now()
    time_str = "_".join([str(dt.year),
                         str(dt.month),
                         str(dt.day),
                         str(dt.hour),
                         str(dt.minute),
                         str(dt.second)])

    logfilename = os.path.join(os.path.normpath(args.logpath),"{:s}.log".format(time_str)) 
    if not args.dry:
        logfile = open(logfilename, "w")
    else:
        logfile = sys.stdout
         
    logfile.write("[{:s}] - Project to move files for:\n{:s}\n".format(utc_time(), args.project_name))
    logfile.flush()

    proj_base_dir = os.path.join(args.caspath, args.project_name)
    skip_list = []
    if args.interactive:
        for sample_dir in os.listdir(proj_base_dir):
            if not os.path.isdir(os.path.join(proj_base_dir,sample_dir)):
                continue
            if not query_yes_no("Deliver sample {:s}?".format(sample_dir), default="no"):
                skip_list.append(sample_dir)
    
    created_proj_dir_name = fixProjName(args.project_name)
    del_path_top = '/proj/' +  args.uppmax_id + "/INBOX/" + created_proj_dir_name 

    to_copy = get_file_copy_list(proj_base_dir,
                                 del_path_top,
                                 fcid,
                                 args.deliver_all_fcs,
                                 args.deliver_nophix,
                                 skip_list)
    
    # Prompt user if any of the files are non-compressed
    for fqfile, _, _ in to_copy:
        if os.path.splitext(fqfile)[1] == ".gz":
            continue
        print("WARNING: The file {:s}, which you are about to deliver, does not seem to be compressed. " \
              "It is recommended that you compress files prior to delivery.".format(fqfile))
        if query_yes_no("Do you wish to continue delivering " \
                        "uncompressed fastq files?", default="yes"):
            break
        clean_exit(1,logfile,args.dry)
            
    rsync_files(to_copy,
                logfile,
                args.dry)
        
    clean_exit(0,logfile,args.dry)
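
rsync_files is not included in this listing, and only the first element of each to_copy tuple (the source fastq path) is visible above. A hypothetical sketch, assuming each entry is a (source, destination directory, destination file name) triple:

import os
import subprocess

def rsync_files(to_copy, logfile, dry_run):
    """Hypothetical sketch: copy each file with rsync -ac, logging the command."""
    for src, dst_dir, dst_name in to_copy:
        cmd = ["rsync", "-ac", src, os.path.join(dst_dir, dst_name)]
        logfile.write(" ".join(cmd) + "\n")
        logfile.flush()
        if not dry_run:
            subprocess.check_call(cmd)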
Example No. 36
def upload_tarball(arch, tarball, remote_host=None, remote_path=None, remote_user=None, **kw):
    """Upload the tarball to the remote destination
    """
    if not remote_path:
        arch.log.error("A remote path must be specified in the config or on the command line")
        return False
     
    source_files = {'tarball': tarball,
                    'tarball_md5': "{}.md5".format(tarball)}
    
    arch.log.debug("Verifying that md5sum file {} exists".format(source_files['tarball_md5']))
    if not os.path.exists(source_files['tarball_md5']):
        arch.log.warn("md5 file {} does not exist".format(source_files['tarball_md5']))
        if not query_yes_no("Calculate md5 file and proceed?", 
                            force=arch.pargs.force):
            return False
        
        # Calculate the md5sum
        arch.app.cmd.md5sum(source_files['tarball'])
    
    remote_location = "{}{}".format("{}@".format(remote_user) if remote_user else "",
                                    "{}:".format(remote_host) if remote_host else "")
    # Transfer the md5 file and tarball
    remote_files = {}
    for label in source_files.keys():
        remote_files[label] = "{}{}".format(remote_location,
                                            os.path.join(remote_path,os.path.basename(source_files[label])))
        arch.log.debug("Transferring {} to {}".format(source_files[label],remote_files[label]))
        arch.app.cmd.transfer_file(source_files[label],remote_files[label])
    
    # Verify the transfer on the remote side using fabric (if necessary)
    use_fabric = remote_host is not None and remote_host != "localhost"
    passed = False 
    arch.log.debug("Verifying integrity of remote file {} after transfer".format(remote_files['tarball']))
    if use_fabric:
        # Verify the md5sum using fabric
        host, path = remote_files['tarball_md5'].split(':')
        result = execute(verify_upload,path,host=host)
        passed = result.get(host,False)
    else:
        passed = arch.app.cmd.verify_md5sum(remote_files['tarball_md5'])
        
    # If the verification was not successful, prompt to delete the corrupt files
    if not passed:
        arch.log.error("md5 sum of remote file {} does not match after transfer".format(remote_files['tarball']))
        if query_yes_no("Remove the corrupted remote file {}?".format(remote_files['tarball']), 
                        force=arch.pargs.force):
            for path in remote_files.values():
                arch.log.info("removing {}".format(path))
                if use_fabric:
                    path = path.split(':')[-1]
                    execute(rm_file,path,host=host)
                else:
                    arch.app.cmd.safe_unlink(path)
        arch.log.error("Upload of {} to remote destination failed".format(source_files['tarball']))
    else:
        arch.log.info("{} uploaded to {} successfully".format(source_files['tarball'],remote_files['tarball']))
    
    if use_fabric:    
        disconnect_all()
        
    return passed
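
verify_upload and rm_file are fabric tasks executed on the remote host; a minimal sketch against the fabric 1.x API implied by execute and disconnect_all above (the exact remote commands are assumptions):

from fabric.api import run, settings, task

@task
def verify_upload(md5file):
    """Check a transferred file against its .md5 companion on the remote host."""
    with settings(warn_only=True):
        return run("md5sum -c --status {}".format(md5file)).succeeded

@task
def rm_file(path):
    """Remove a file on the remote host."""
    with settings(warn_only=True):
        run("rm -f {}".format(path))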
Example No. 37
def _return_extensive_match_result(name_map, barcode_name, force=False):
    """Wrap return value for extensive matching"""
    if query_yes_no("found mapping '{} : {}' (barcode_name:project_sample_name); do you want to use this project_sample_name?".format(barcode_name, name_map["sample_name"]), default="no", force=force):
        return name_map
    else:
        return None
Example No. 38
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.
    
    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))
    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None
    
    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            
            if sid not in samples:
                samples[sid] = {}
            dname = os.path.dirname(f)
            if dname not in samples[sid]:
                samples[sid][dname] = []
            samples[sid][dname].append(f)

        saved_size = 0
        for k in samples:
            for d, files in samples[k].items():
                if not files or len(files) == 1:
                    continue
                files.sort(key=len)
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[-1]), ", ".join([os.path.basename(x) for x in files[0:-1]])))
                saved_size += _purge_by_sample(files, dry_run, int(fsize))
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
Example No. 39
def upload_tarball(arch,
                   tarball,
                   remote_host=None,
                   remote_path=None,
                   remote_user=None,
                   **kw):
    """Upload the tarball to the remote destination
    """
    if not remote_path:
        arch.log.error(
            "A remote path must be specified in the config or on the command line"
        )
        return False

    source_files = {
        'tarball': tarball,
        'tarball_md5': "{}.md5".format(tarball)
    }

    arch.log.debug("Verifying that md5sum file {} exists".format(
        source_files['tarball_md5']))
    if not os.path.exists(source_files['tarball_md5']):
        arch.log.warn("md5 file {} does not exist".format(
            source_files['tarball_md5']))
        if not query_yes_no("Calculate md5 file and proceed?",
                            force=arch.pargs.force):
            return False

        # Calculate the md5sum
        arch.app.cmd.md5sum(source_files['tarball'])

    remote_location = "{}{}".format(
        "{}@".format(remote_user) if remote_user else "",
        "{}:".format(remote_host) if remote_host else "")
    # Transfer the md5 file and tarball
    remote_files = {}
    for label in source_files.keys():
        remote_files[label] = "{}{}".format(
            remote_location,
            os.path.join(remote_path, os.path.basename(source_files[label])))
        arch.log.debug("Transferring {} to {}".format(source_files[label],
                                                      remote_files[label]))
        arch.app.cmd.transfer_file(source_files[label], remote_files[label])

    # Verify the transfer on the remote side using fabric (if necessary)
    use_fabric = remote_host is not None and remote_host != "localhost"
    passed = False
    arch.log.debug(
        "Verifying integrity of remote file {} after transfer".format(
            remote_files['tarball']))
    if use_fabric:
        # Verify the md5sum using fabric
        host, path = remote_files['tarball_md5'].split(':')
        result = execute(verify_upload, path, host=host)
        passed = result.get(host, False)
    else:
        passed = arch.app.cmd.verify_md5sum(remote_files['tarball_md5'])

    # If the verification was not successful, prompt to delete the corrupt files
    if not passed:
        arch.log.error(
            "md5 sum of remote file {} does not match after transfer".format(
                remote_files['tarball']))
        if query_yes_no("Remove the corrupted remote file {}?".format(
                remote_files['tarball']),
                        force=arch.pargs.force):
            for path in remote_files.values():
                arch.log.info("removing {}".format(path))
                if use_fabric:
                    path = path.split(':')[-1]
                    execute(rm_file, path, host=host)
                else:
                    arch.app.cmd.safe_unlink(path)
        arch.log.error("Upload of {} to remote destination failed".format(
            source_files['tarball']))
    else:
        arch.log.info("{} uploaded to {} successfully".format(
            source_files['tarball'], remote_files['tarball']))

    if use_fabric:
        disconnect_all()

    return passed
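
A hypothetical invocation, where arch is the controller object carrying the log, app.cmd and pargs attributes used above, and the host and paths are placeholders:

passed = upload_tarball(arch,
                        "/archive/120824_BD1915ACXX.tar.gz",
                        remote_host="backup.example.com",
                        remote_path="/srv/archive/incoming",
                        remote_user="deliver")
if not passed:
    arch.log.error("upload failed, retry after checking the remote host")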