Example #1
    def upload_qc(self):
        """Upload quality-control objects for a flowcell to the database.

        Validates the ``flowcell`` argument and the configured database
        url, collects qc objects using either the pre-casava or casava
        directory layout (chosen by flowcell date), then saves flowcell-
        and sample-level documents through their respective connections.
        Saves are routed through ``dry`` so dry-run mode is honored.
        """
        if not self._check_pargs(['flowcell']):
            return
        # Command-line url takes precedence over the configured one.
        url = self.pargs.url if self.pargs.url else self.app.config.get(
            "db", "url")
        if not url:
            # warning() is the non-deprecated spelling of warn().
            self.app.log.warning(
                "Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warning(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting"
                .format(self.pargs.flowcell))
            return

        # Flowcells dated before 2012-08-15 use the pre-casava layout;
        # fc_parts also yields fc_name, which is not needed here.
        (fc_date, _fc_name) = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info(
                "Assuming pre-casava based file structure for {}".format(
                    fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(
                fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if not qc_objects:
            self.log.info("No out-of-date qc objects for {}".format(
                fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(
                len(qc_objects)))

        # One connection per document type; pargs carries shared options.
        s_con = SampleRunMetricsConnection(dbname=self.app.config.get(
            "db", "samples"),
                                           **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get(
            "db", "flowcells"),
                                              **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get(
            "db", "projects"),
                                         **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                # Attach the project-side sample name when a matching
                # project sample can be found.
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None),
                    self.pargs.extensive_matching)
                if project_sample:
                    obj["project_sample_name"] = project_sample['sample_name']
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Example #2
    def upload_qc(self):
        """Upload quality-control objects for a flowcell to the database.

        Validates the ``flowcell`` argument and the configured database url,
        collects qc objects using either the pre-casava or casava directory
        layout (chosen by flowcell date), then saves flowcell- and
        sample-level documents through their respective connections.
        Saves are routed through ``dry`` so dry-run mode is honored.
        """
        if not self._check_pargs(["flowcell"]):
            return
        # Command-line url takes precedence over the configured one.
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            # warning() is the non-deprecated spelling of warn().
            self.app.log.warning("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warning(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        # Flowcells dated before 2012-08-15 use the pre-casava layout;
        # fc_parts also yields fc_name, which is not needed here.
        (fc_date, _fc_name) = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if not qc_objects:
            self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

        # One connection per document type; pargs carries shared options.
        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                # Attach the project-side sample name when a matching
                # project sample can be found.
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching
                )
                if project_sample:
                    obj["project_sample_name"] = project_sample["sample_name"]
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Example #3
 def upload_analysis(self):
     """Collect per-sample analysis metrics and save them to the analysis db.

     Scans project folders under ``self._meta.root_path`` for sample
     directories (named ``P<digits>_<digits>``) containing results for the
     requested flowcell (``TOTAL`` when none was given) that were modified
     within ``mtime`` days, parses each bcbb configuration and its metric
     files, and stores one AnalysisDocument per project.  Saves go through
     ``dry`` so dry-run mode is honored.
     """
     kw = vars(self.pargs)
     if not kw.get("flowcell"):
         kw["flowcell"] = "TOTAL"

     # Get a connection to the analysis database
     acon = AnalysisConnection(**kw)

     # Traverse the folder hierarchy and determine paths to process:
     # project dir name -> list of recently-modified sample/flowcell dirs.
     to_process = {}
     for pdir in os.listdir(self._meta.root_path):
         pdir = os.path.join(self._meta.root_path, pdir)
         if not os.path.isdir(pdir):
             continue
         plist = []
         for sdir in [d for d in os.listdir(pdir) if re.match(r'^P[0-9]{3,}_[0-9]+', d)]:
             fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
             if not os.path.exists(fdir) or not modified_within_days(fdir, self.pargs.mtime):
                 continue
             plist.append(fdir)
         if plist:
             to_process[os.path.basename(pdir)] = plist

     # Collect the data from each folder
     for project_name, sdirs in to_process.items():
         self.log.info("Processing {}".format(project_name))
         samples = {}
         for sdir in sdirs:
             config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
             if not config:
                 self.log.error("Could not find sample configuration file in {}. Skipping sample.".format(sdir))
                 continue
             if len(config) > 1:
                 # warning() is the non-deprecated spelling of warn().
                 self.log.warning("Multiple sample configuration files found in {}. Will only use {}.".format(sdir, os.path.basename(config[0])))

             # Parse the config file and get the flowcell, lane and index
             # sequence that may be needed to parse the metric files.
             with open(config[0]) as fh:
                 # safe_load: the config is plain data; yaml.load without a
                 # Loader can execute arbitrary tags and is deprecated.
                 info = yaml.safe_load(fh)
             fcdate = info.get("fc_date")
             fcname = info.get("fc_name")
             sinfos = []
             for laneinfo in info.get("details", []):
                 # A lane without a "multiplex" section is treated as its
                 # own single sample entry.
                 for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                     # Copy before merging: updating laneinfo in place would
                     # leak one sample's keys into the next multiplex entry
                     # of the same lane.
                     linfo = dict(laneinfo)
                     linfo.update(sampleinfo)
                     name = linfo.get("name", linfo.get("description", "unknown"))
                     m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                     if m:
                         name = m.group(1)
                     sample_kw = {'flowcell': linfo.get("flowcell_id") if not fcname else fcname,
                                  'date': fcdate,
                                  'lane': linfo.get("lane"),
                                  'barcode_name': name,
                                  'sample_prj': linfo.get("sample_prj", project_name),
                                  'barcode_id': linfo.get("barcode_id", "1"),
                                  'sequence': linfo.get("sequence", "NoIndex")}
                     sinfos.append(sample_kw)
             if not sinfos:
                 # Guard: an empty "details" section would otherwise raise
                 # IndexError on sinfos[0] below.
                 self.log.error("No sample information parsed from {}. Skipping sample.".format(config[0]))
                 continue

             # Create a parser object and collect the metrics
             parser = SampleRunMetricsParser(sdir)
             sinfo = sinfos[0]
             name = sinfo.get("barcode_name", "unknown")
             samples[name] = {}
             samples[name]["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sinfo)
             samples[name]["software_versions"] = parser.parse_software_versions(**sinfo)
             samples[name]["project_summary"] = parser.parse_project_summary(**sinfo)
             samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(**sinfo)
             # Per-lane metrics: the last truthy result for each metric wins.
             for sinfo in sinfos:
                 picard = parser.read_picard_metrics(**sinfo)
                 if picard:
                     samples[name]["picard_metrics"] = picard
                 fq_scr = parser.parse_fastq_screen(**sinfo)
                 if fq_scr:
                     samples[name]["fastq_scr"] = fq_scr
                 fastqc = parser.read_fastqc_metrics(**sinfo)
                 if fastqc.get("stats"):
                     samples[name]["fastqc"] = fastqc
                 gteval = parser.parse_eval_metrics(**sinfo)
                 if gteval:
                     samples[name]["gatk_variant_eval"] = gteval

         # Store the collected metrics in an analysis document
         obj = AnalysisDocument(**{'project_name': project_name,
                                   'name': project_name,
                                   'samples': samples})
         dry("Saving object {}".format(repr(obj)), acon.save(obj))
Example #4
    def upload_analysis(self):
        """Collect per-sample analysis metrics and save them to the analysis db.

        Scans project folders under ``self._meta.root_path`` for sample
        directories (named ``P<digits>_<digits>``) containing results for
        the requested flowcell (``TOTAL`` when none was given) that were
        modified within ``mtime`` days, parses each bcbb configuration and
        its metric files, and stores one AnalysisDocument per project.
        Saves go through ``dry`` so dry-run mode is honored.
        """
        kw = vars(self.pargs)
        if not kw.get("flowcell"):
            kw["flowcell"] = "TOTAL"

        # Get a connection to the analysis database
        acon = AnalysisConnection(**kw)

        # Traverse the folder hierarchy and determine paths to process:
        # project dir name -> list of recently-modified sample/flowcell dirs.
        to_process = {}
        for pdir in os.listdir(self._meta.root_path):
            pdir = os.path.join(self._meta.root_path, pdir)
            if not os.path.isdir(pdir):
                continue
            plist = []
            for sdir in [
                    d for d in os.listdir(pdir)
                    if re.match(r'^P[0-9]{3,}_[0-9]+', d)
            ]:
                fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
                if not os.path.exists(fdir) or not modified_within_days(
                        fdir, self.pargs.mtime):
                    continue
                plist.append(fdir)
            if plist:
                to_process[os.path.basename(pdir)] = plist

        # Collect the data from each folder
        for project_name, sdirs in to_process.items():
            self.log.info("Processing {}".format(project_name))
            samples = {}
            for sdir in sdirs:
                config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
                if not config:
                    self.log.error(
                        "Could not find sample configuration file in {}. Skipping sample."
                        .format(sdir))
                    continue
                if len(config) > 1:
                    # warning() is the non-deprecated spelling of warn().
                    self.log.warning(
                        "Multiple sample configuration files found in {}. Will only use {}."
                        .format(sdir, os.path.basename(config[0])))

                # Parse the config file and get the flowcell, lane and index
                # sequence that may be needed to parse the metric files.
                with open(config[0]) as fh:
                    # safe_load: the config is plain data; yaml.load without
                    # a Loader can execute arbitrary tags and is deprecated.
                    info = yaml.safe_load(fh)
                fcdate = info.get("fc_date")
                fcname = info.get("fc_name")
                sinfos = []
                for laneinfo in info.get("details", []):
                    # A lane without a "multiplex" section is treated as its
                    # own single sample entry.
                    for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                        # Copy before merging: updating laneinfo in place
                        # would leak one sample's keys into the next
                        # multiplex entry of the same lane.
                        linfo = dict(laneinfo)
                        linfo.update(sampleinfo)
                        name = linfo.get("name",
                                         linfo.get("description", "unknown"))
                        m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                        if m:
                            name = m.group(1)
                        sample_kw = {
                            'flowcell':
                            linfo.get("flowcell_id") if not fcname else fcname,
                            'date':
                            fcdate,
                            'lane':
                            linfo.get("lane"),
                            'barcode_name':
                            name,
                            'sample_prj':
                            linfo.get("sample_prj", project_name),
                            'barcode_id':
                            linfo.get("barcode_id", "1"),
                            'sequence':
                            linfo.get("sequence", "NoIndex")
                        }
                        sinfos.append(sample_kw)
                if not sinfos:
                    # Guard: an empty "details" section would otherwise
                    # raise IndexError on sinfos[0] below.
                    self.log.error(
                        "No sample information parsed from {}. Skipping sample."
                        .format(config[0]))
                    continue

                # Create a parser object and collect the metrics
                parser = SampleRunMetricsParser(sdir)
                sinfo = sinfos[0]
                name = sinfo.get("barcode_name", "unknown")
                samples[name] = {}
                samples[name][
                    "bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(
                        **sinfo)
                samples[name][
                    "software_versions"] = parser.parse_software_versions(
                        **sinfo)
                samples[name][
                    "project_summary"] = parser.parse_project_summary(**sinfo)
                samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(
                    **sinfo)
                # Per-lane metrics: last truthy result for each metric wins.
                for sinfo in sinfos:
                    picard = parser.read_picard_metrics(**sinfo)
                    if picard:
                        samples[name]["picard_metrics"] = picard
                    fq_scr = parser.parse_fastq_screen(**sinfo)
                    if fq_scr:
                        samples[name]["fastq_scr"] = fq_scr
                    fastqc = parser.read_fastqc_metrics(**sinfo)
                    if fastqc.get("stats"):
                        samples[name]["fastqc"] = fastqc
                    gteval = parser.parse_eval_metrics(**sinfo)
                    if gteval:
                        samples[name]["gatk_variant_eval"] = gteval

            # Store the collected metrics in an analysis document
            obj = AnalysisDocument(
                **{
                    'project_name': project_name,
                    'name': project_name,
                    'samples': samples
                })
            dry("Saving object {}".format(repr(obj)), acon.save(obj))