def upload_qc(self):
    """Collect QC metric objects for a bcbio flowcell directory and save
    them to the samples/flowcells/projects databases.

    NOTE(review): this definition is duplicated later in the file; when both
    live in the same scope the later definition shadows this one — confirm
    which copy is intended to survive.
    """
    # Abort unless the mandatory 'flowcell' argument was supplied.
    if not self._check_pargs(['flowcell']):
        return
    # An explicitly supplied --url takes precedence over the configured one.
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell))
        return
    # NOTE(review): runinfo_csv and runinfo_yaml are computed but never used
    # in this method — dead locals, or leftovers from removed code.
    runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell), "{}.csv".format(fc_id(self.pargs.flowcell)))
    runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
    # fc_parts() presumably splits the directory name into (date, name);
    # fc_name is unused here.
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    # Flowcells dated before 120815 (YYMMDD) use the pre-casava layout.
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_casava_qc()
    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
        return
    else:
        self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))
    # One connection per target database; connection kwargs come from the
    # parsed command-line arguments.
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        # Flowcell-level documents go to the flowcell database.
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        # Sample-level documents are annotated with the matching project
        # sample name (when one is found), then saved to the sample database.
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching)
            if project_sample:
                obj["project_sample_name"] = project_sample['sample_name']
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def upload_qc(self):
    """Collect QC metric objects for a bcbio flowcell directory and save
    them to the samples/flowcells/projects databases.

    Validates the mandatory ``flowcell`` argument and the database url,
    chooses the pre-casava or casava collection strategy based on the
    flowcell date, and saves each resulting document through the matching
    database connection. Sample documents are annotated with the project
    sample name when a matching project sample exists.
    """
    if not self._check_pargs(["flowcell"]):
        return
    # An explicitly supplied --url takes precedence over the configured one.
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell))
        return
    # NOTE(review): removed dead locals runinfo_csv/runinfo_yaml — they were
    # computed from pure path helpers and never used in this method.
    flowcell_id = fc_id(self.pargs.flowcell)
    # fc_parts() splits the directory name into (date, name); only the date
    # is needed here.
    (fc_date, _fc_name) = fc_parts(self.pargs.flowcell)
    # Flowcells dated before 120815 (YYMMDD) use the pre-casava layout.
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(flowcell_id))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(flowcell_id))
        qc_objects = self._collect_casava_qc()
    if not qc_objects:
        self.log.info("No out-of-date qc objects for {}".format(flowcell_id))
        return
    self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))
    # One connection per target database; connection kwargs come from the
    # parsed command-line arguments.
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        # Flowcell-level documents go to the flowcell database.
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        # Sample-level documents are annotated with the matching project
        # sample name (when one is found), then saved to the sample database.
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching)
            if project_sample:
                obj["project_sample_name"] = project_sample["sample_name"]
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def upload_analysis(self):
    """Scan the project root for recently modified sample/flowcell analysis
    folders, parse their bcbb config and metric files, and save one
    AnalysisDocument per project to the analysis database.

    NOTE(review): this definition is duplicated later in the file; when both
    live in the same scope the later definition shadows this one — confirm
    which copy is intended to survive.
    """
    kw = vars(self.pargs)
    # Without an explicit flowcell, fall back to the aggregate "TOTAL" folder.
    if not kw.get("flowcell"):
        kw["flowcell"] = "TOTAL"
    # Get a connection to the analysis database
    acon = AnalysisConnection(**kw)
    # Traverse the folder hierarchy and determine paths to process
    to_process = {}
    for pdir in os.listdir(self._meta.root_path):
        pdir = os.path.join(self._meta.root_path, pdir)
        if not os.path.isdir(pdir):
            continue
        plist = []
        # Sample dirs look like P<digits>_<digits> (e.g. P123_456).
        for sdir in [d for d in os.listdir(pdir) if re.match(r'^P[0-9]{3,}_[0-9]+', d)]:
            fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
            # Only process folders modified within the --mtime window.
            if not os.path.exists(fdir) or not modified_within_days(fdir, self.pargs.mtime):
                continue
            plist.append(fdir)
        if plist:
            to_process[os.path.basename(pdir)] = plist
    # Collect the data from each folder
    for project_name, sdirs in to_process.items():
        self.log.info("Processing {}".format(project_name))
        samples = {}
        for sdir in sdirs:
            config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
            if not config:
                self.log.error("Could not find sample configuration file in {}. Skipping sample.".format(sdir))
                continue
            if len(config) > 1:
                self.log.warn("Multiple sample configuration files found in {}. Will only use {}.".format(sdir, os.path.basename(config[0])))
            # Parse the config file and get the flowcell, lane and index sequence that may be needed to parse
            info = {}
            sinfos = []
            with open(config[0]) as fh:
                # NOTE(review): yaml.load without an explicit Loader is
                # deprecated (unsafe on untrusted input and rejected by
                # modern PyYAML) — consider yaml.safe_load.
                info = yaml.load(fh)
            fcdate = info.get("fc_date")
            fcname = info.get("fc_name")
            for laneinfo in info.get("details", []):
                # A lane without a "multiplex" list is treated as its own
                # single sample entry.
                for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                    # NOTE(review): linfo aliases laneinfo, so update()
                    # mutates the lane dict in place — keys from one
                    # multiplex entry leak into subsequent entries. Confirm
                    # whether a copy was intended.
                    linfo = laneinfo
                    linfo.update(sampleinfo)
                    name = linfo.get("name", linfo.get("description", "unknown"))
                    # Normalize the sample name to the P<digits>_<digits> id
                    # when present.
                    m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                    if m:
                        name = m.group(1)
                    sample_kw = {'flowcell': linfo.get("flowcell_id") if not fcname else fcname,
                                 'date': fcdate,
                                 'lane': linfo.get("lane"),
                                 'barcode_name': name,
                                 'sample_prj': linfo.get("sample_prj", project_name),
                                 'barcode_id': linfo.get("barcode_id", "1"),
                                 'sequence': linfo.get("sequence", "NoIndex")}
                    sinfos.append(sample_kw)
            # Create a parser object and collect the metrics
            parser = SampleRunMetricsParser(sdir)
            # NOTE(review): sinfos[0] raises IndexError when the config has
            # no "details" entries — no guard here.
            sinfo = sinfos[0]
            name = sinfo.get("barcode_name", "unknown")
            samples[name] = {}
            samples[name]["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sinfo)
            samples[name]["software_versions"] = parser.parse_software_versions(**sinfo)
            samples[name]["project_summary"] = parser.parse_project_summary(**sinfo)
            samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(**sinfo)
            # Per-sinfo metrics: later entries overwrite earlier ones under
            # the same name key ("last one wins").
            for sinfo in sinfos:
                picard = parser.read_picard_metrics(**sinfo)
                if picard:
                    samples[name]["picard_metrics"] = picard
                fq_scr = parser.parse_fastq_screen(**sinfo)
                if fq_scr:
                    samples[name]["fastq_scr"] = fq_scr
                fastqc = parser.read_fastqc_metrics(**sinfo)
                if fastqc.get("stats"):
                    samples[name]["fastqc"] = fastqc
                gteval = parser.parse_eval_metrics(**sinfo)
                if gteval:
                    samples[name]["gatk_variant_eval"] = gteval
        # Store the collected metrics in an analysis document
        obj = AnalysisDocument(**{'project_name': project_name, 'name': project_name, 'samples': samples})
        dry("Saving object {}".format(repr(obj)), acon.save(obj))
def upload_analysis(self):
    """Scan the project root for recently modified sample/flowcell analysis
    folders, parse their bcbb config and metric files, and save one
    AnalysisDocument per project to the analysis database.

    Fixes in this revision:
    - copy the lane dict before merging multiplex entries into it, so keys
      from one multiplex entry no longer leak into subsequent entries
      (previously ``linfo = laneinfo`` aliased and mutated the shared dict);
    - skip a sample folder (with a warning) when its config yields no
      sample entries, instead of crashing on ``sinfos[0]``;
    - parse the config with ``yaml.safe_load`` — ``yaml.load`` without a
      Loader is deprecated and rejected by modern PyYAML; bcbb config files
      are plain mappings, so safe loading suffices.
    """
    kw = vars(self.pargs)
    # Without an explicit flowcell, fall back to the aggregate "TOTAL" folder.
    if not kw.get("flowcell"):
        kw["flowcell"] = "TOTAL"
    flowcell = kw.get("flowcell")
    # Get a connection to the analysis database
    acon = AnalysisConnection(**kw)
    # Traverse the folder hierarchy and determine paths to process
    to_process = {}
    for pdir in os.listdir(self._meta.root_path):
        pdir = os.path.join(self._meta.root_path, pdir)
        if not os.path.isdir(pdir):
            continue
        plist = []
        # Sample dirs look like P<digits>_<digits> (e.g. P123_456).
        for sdir in [d for d in os.listdir(pdir) if re.match(r'^P[0-9]{3,}_[0-9]+', d)]:
            fdir = os.path.join(pdir, sdir, flowcell)
            # Only process folders modified within the --mtime window.
            if not os.path.exists(fdir) or not modified_within_days(fdir, self.pargs.mtime):
                continue
            plist.append(fdir)
        if plist:
            to_process[os.path.basename(pdir)] = plist
    # Collect the data from each folder
    for project_name, sdirs in to_process.items():
        self.log.info("Processing {}".format(project_name))
        samples = {}
        for sdir in sdirs:
            config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
            if not config:
                self.log.error("Could not find sample configuration file in {}. Skipping sample.".format(sdir))
                continue
            if len(config) > 1:
                self.log.warn("Multiple sample configuration files found in {}. Will only use {}.".format(sdir, os.path.basename(config[0])))
            # Parse the config file and get the flowcell, lane and index
            # sequence that may be needed to parse the metric files.
            sinfos = []
            with open(config[0]) as fh:
                # safe_load: configs are plain mappings and the Loader-less
                # yaml.load is deprecated/removed in modern PyYAML.
                info = yaml.safe_load(fh)
            fcdate = info.get("fc_date")
            fcname = info.get("fc_name")
            for laneinfo in info.get("details", []):
                # A lane without a "multiplex" list is treated as its own
                # single sample entry.
                for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                    # Merge into a COPY so the shared lane dict is not
                    # mutated across multiplex entries.
                    linfo = dict(laneinfo)
                    linfo.update(sampleinfo)
                    name = linfo.get("name", linfo.get("description", "unknown"))
                    # Normalize the sample name to the P<digits>_<digits> id
                    # when present.
                    m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                    if m:
                        name = m.group(1)
                    sinfos.append({'flowcell': linfo.get("flowcell_id") if not fcname else fcname,
                                   'date': fcdate,
                                   'lane': linfo.get("lane"),
                                   'barcode_name': name,
                                   'sample_prj': linfo.get("sample_prj", project_name),
                                   'barcode_id': linfo.get("barcode_id", "1"),
                                   'sequence': linfo.get("sequence", "NoIndex")})
            if not sinfos:
                # Guard: previously sinfos[0] raised IndexError on a config
                # with no "details" entries.
                self.log.warn("No sample entries found in {}. Skipping sample.".format(config[0]))
                continue
            # Create a parser object and collect the metrics
            parser = SampleRunMetricsParser(sdir)
            sinfo = sinfos[0]
            name = sinfo.get("barcode_name", "unknown")
            samples[name] = {}
            samples[name]["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sinfo)
            samples[name]["software_versions"] = parser.parse_software_versions(**sinfo)
            samples[name]["project_summary"] = parser.parse_project_summary(**sinfo)
            samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(**sinfo)
            # Per-sinfo metrics: later entries overwrite earlier ones under
            # the same name key ("last one wins"), as before.
            for sinfo in sinfos:
                picard = parser.read_picard_metrics(**sinfo)
                if picard:
                    samples[name]["picard_metrics"] = picard
                fq_scr = parser.parse_fastq_screen(**sinfo)
                if fq_scr:
                    samples[name]["fastq_scr"] = fq_scr
                fastqc = parser.read_fastqc_metrics(**sinfo)
                if fastqc.get("stats"):
                    samples[name]["fastqc"] = fastqc
                gteval = parser.parse_eval_metrics(**sinfo)
                if gteval:
                    samples[name]["gatk_variant_eval"] = gteval
        # Store the collected metrics in an analysis document
        obj = AnalysisDocument(**{'project_name': project_name, 'name': project_name, 'samples': samples})
        dry("Saving object {}".format(repr(obj)), acon.save(obj))