def _collect_casava_qc(self):
    """Gather flowcell- and sample-level QC metrics objects for a CASAVA run.

    The sample sheet ``<flowcell id>.csv`` is read from the flowcell
    directory named on the command line. A ``FlowcellRunMetrics`` object is
    added when the flowcell directory below the analysis root passes the
    ``modified_within_days`` check against ``self.pargs.mtime``. Every sample
    sheet row that passes the optional project/sample filters, and whose
    sample flowcell directory exists and passes the same check, contributes
    one ``SampleRunMetrics`` object.

    Returns:
        list: the collected metrics objects.

    Raises:
        IOError: if the sample sheet cannot be opened, or if the
            ``<SampleID>-bcbb-config.yaml`` file of a sample is missing.
    """
    collected = []
    samplesheet = os.path.join(os.path.abspath(self.pargs.flowcell),
                               "{}.csv".format(self._fc_id()))
    try:
        with open(samplesheet) as handle:
            rows = list(csv.reader(handle))
    except IOError as e:
        self.app.log.warn(str(e))
        raise e

    analysis_root = os.path.abspath(self.pargs.analysis)
    fcdir = os.path.join(analysis_root, self.pargs.flowcell)
    fc_date, fc_name = self._fc_parts()

    # Flowcell-level metrics: only when the modification time check passes
    if modified_within_days(fcdir, self.pargs.mtime):
        flowcell_metrics = FlowcellRunMetrics(path=fcdir, fc_date=fc_date, fc_name=fc_name)
        flowcell_metrics.parse_illumina_metrics(fullRTA=False)
        flowcell_metrics.parse_bc_metrics()
        flowcell_metrics.parse_demultiplex_stats_htm()
        flowcell_metrics.parse_samplesheet_csv()
        collected.append(flowcell_metrics)

    # Sample-level metrics: the first row is the header, the rest are samples
    for row in rows[1:]:
        entry = dict(zip(rows[0], row))
        if self.app.pargs.project and self.app.pargs.project != entry['SampleProject']:
            continue
        if self.app.pargs.sample and self.app.pargs.sample != entry['SampleID']:
            continue

        project_dir = entry['SampleProject'].replace("__", ".")
        sampledir = os.path.join(analysis_root, project_dir, entry['SampleID'])
        if not os.path.exists(sampledir):
            self.app.log.warn("No such sample directory: {}".format(sampledir))
            continue

        sample_fcdir = os.path.join(sampledir, self._fc_fullname())
        if not os.path.exists(sample_fcdir):
            self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
            continue
        if not modified_within_days(sample_fcdir, self.pargs.mtime):
            continue

        config_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(entry['SampleID']))
        if not os.path.exists(config_file):
            message = "No such yaml file for sample: {}".format(config_file)
            self.app.log.warn(message)
            raise IOError(2, message, config_file)
        with open(config_file) as handle:
            # NOTE(review): yaml.load without an explicit Loader; confirm the
            # config files are trusted or switch to a safe loader.
            config = yaml.load(handle)

        multiplex = config['details'][0].get("multiplex", None)
        if not multiplex:
            self.app.log.warn("No multiplex information for sample {}".format(entry['SampleID']))
            continue

        sample_metrics = SampleRunMetrics(
            path=sample_fcdir,
            flowcell=fc_name,
            date=fc_date,
            lane=entry['Lane'],
            barcode_name=entry['SampleID'],
            sample_prj=entry['SampleProject'].replace("__", "."),
            barcode_id=multiplex[0]['barcode_id'],
            sequence=multiplex[0]['sequence'])
        sample_metrics.read_picard_metrics()
        sample_metrics.parse_fastq_screen()
        sample_metrics.parse_bc_metrics()
        sample_metrics.read_fastqc_metrics()
        collected.append(sample_metrics)
    return collected
def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None, setup=None):
    """Parse samplesheet information and populate sample run metrics objects.

    Args:
        runinfo: when ``as_yaml`` is true, an iterable of per-lane dicts
            (each may carry a ``multiplex`` list of sample dicts); otherwise
            an iterable of dicts with ``SampleProject``, ``SampleID`` and
            ``Lane`` keys, one per sample sheet row.
        qc_objects: list that the created sample documents are appended to.
        fc_date: flowcell date stored on every sample document.
        fc_name: flowcell name stored on every sample document.
        fcdir: flowcell directory handed to the metrics parser in YAML mode.
        as_yaml: select the YAML (per-lane) layout of ``runinfo``.
        demultiplex_stats: forwarded to ``get_bc_count`` in sample sheet mode.
        setup: forwarded to ``get_bc_count`` as ``run_setup``.

    Returns:
        The ``qc_objects`` list that was passed in.

    Raises:
        IOError: in sample sheet mode, if a sample's
            ``<SampleID>-bcbb-config.yaml`` file does not exist.
    """
    if as_yaml:
        lane_keys = ('analysis', 'description', 'flowcell_id', 'lane')
        for info in runinfo:
            multiplex = info.get("multiplex", None)
            if not multiplex:
                # A lane without samples used to fall through to
                # info["multiplex"] and crash; warn and skip it instead.
                self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                continue
            for sample in multiplex:
                # Copy the lane-level fields onto the sample entry
                sample.update({k: info.get(k, None) for k in lane_keys})
                sample_kw = dict(flowcell=fc_name, date=fc_date, lane=sample['lane'],
                                 barcode_name=sample['name'],
                                 sample_prj=sample.get('sample_prj', None),
                                 barcode_id=sample['barcode_id'],
                                 sequence=sample.get('sequence', "NoIndex"))
                parser = SampleRunMetricsParser(fcdir)
                obj = SampleRunMetricsDocument(**sample_kw)
                obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                obj["bc_count"] = parser.get_bc_count(run_setup=setup, **sample_kw)
                obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
                qc_objects.append(obj)
        return qc_objects

    for d in runinfo:
        LOG.debug("Getting information for sample defined by {}".format(d.values()))
        if self.app.pargs.project_name and self.app.pargs.project_name != d['SampleProject']:
            continue
        if self.app.pargs.sample and self.app.pargs.sample != d['SampleID']:
            continue
        sampledir = os.path.join(os.path.abspath(self._meta.production_root_path),
                                 d['SampleProject'].replace("__", "."), d['SampleID'])
        if not os.path.exists(sampledir):
            self.app.log.warn("No such sample directory: {}".format(sampledir))
            continue
        sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
        if not os.path.exists(sample_fcdir):
            self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
            continue
        if not modified_within_days(sample_fcdir, self.pargs.mtime):
            continue
        runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d['SampleID']))
        if not os.path.exists(runinfo_yaml_file):
            self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
            raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
        with open(runinfo_yaml_file) as fh:
            # NOTE(review): yaml.load without an explicit Loader; confirm the
            # config files are trusted or switch to a safe loader.
            runinfo_yaml = yaml.load(fh)
        if not runinfo_yaml['details'][0].get("multiplex", None):
            # Non-multiplexed sample: fall back to a placeholder barcode
            self.app.log.warn("No multiplex information for sample {}".format(d['SampleID']))
            runinfo_yaml['details'][0]['multiplex'] = [{'barcode_id': 0, 'sequence': 'NoIndex'}]
        barcode = runinfo_yaml['details'][0]['multiplex'][0]
        sample_kw = dict(flowcell=fc_name, date=fc_date, lane=d['Lane'],
                         barcode_name=d['SampleID'],
                         sample_prj=d['SampleProject'].replace("__", "."),
                         barcode_id=barcode['barcode_id'],
                         sequence=barcode['sequence'])
        parser = SampleRunMetricsParser(sample_fcdir)
        obj = SampleRunMetricsDocument(**sample_kw)
        obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
        obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
        obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, run_setup=setup, **sample_kw)
        obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
        obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
        qc_objects.append(obj)
    return qc_objects
def _collect_casava_qc(self):
    """Collect the flowcell document and sample documents for a CASAVA run.

    Reads ``<flowcell id>.csv`` from the flowcell directory under
    ``self._meta.root_path``, falling back to ``SampleSheet.csv`` when that
    file does not exist. A flowcell document is built only when the flowcell
    directory passes the ``modified_within_days`` check; the sample sheet
    rows are always handed to ``_parse_samplesheet``.

    Returns:
        list: the collected QC documents.

    Raises:
        IOError: if the sample sheet cannot be opened.
    """
    documents = []
    run_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
    runinfo_csv = os.path.join(run_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(run_dir, "SampleSheet.csv")

    try:
        with open(runinfo_csv) as handle:
            runinfo = list(csv.reader(handle))
    except IOError as e:
        self.app.log.warn(str(e))
        raise e

    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    fc_date, fc_name = fc_parts(self.pargs.flowcell)

    # Stays None when the flowcell directory fails the modification time check
    demux_stats = None
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = {"fc_date": fc_date, "fc_name": fc_name}
        parser = FlowcellRunMetricsParser(fcdir)
        fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
        fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
        fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        documents.append(fcobj)

    return self._parse_samplesheet(runinfo, documents, fc_date, fc_name, fcdir,
                                   demultiplex_stats=demux_stats)
def _collect_casava_qc(self):
    """Collect the flowcell document and sample documents for a CASAVA run.

    Reads ``<flowcell id>.csv`` from the flowcell directory under
    ``self._meta.root_path``, falling back to ``SampleSheet.csv`` when that
    file does not exist. A flowcell document is built only when the flowcell
    directory passes the ``modified_within_days`` check; the sample sheet
    rows are always handed to ``_parse_samplesheet``.

    Returns:
        list: the collected QC documents.

    Raises:
        IOError: if the sample sheet cannot be opened.
    """
    qc_objects = []
    run_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
    runinfo_csv = os.path.join(run_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(run_dir, "SampleSheet.csv")
    try:
        with open(runinfo_csv) as fh:
            runinfo = list(csv.reader(fh))
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)

    # The demultiplex stats only exist when the flowcell document is built.
    # Default to None so the sample pass below does not reference an
    # undefined flowcell document when the modification time check fails.
    demux_stats = None
    ## Check modification time
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
        parser = FlowcellRunMetricsParser(fcdir)
        fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
        fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
        fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name, fcdir,
                                         demultiplex_stats=demux_stats)
    return qc_objects
def _collect_casava_qc(self):
    """Collect the flowcell document and sample documents for a CASAVA run.

    The flowcell date, name and position are taken from the parsed RunInfo
    and RunParameters rather than from the directory name. The sample sheet
    is ``<flowcell name>.csv`` in the flowcell directory, with
    ``SampleSheet.csv`` as the fallback. A flowcell document is built only
    when the flowcell directory passes the ``modified_within_days`` check.

    Returns:
        list: the collected QC documents.
    """
    documents = []
    read_setup = None
    demux_stats = None
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)

    # Get the fc_name, fc_date from RunInfo
    parser = FlowcellRunMetricsParser(fcdir)
    runinfo_xml = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_date = runinfo_xml.get('Date', None)
    fc_name = runinfo_xml.get('Flowcell', None)
    fc_pos = runparams.get('FCPosition', '')

    run_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
    runinfo_csv = os.path.join(run_dir, "{}.csv".format(fc_name))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(run_dir, "SampleSheet.csv")
    runinfo = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv)

    # Most of the code expects to have the flowcell position pre-pended to the flowcell id
    positioned_name = "{}{}".format(fc_pos, fc_name)

    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = {"fc_date": fc_date, "fc_name": positioned_name}
        fcobj = FlowcellRunMetricsDocument(**fc_kw)
        fcobj["RunInfo"] = runinfo_xml
        fcobj["RunParameters"] = runparams
        fcobj["DemultiplexConfig"] = parser.parseDemultiplexConfig(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = runinfo
        read_setup = fcobj["RunInfo"].get('Reads', [])
        fcobj["run_setup"] = self._run_setup(read_setup)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        documents.append(fcobj)

    return self._parse_samplesheet(runinfo, documents, fc_date, positioned_name, fcdir,
                                   demultiplex_stats=demux_stats, setup=read_setup)
def _collect_pre_casava_qc(self):
    """Collect the flowcell document and sample documents for a pre-CASAVA run.

    Run information comes from ``<flowcell id>.csv`` (or the fallback
    ``SampleSheet.csv``) under ``self._meta.root_path`` when such a file
    exists, and otherwise from ``run_info.yaml`` in the flowcell directory.
    Nothing is collected when the flowcell directory fails the
    ``modified_within_days`` check.

    Returns:
        list: the collected QC documents (empty when the check fails).

    Raises:
        IOError: if the selected run information file cannot be opened.
    """
    documents = []
    fcdir = os.path.abspath(self.pargs.flowcell)
    run_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
    runinfo_csv = os.path.join(run_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(run_dir, "SampleSheet.csv")
    runinfo_yaml = os.path.join(fcdir, "run_info.yaml")

    # Prefer the sample sheet; use run_info.yaml only when no CSV exists
    as_yaml = not os.path.exists(runinfo_csv)
    try:
        if as_yaml:
            with open(runinfo_yaml) as handle:
                # NOTE(review): yaml.load without an explicit Loader; confirm
                # the file is trusted or switch to a safe loader.
                runinfo = yaml.load(handle)
        else:
            with open(runinfo_csv) as handle:
                runinfo = list(csv.reader(handle))
    except IOError as e:
        self.app.log.warn(str(e))
        raise e

    fc_date, fc_name = fc_parts(self.pargs.flowcell)

    ## Check modification time
    if not modified_within_days(fcdir, self.pargs.mtime):
        return documents

    fc_kw = {"fc_date": fc_date, "fc_name": fc_name}
    parser = FlowcellRunMetricsParser(fcdir)
    fcobj = FlowcellRunMetricsDocument(**fc_kw)
    fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
    fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
    fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
    fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
    fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
    fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
    documents.append(fcobj)

    return self._parse_samplesheet(runinfo, documents, fc_date, fc_name, fcdir, as_yaml=as_yaml)
def _collect_pre_casava_qc(self):
    """Collect the flowcell document and sample documents for a pre-CASAVA run.

    Returns an empty list straight away when the flowcell directory fails the
    ``modified_within_days`` check. Otherwise the flowcell date, name and
    position come from the parsed RunInfo/RunParameters, and the sample sheet
    is ``<flowcell name>.csv`` (fallback ``SampleSheet.csv``). When the parsed
    sample sheet is empty, ``run_info.yaml`` in the flowcell directory is
    loaded instead and handed on in YAML mode.

    Returns:
        list: the collected QC documents.

    Raises:
        IOError: if ``run_info.yaml`` is needed but cannot be opened.
    """
    documents = []
    fcdir = os.path.abspath(self.pargs.flowcell)

    ## Check modification time
    if not modified_within_days(fcdir, self.pargs.mtime):
        return documents

    # Get the fc_name, fc_date from RunInfo
    parser = FlowcellRunMetricsParser(fcdir)
    runinfo_xml = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_date = runinfo_xml.get('Date', None)
    fc_name = runinfo_xml.get('Flowcell', None)
    fc_pos = runparams.get('FCPosition', '')

    run_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
    runinfo_csv = os.path.join(run_dir, "{}.csv".format(fc_name))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(run_dir, "SampleSheet.csv")
    runinfo = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv)

    # An empty sample sheet means the run information must come from YAML
    as_yaml = len(runinfo) == 0
    if as_yaml:
        try:
            with open(os.path.join(fcdir, "run_info.yaml")) as handle:
                # NOTE(review): yaml.load without an explicit Loader; confirm
                # the file is trusted or switch to a safe loader.
                runinfo = yaml.load(handle)
        except IOError as e:
            self.app.log.warn(str(e))
            raise e

    # Most of the code expects to have the flowcell position pre-pended to the flowcell id
    positioned_name = "{}{}".format(fc_pos, fc_name)
    fc_kw = {"fc_date": fc_date, "fc_name": positioned_name}
    fcobj = FlowcellRunMetricsDocument(**fc_kw)
    fcobj["RunInfo"] = runinfo_xml
    fcobj["RunParameters"] = runparams
    fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
    fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
    fcobj["samplesheet_csv"] = runinfo
    fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
    read_setup = fcobj["RunInfo"].get('Reads', [])
    fcobj["run_setup"] = self._run_setup(read_setup)
    documents.append(fcobj)

    return self._parse_samplesheet(runinfo, documents, fc_date, positioned_name, fcdir,
                                   as_yaml=as_yaml, setup=read_setup)
def _collect_pre_casava_qc(self):
    """Collect the flowcell document and sample documents for a pre-CASAVA run.

    Run information comes from ``<flowcell id>.csv`` (or the fallback
    ``SampleSheet.csv``) under ``self._meta.root_path`` when such a file
    exists, and otherwise from ``run_info.yaml`` in the flowcell directory.
    Nothing is collected when the flowcell directory fails the
    ``modified_within_days`` check.

    Returns:
        list: the collected QC documents (empty when the check fails).

    Raises:
        IOError: if the selected run information file cannot be opened.
    """
    collected = []
    flowcell = self.pargs.flowcell
    fcdir = os.path.abspath(flowcell)

    csv_dir = os.path.join(self._meta.root_path, flowcell)
    runinfo_csv = os.path.join(csv_dir, "{}.csv".format(fc_id(flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(csv_dir, "SampleSheet.csv")
    runinfo_yaml = os.path.join(fcdir, "run_info.yaml")

    as_yaml = False
    try:
        if os.path.exists(runinfo_csv):
            with open(runinfo_csv) as stream:
                runinfo = list(csv.reader(stream))
        else:
            # No sample sheet at all: fall back to the YAML run description
            as_yaml = True
            with open(runinfo_yaml) as stream:
                # NOTE(review): yaml.load without an explicit Loader; confirm
                # the file is trusted or switch to a safe loader.
                runinfo = yaml.load(stream)
    except IOError as e:
        self.app.log.warn(str(e))
        raise e

    fc_date, fc_name = fc_parts(flowcell)

    ## Check modification time
    if not modified_within_days(fcdir, self.pargs.mtime):
        return collected

    fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
    parser = FlowcellRunMetricsParser(fcdir)
    flowcell_doc = FlowcellRunMetricsDocument(**fc_kw)
    flowcell_doc["RunInfo"] = parser.parseRunInfo(**fc_kw)
    flowcell_doc["RunParameters"] = parser.parseRunParameters(**fc_kw)
    flowcell_doc["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    flowcell_doc["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
    flowcell_doc["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
    flowcell_doc["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
    flowcell_doc["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
    collected.append(flowcell_doc)

    collected = self._parse_samplesheet(runinfo, collected, fc_date, fc_name, fcdir, as_yaml=as_yaml)
    return collected
def _collect_pre_casava_qc(self):
    """Collect flowcell- and sample-level QC metrics objects for a pre-CASAVA run.

    Lane and sample information is loaded from ``run_info.yaml`` in the
    flowcell directory. When the flowcell directory fails the
    ``modified_within_days`` check nothing is collected. Otherwise a
    ``FlowcellRunMetrics`` object is added, followed by one
    ``SampleRunMetrics`` object per multiplexed sample of every lane.
    Lanes without multiplex information are logged and skipped.

    Returns:
        list: the collected metrics objects (empty when the check fails).

    Raises:
        IOError: if ``run_info.yaml`` cannot be opened.
    """
    qc_objects = []
    runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
    try:
        with open(runinfo_yaml) as fh:
            # NOTE(review): yaml.load without an explicit Loader; confirm the
            # file is trusted or switch to a safe loader.
            runinfo = yaml.load(fh)
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    fcdir = os.path.abspath(self.pargs.flowcell)
    (fc_date, fc_name) = self._fc_parts()

    ## Check modification time
    if not modified_within_days(fcdir, self.pargs.mtime):
        return qc_objects

    fc_kw = dict(path=fcdir, fc_date=fc_date, fc_name=fc_name)
    fcobj = FlowcellRunMetrics(**fc_kw)
    fcobj.parse_illumina_metrics(fullRTA=False)
    fcobj.parse_bc_metrics()
    fcobj.parse_filter_metrics()
    # Use run_info.yaml only when no sample sheet could be parsed
    if not fcobj.parse_samplesheet_csv():
        fcobj.parse_run_info_yaml()
    qc_objects.append(fcobj)

    lane_keys = ('analysis', 'description', 'flowcell_id', 'lane')
    # An empty YAML file loads as None; treat it as "no lanes"
    for info in runinfo or []:
        multiplex = info.get("multiplex", None)
        if not multiplex:
            # The old code touched an unassigned ``sample`` here and then
            # indexed info["multiplex"]; such a lane has nothing to process.
            self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
            continue
        for sample in multiplex:
            # Copy the lane-level fields onto the sample entry
            sample.update({k: info.get(k, None) for k in lane_keys})
            sample_kw = dict(path=fcdir, flowcell=fc_name, date=fc_date, lane=sample['lane'],
                             barcode_name=sample['name'],
                             sample_prj=sample.get('sample_prj', None),
                             barcode_id=sample['barcode_id'],
                             sequence=sample.get('sequence', "NoIndex"))
            obj = SampleRunMetrics(**sample_kw)
            obj.read_picard_metrics()
            obj.parse_fastq_screen()
            obj.parse_bc_metrics()
            obj.read_fastqc_metrics()
            qc_objects.append(obj)
    return qc_objects
def _collect_casava_qc(self):
    """Collect the flowcell document and sample documents for a CASAVA run.

    Flowcell date, name and position are read from the parsed RunInfo and
    RunParameters. The sample sheet is looked up as ``<flowcell name>.csv``
    in the flowcell directory, falling back to ``SampleSheet.csv``. The
    flowcell document is only built when the flowcell directory passes the
    ``modified_within_days`` check; sample documents are always delegated
    to ``_parse_samplesheet``.

    Returns:
        list: the collected QC documents.
    """
    collected = []
    read_setup = None
    demux_stats = None
    root = self._meta.root_path
    fcdir = os.path.join(os.path.abspath(root), self.pargs.flowcell)

    # Get the fc_name, fc_date from RunInfo
    parser = FlowcellRunMetricsParser(fcdir)
    runinfo_xml = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_date = runinfo_xml.get('Date', None)
    fc_name = runinfo_xml.get('Flowcell', None)
    fc_pos = runparams.get('FCPosition', '')

    samplesheet_dir = os.path.join(root, self.pargs.flowcell)
    runinfo_csv = os.path.join(samplesheet_dir, "{}.csv".format(fc_name))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(samplesheet_dir, "SampleSheet.csv")
    runinfo = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv)

    # Most of the code expects to have the flowcell position pre-pended to the flowcell id
    fc_id_with_pos = "{}{}".format(fc_pos, fc_name)

    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_id_with_pos)
        flowcell_doc = FlowcellRunMetricsDocument(**fc_kw)
        flowcell_doc["RunInfo"] = runinfo_xml
        flowcell_doc["RunParameters"] = runparams
        flowcell_doc["DemultiplexConfig"] = parser.parseDemultiplexConfig(**fc_kw)
        flowcell_doc["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        flowcell_doc["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        flowcell_doc["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        flowcell_doc["illumina"].update({"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
        flowcell_doc["samplesheet_csv"] = runinfo
        read_setup = flowcell_doc["RunInfo"].get('Reads', [])
        flowcell_doc["run_setup"] = self._run_setup(read_setup)
        demux_stats = flowcell_doc["illumina"]["Demultiplex_Stats"]
        collected.append(flowcell_doc)

    collected = self._parse_samplesheet(runinfo, collected, fc_date, fc_id_with_pos, fcdir,
                                        demultiplex_stats=demux_stats, setup=read_setup)
    return collected
def upload_analysis(self):
    """Collect per-sample analysis metrics for each project and save them.

    Directories directly under ``self._meta.root_path`` are treated as
    projects. Inside each, entries whose names match ``P<digits>_<digits>``
    are treated as samples, and the sub-directory named after the requested
    flowcell ("TOTAL" when none was given) is processed when it exists and
    passes the ``modified_within_days`` check. Metrics parsed from those
    directories are grouped per project into an ``AnalysisDocument``, which
    is passed to ``acon.save``.

    Sample directories without a ``*-bcbb-config.yaml`` file, or whose
    config yields no sample information, are logged and skipped.
    """
    # vars() exposes the namespace's own dict, so the default set here is
    # also visible through self.pargs.
    kw = vars(self.pargs)
    if not kw.get("flowcell"):
        kw["flowcell"] = "TOTAL"

    # Get a connection to the analysis database
    acon = AnalysisConnection(**kw)

    # Traverse the folder hierarchy and determine paths to process
    sample_dir_pattern = re.compile(r'^P[0-9]{3,}_[0-9]+')
    to_process = {}
    for entry in os.listdir(self._meta.root_path):
        pdir = os.path.join(self._meta.root_path, entry)
        if not os.path.isdir(pdir):
            continue
        plist = []
        for sdir in os.listdir(pdir):
            if not sample_dir_pattern.match(sdir):
                continue
            fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
            if os.path.exists(fdir) and modified_within_days(fdir, self.pargs.mtime):
                plist.append(fdir)
        if plist:
            to_process[os.path.basename(pdir)] = plist

    # Collect the data from each folder
    for project_name, sdirs in to_process.items():
        self.log.info("Processing {}".format(project_name))
        samples = {}
        for sdir in sdirs:
            config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
            if not config:
                self.log.error("Could not find sample configuration file in {}. Skipping sample.".format(sdir))
                continue
            if len(config) > 1:
                self.log.warn("Multiple sample configuration files found in {}. Will only use {}.".format(sdir, os.path.basename(config[0])))

            # Parse the config file and get the flowcell, lane and index sequence that may be needed to parse
            with open(config[0]) as fh:
                # NOTE(review): yaml.load without an explicit Loader; confirm
                # the config files are trusted or switch to a safe loader.
                # An empty file loads as None, hence the fallback.
                info = yaml.load(fh) or {}
            fcdate = info.get("fc_date")
            fcname = info.get("fc_name")
            sinfos = []
            for laneinfo in info.get("details", []):
                for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                    # Merge into a copy: updating the lane dict in place would
                    # leak one sample's fields into the next sample of the lane.
                    linfo = dict(laneinfo)
                    linfo.update(sampleinfo)
                    name = linfo.get("name", linfo.get("description", "unknown"))
                    m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                    if m:
                        name = m.group(1)
                    sinfos.append({'flowcell': linfo.get("flowcell_id") if not fcname else fcname,
                                   'date': fcdate,
                                   'lane': linfo.get("lane"),
                                   'barcode_name': name,
                                   'sample_prj': linfo.get("sample_prj", project_name),
                                   'barcode_id': linfo.get("barcode_id", "1"),
                                   'sequence': linfo.get("sequence", "NoIndex")})
            if not sinfos:
                # Nothing to key the metrics on; sinfos[0] below would fail
                self.log.error("No sample information found in {}. Skipping sample.".format(config[0]))
                continue

            # Create a parser object and collect the metrics
            parser = SampleRunMetricsParser(sdir)
            sinfo = sinfos[0]
            # All metrics of this directory are stored under the first entry's name
            name = sinfo.get("barcode_name", "unknown")
            samples[name] = {}
            samples[name]["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sinfo)
            samples[name]["software_versions"] = parser.parse_software_versions(**sinfo)
            samples[name]["project_summary"] = parser.parse_project_summary(**sinfo)
            samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(**sinfo)
            # Later entries overwrite earlier ones whenever they yield a result
            for sinfo in sinfos:
                picard = parser.read_picard_metrics(**sinfo)
                if picard:
                    samples[name]["picard_metrics"] = picard
                fq_scr = parser.parse_fastq_screen(**sinfo)
                if fq_scr:
                    samples[name]["fastq_scr"] = fq_scr
                fastqc = parser.read_fastqc_metrics(**sinfo)
                if fastqc.get("stats"):
                    samples[name]["fastqc"] = fastqc
                gteval = parser.parse_eval_metrics(**sinfo)
                if gteval:
                    samples[name]["gatk_variant_eval"] = gteval

        # Store the collected metrics in an analysis document
        obj = AnalysisDocument(**{'project_name': project_name,
                                  'name': project_name,
                                  'samples': samples})
        # NOTE(review): acon.save(obj) is evaluated before dry() is called;
        # confirm that saving during a dry run is intended.
        dry("Saving object {}".format(repr(obj)), acon.save(obj))
def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None):
    """Parse samplesheet information and populate sample run metrics objects.

    Args:
        runinfo: when ``as_yaml`` is true, an iterable of per-lane dicts
            (each may carry a ``multiplex`` list of sample dicts); otherwise
            a list of sample sheet rows whose first row is the header naming
            the ``SampleProject``, ``SampleID`` and ``Lane`` columns.
        qc_objects: list that the created sample documents are appended to.
        fc_date: flowcell date stored on every sample document.
        fc_name: flowcell name stored on every sample document.
        fcdir: flowcell directory handed to the metrics parser in YAML mode.
        as_yaml: select the YAML (per-lane) layout of ``runinfo``.
        demultiplex_stats: forwarded to ``get_bc_count`` in sample sheet mode.

    Returns:
        The ``qc_objects`` list that was passed in.

    Raises:
        IOError: in sample sheet mode, if a sample's
            ``<SampleID>-bcbb-config.yaml`` file does not exist.
    """
    if as_yaml:
        lane_keys = ("analysis", "description", "flowcell_id", "lane")
        for info in runinfo:
            multiplex = info.get("multiplex", None)
            if not multiplex:
                # A lane without samples used to fall through to
                # info["multiplex"] and crash; warn and skip it instead.
                self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                continue
            for sample in multiplex:
                # Copy the lane-level fields onto the sample entry
                sample.update({k: info.get(k, None) for k in lane_keys})
                sample_kw = dict(
                    flowcell=fc_name,
                    date=fc_date,
                    lane=sample["lane"],
                    barcode_name=sample["name"],
                    sample_prj=sample.get("sample_prj", None),
                    barcode_id=sample["barcode_id"],
                    sequence=sample.get("sequence", "NoIndex"),
                )
                parser = SampleRunMetricsParser(fcdir)
                obj = SampleRunMetricsDocument(**sample_kw)
                obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                obj["bc_count"] = parser.get_bc_count(**sample_kw)
                obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                qc_objects.append(obj)
        return qc_objects

    # Sample sheet mode: the first row is the header, the rest are samples
    for sample in runinfo[1:]:
        LOG.debug("Getting information for sample defined by {}".format(sample))
        d = dict(zip(runinfo[0], sample))
        if self.app.pargs.project_name and self.app.pargs.project_name != d["SampleProject"]:
            continue
        if self.app.pargs.sample and self.app.pargs.sample != d["SampleID"]:
            continue
        sampledir = os.path.join(
            os.path.abspath(self._meta.production_root_path),
            d["SampleProject"].replace("__", "."),
            d["SampleID"],
        )
        if not os.path.exists(sampledir):
            self.app.log.warn("No such sample directory: {}".format(sampledir))
            continue
        sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
        if not os.path.exists(sample_fcdir):
            self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
            continue
        if not modified_within_days(sample_fcdir, self.pargs.mtime):
            continue
        runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d["SampleID"]))
        if not os.path.exists(runinfo_yaml_file):
            self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
            raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
        with open(runinfo_yaml_file) as fh:
            # NOTE(review): yaml.load without an explicit Loader; confirm the
            # config files are trusted or switch to a safe loader.
            runinfo_yaml = yaml.load(fh)
        multiplex = runinfo_yaml["details"][0].get("multiplex", None)
        if not multiplex:
            self.app.log.warn("No multiplex information for sample {}".format(d["SampleID"]))
            continue
        sample_kw = dict(
            flowcell=fc_name,
            date=fc_date,
            lane=d["Lane"],
            barcode_name=d["SampleID"],
            sample_prj=d["SampleProject"].replace("__", "."),
            barcode_id=multiplex[0]["barcode_id"],
            sequence=multiplex[0]["sequence"],
        )
        parser = SampleRunMetricsParser(sample_fcdir)
        obj = SampleRunMetricsDocument(**sample_kw)
        obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
        obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
        obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, **sample_kw)
        obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
        qc_objects.append(obj)
    return qc_objects
def upload_analysis(self):
    """Collect per-sample analysis metrics for each project and save them.

    Directories directly under ``self._meta.root_path`` are treated as
    projects. Inside each, entries whose names match ``P<digits>_<digits>``
    are treated as samples, and the sub-directory named after the requested
    flowcell ("TOTAL" when none was given) is processed when it exists and
    passes the ``modified_within_days`` check. Metrics parsed from those
    directories are grouped per project into an ``AnalysisDocument``, which
    is passed to ``acon.save``.

    Sample directories without a ``*-bcbb-config.yaml`` file, or whose
    config yields no sample information, are logged and skipped.
    """
    # vars() exposes the namespace's own dict, so the default set here is
    # also visible through self.pargs.
    kw = vars(self.pargs)
    if not kw.get("flowcell"):
        kw["flowcell"] = "TOTAL"

    # Get a connection to the analysis database
    acon = AnalysisConnection(**kw)

    # Traverse the folder hierarchy and determine paths to process
    sample_dir_pattern = re.compile(r'^P[0-9]{3,}_[0-9]+')
    to_process = {}
    for entry in os.listdir(self._meta.root_path):
        pdir = os.path.join(self._meta.root_path, entry)
        if not os.path.isdir(pdir):
            continue
        plist = []
        for sdir in os.listdir(pdir):
            if not sample_dir_pattern.match(sdir):
                continue
            fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
            if os.path.exists(fdir) and modified_within_days(fdir, self.pargs.mtime):
                plist.append(fdir)
        if plist:
            to_process[os.path.basename(pdir)] = plist

    # Collect the data from each folder
    for project_name, sdirs in to_process.items():
        self.log.info("Processing {}".format(project_name))
        samples = {}
        for sdir in sdirs:
            config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
            if not config:
                self.log.error(
                    "Could not find sample configuration file in {}. Skipping sample."
                    .format(sdir))
                continue
            if len(config) > 1:
                self.log.warn(
                    "Multiple sample configuration files found in {}. Will only use {}."
                    .format(sdir, os.path.basename(config[0])))

            # Parse the config file and get the flowcell, lane and index sequence that may be needed to parse
            with open(config[0]) as fh:
                # NOTE(review): yaml.load without an explicit Loader; confirm
                # the config files are trusted or switch to a safe loader.
                # An empty file loads as None, hence the fallback.
                info = yaml.load(fh) or {}
            fcdate = info.get("fc_date")
            fcname = info.get("fc_name")
            sinfos = []
            for laneinfo in info.get("details", []):
                for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                    # Merge into a copy: updating the lane dict in place would
                    # leak one sample's fields into the next sample of the lane.
                    linfo = dict(laneinfo)
                    linfo.update(sampleinfo)
                    name = linfo.get("name", linfo.get("description", "unknown"))
                    m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                    if m:
                        name = m.group(1)
                    sinfos.append({
                        'flowcell': linfo.get("flowcell_id") if not fcname else fcname,
                        'date': fcdate,
                        'lane': linfo.get("lane"),
                        'barcode_name': name,
                        'sample_prj': linfo.get("sample_prj", project_name),
                        'barcode_id': linfo.get("barcode_id", "1"),
                        'sequence': linfo.get("sequence", "NoIndex")
                    })
            if not sinfos:
                # Nothing to key the metrics on; sinfos[0] below would fail
                self.log.error(
                    "No sample information found in {}. Skipping sample."
                    .format(config[0]))
                continue

            # Create a parser object and collect the metrics
            parser = SampleRunMetricsParser(sdir)
            sinfo = sinfos[0]
            # All metrics of this directory are stored under the first entry's name
            name = sinfo.get("barcode_name", "unknown")
            samples[name] = {}
            samples[name]["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sinfo)
            samples[name]["software_versions"] = parser.parse_software_versions(**sinfo)
            samples[name]["project_summary"] = parser.parse_project_summary(**sinfo)
            samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(**sinfo)
            # Later entries overwrite earlier ones whenever they yield a result
            for sinfo in sinfos:
                picard = parser.read_picard_metrics(**sinfo)
                if picard:
                    samples[name]["picard_metrics"] = picard
                fq_scr = parser.parse_fastq_screen(**sinfo)
                if fq_scr:
                    samples[name]["fastq_scr"] = fq_scr
                fastqc = parser.read_fastqc_metrics(**sinfo)
                if fastqc.get("stats"):
                    samples[name]["fastqc"] = fastqc
                gteval = parser.parse_eval_metrics(**sinfo)
                if gteval:
                    samples[name]["gatk_variant_eval"] = gteval

        # Store the collected metrics in an analysis document
        obj = AnalysisDocument(
            **{
                'project_name': project_name,
                'name': project_name,
                'samples': samples
            })
        # NOTE(review): acon.save(obj) is evaluated before dry() is called;
        # confirm that saving during a dry run is intended.
        dry("Saving object {}".format(repr(obj)), acon.save(obj))