def test_dbcon(self):
    """Test database connection and that we get expected values."""
    s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
    samples = [s_con.get_entry(x) for x in s_con.name_view]
    samples_d = {x["name"]: x for x in samples}
    self.assertEqual(samples_d["1_120924_AC003CCCXX_TGACCA"]["date"], "120924")
    self.assertEqual(samples_d["1_121015_BB002BBBXX_TGACCA"]["flowcell"], "BB002BBBXX")
    self.assertEqual(samples_d["2_120924_AC003CCCXX_ACAGTG"]["entity_type"], "sample_run_metrics")
    self.assertEqual(samples_d["3_120924_AC003CCCXX_ACAGTG"]["lane"], "3")
    self.assertEqual(samples_d["4_120924_AC003CCCXX_CGTTAA"]["sequence"], "CGTTAA")
    self.assertEqual(samples_d["2_121015_BB002BBBXX_TGACCA"]["project_id"], "P002")

    fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
    flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
    flowcells_d = {x["name"]: x for x in flowcells}
    self.assertEqual(flowcells_d["120924_AC003CCCXX"]["name"], "120924_AC003CCCXX")
    self.assertEqual(flowcells_d["121015_BB002BBBXX"]["name"], "121015_BB002BBBXX")
    self.assertEqual(flowcells_d["120924_AC003CCCXX"]["entity_type"], "flowcell_run_metrics")

    p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
    projects = [p_con.get_entry(x) for x in p_con.name_view]
    projects_d = {x["project_name"]: x for x in projects}
    self.assertEqual(projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
    self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
    self.assertEqual(set(projects_d["J.Doe_00_01"]["samples"].keys()),
                     set(["P001_101_index3", "P001_102", "P001_103"]))
    self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"], "GnuGenome")
    self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
    self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(), ["3_index6"])
    self.assertIn("A", projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
def test_2_make_note(self):
    """Make a note subset by example flowcell and project"""
    s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()
    samples = s_con.get_samples(self.examples["flowcell"], self.examples["project"])
    project = p_con.get_entry(self.examples["project"])
    samples = p_con.map_srm_to_name(self.examples["project"],
                                    fc_id=self.examples["flowcell"],
                                    use_bc_map=True)
    for k, v in samples.items():
        s_param = dict(parameters)  # work on a copy so per-sample values do not leak between iterations
        s = s_con.get_entry(k)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s["date"], s["flowcell"])
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
        s_param['rounded_read_count'] = round(float(s_param['rounded_read_count']) / 1e6, 1) if s_param['rounded_read_count'] else None
        s_param['customer_name'] = project['samples'][v["sample"]].get('customer_name', None)
        if project:
            s_param['ordered_amount'] = p_con.get_ordered_amount(self.examples["project"])
            s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
            s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None})
        make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
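# The test above relies on module-level fixtures -- parameters, srm_to_parameter
# and cutoffs -- that are outside this snippet. A minimal sketch of what they
# could look like, inferred from the identically named dicts defined in
# sample_status_note() further down; treat these as assumptions, not the
# actual test fixtures:
parameters = {
    "project_name": None, "start_date": None, "FC_id": None,
    "scilifelab_name": None, "rounded_read_count": None,
    "phix_error_rate": None, "avg_quality_score": None, "success": None,
}
srm_to_parameter = {
    "project_name": "sample_prj", "FC_id": "flowcell",
    "scilifelab_name": "barcode_name", "start_date": "date",
    "rounded_read_count": "bc_count",
}
cutoffs = {"phix_err_cutoff": 2.0, "qv_cutoff": 30}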
def swestore(self):
    """This function is the entry point for tasks having to do with packaging
    and sending runs to swestore
    """
    db_info = self.app.config.get_section_dict('db')
    f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                          password=db_info.get('password'),
                                          url=db_info.get('url'))
    # Create a tarball out of the run folder
    if self.pargs.package_run:
        # We require a flowcell argument
        if not self._check_pargs(["flowcell"]):
            return
        self.pargs.tarball = package_run(self, self.config.get('archive', 'swestore_staging'), **vars(self.pargs))
        if not self.pargs.tarball:
            self.log.error("No tarball was created, exiting")
            return
        if self.pargs.clean:
            rm_run(self, self.config.get('archive', 'root'), flowcell=self.pargs.flowcell)
    if self.pargs.clean_from_staging:
        # Check that the run has been archived on the NAS before removing it, otherwise it will keep synching
        if self.pargs.flowcell in f_conn.get_storage_status('NAS_nosync').keys():
            rm_run(self, self.config.get('archive', 'swestore_staging'), flowcell=self.pargs.flowcell)
        else:
            self.log.warn("Run storage status is not NAS_nosync, not removing run from swestore_stage!")
    if not self.pargs.tarball:
        self.log.error("Required argument --tarball was not specified")
        return
    if not os.path.exists(self.pargs.tarball):
        self.log.error("Tarball {} does not exist".format(self.pargs.tarball))
        return
    # Upload a tarball to a remote host
    if self.pargs.remote_upload:
        result = upload_tarball(self, **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
        if not result:
            return
        if self.pargs.clean:
            rm_tarball(self, tarball=self.pargs.tarball)
    # Send the tarball to Swestore using irods
    if self.pargs.send_to_swestore:
        result = send_to_swestore(self, **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
        if not result:
            # If archiving failed, we need to give a non-zero exit code in order for a remote instance to detect the failure
            sys.exit(1)
        if self.pargs.clean:
            rm_tarball(self, tarball=self.pargs.tarball)
        # Set the run as archived in StatusDB
        fc_db_id = f_conn.id_view.get(self.pargs.flowcell)
        f_conn.set_storage_status(fc_db_id, 'swestore_archived')
    # Log to statusdb
    if self.pargs.log_to_db:
        # implement this
        raise NotImplementedError("logging to db functionality not implemented")
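# Hypothetical invocations of the swestore entry point above. The flag
# spellings are assumed from the pargs attributes it reads (package_run,
# clean, clean_from_staging, tarball, remote_upload, send_to_swestore,
# log_to_db); the actual subcommand name and option syntax depend on the
# cement controller wiring and may differ:
#
#   pm <controller> swestore --package_run --flowcell <RUN_FOLDER> --clean
#   pm <controller> swestore --tarball <RUN>.tar.bz2 --send_to_swestore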
def test_get_barcode_lane_statistics(self):
    """Test getting barcode lane statistics from flowcell database"""
    fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
    # Try getting the wrong sample name; should return (None, None)
    data = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index6", "120924_AC003CCCXX", "1")
    self.assertEqual(data, (None, None))
    data = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index3", "120924_AC003CCCXX", "1")
    self.assertEqual(data, (u'35.22', u'90.05'))
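# Note that get_barcode_lane_statistics() returns its two metrics as strings
# (hence the u'35.22'/u'90.05' expectations above), so callers must cast
# before comparing against numeric cutoffs. A small usage sketch; the 80.0
# threshold mirrors MIN_GTQ30 in multiplex_qc() below:
fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
(avg_qv, pct_q30) = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index3",
                                                       "120924_AC003CCCXX", "1")
if pct_q30 is not None and float(pct_q30) >= 80.0:
    print("sample passes the >=Q30 criterion")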
def upload_qc(self):
    if not self._check_pargs(['flowcell']):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell))
        return

    runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell), "{}.csv".format(fc_id(self.pargs.flowcell)))
    runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_casava_qc()
    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
        return
    else:
        self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(obj.get("sample_prj", None),
                                                      obj.get("barcode_name", None),
                                                      self.pargs.extensive_matching)
            if project_sample:
                obj["project_sample_name"] = project_sample['sample_name']
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def list_projects(self):
    if not self._check_pargs(["flowcell"]):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell))
        return

    out_data = [[self.pargs.flowcell]]
    s = self.pargs.flowcell.split("_")
    fcid = "_".join([s[0], s[-1]])

    self.log.debug("Establishing FlowcellRunMetricsConnection")
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    self.log.debug("Establishing ProjectSummaryConnection")
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))

    self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
    fc = fc_con.get_entry(fcid)
    if fc is None:
        self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
        return

    self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
    ssheet_data = self._get_samplesheet_sample_data(fc)
    if len(ssheet_data) == 0:
        self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
        return

    self.log.debug("Fetching runParameter data for flowcell {}".format(fcid))
    run_data = self._get_run_parameter_data(fc)
    if len(run_data) == 0:
        self.log.warn("No runParameter data for flowcell {}".format(fcid))

    out_data = [[self.pargs.flowcell,
                 run_data.get("InstrumentType", "HiSeq2000"),
                 run_data.get("RunMode", "High Output")]]

    # Extract the project names
    projects = set([proj[0].replace("__", ".") for data in ssheet_data.values() for proj in data.values()])

    # Extract application and type for each project
    for project in projects:
        self.log.debug("Fetching project data document for project {}".format(project))
        pdoc = p_con.get_entry(project)
        if pdoc is None:
            self.log.warn("No project data document for project {}".format(project))
            pdoc = {}
        application = pdoc.get("application", "N/A")
        ptype = pdoc.get("type", "Check GPL")
        out_data.append([project, application, ptype])

    self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
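# validate_fc_directory_format() is used above but not defined in this
# snippet. The fcid construction (joining the first and last underscore-
# separated fields) implies bcbio-style run folder names such as
# 120924_SN0002_0003_AC003CCCXX. A sketch of a validator under that
# assumption; the real helper may be stricter or looser:
import re

def validate_fc_directory_format(fc_dir):
    """Return True if fc_dir looks like <YYMMDD>_<machine>_<run#>_<flowcell>."""
    return re.match(r"\d{6}(_[A-Za-z0-9\-]+){3}$", os.path.basename(fc_dir)) is not None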
def setUp(self):
    self.app = self.make_app(
        argv=["qc", "upload-qc", flowcells[0], "--mtime", "10000"],
        extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
    )
    self._run_app()
    self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
    self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
    self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
def _project_status_note_table(project_name=None, username=None, password=None,
                               url=None, use_ps_map=True, use_bc_map=False,
                               check_consistency=False, ordered_million_reads=None,
                               uppnex_id=None, customer_reference=None,
                               exclude_sample_ids={}, project_alias=None,
                               sample_aliases={}, projectdb="projects",
                               samplesdb="samples", flowcelldb="flowcells",
                               include_all_samples=False, param={}, **kw):
    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name": "scilife_name",
                       "customer_name": "customer_name",
                       "project_name": "project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered']
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)
    # Get the information source for this project
    source = p_con.get_info_source(project_name)
    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))
    # Determine if the project is finished by checking the all-samples-sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary']['all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False
    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None,
                                           project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]]: {'sample': sample_aliases[s["barcode_name"]],
                                                           'id': s["_id"]}}
                samples.update(s_d)
            else:
                s_d = {s["name"]: {'sample': s["name"], 'id': s["_id"],
                                   'barcode_name': s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))
    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key: prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount",
                                        p_con.get_ordered_amount(project_name, samples=sample_dict))
    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details']['customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))
    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference
    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for prep in last_library_preps.values() for x in prep]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(
                        k, v["id"],
                        ",".join(list(set(last_library_preps[v['sample']].values()))),
                        v['sample']))
                    continue
        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq,
                                        ordered_million_reads, param)
        sample_table.append([vals[tk] for tk in table_keys])
    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq,
                                                ordered_million_reads, param)
                sample_table.append([vals[tk] for tk in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq,
                                            ordered_million_reads, param)
            sample_table.append([vals[tk] for tk in table_keys])
    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'
    sample_table.sort()
    sample_table = list(sample_table for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])
    return output_data, sample_table, param
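# _literal_eval_option() is used throughout this module to accept options
# that arrive as Python-literal strings from the command line (e.g.
# "--exclude_sample_ids \"{'P001_101': ['TGACCA']}\""). A minimal sketch
# under that assumption; the real helper may handle more cases:
import ast

def _literal_eval_option(option, default=None):
    """Parse an option given as a Python literal string, else pass it through."""
    if not option:
        return default
    if isinstance(option, basestring):  # Python 2, matching the surrounding code
        return ast.literal_eval(option)
    return option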
class TestQCUpload(PmFullTest):
    def setUp(self):
        """FIXME: All other tests depend on data being uploaded, so these are
        not real unit tests. The setup to TestQCUpload has to be run prior to
        other tests, else unexpected failures will occur."""
        self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P002_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server. Slightly circular testing here -
        the module is set up with qc update, so by definition the test must 'work'."""
        self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[1], '--mtime', '100'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(argv=['qc', 'update', '--sample_prj', projects[2],
                                       '--project_id', 'P003', '--debug', '--force'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {'P001_101_index3': 'P001_101_index3', 'P001_102_index6': 'P001_102'}
        self.app = self.make_app(argv=['qc', 'update', '--sample_prj', projects[0],
                                       '--names', "{}".format(sample_map), '--debug', '--force'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
def sample_status_note(project_name=None, flowcell=None, username=None, password=None,
                       url=None, ordered_million_reads=None, uppnex_id=None,
                       customer_reference=None, bc_count=None, project_alias=[],
                       projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, is_paired=True, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end
    """
    # Cutoffs
    cutoffs = {"phix_err_cutoff": 2.0,
               "qv_cutoff": 30}
    instrument = _parse_instrument_config(os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}
    # parameters
    parameters = {"project_name": None,
                  "start_date": None,
                  "FC_id": None,
                  "scilifelab_name": None,
                  "rounded_read_count": None,
                  "phix_error_rate": None,
                  "avg_quality_score": None,
                  "pct_q30_bases": None,
                  "success": None,
                  "run_mode": None,
                  "is_paired": True}
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {"project_name": "sample_prj",
                        "FC_id": "flowcell",
                        "scilifelab_name": "barcode_name",
                        "start_date": "date",
                        "rounded_read_count": "bc_count",
                        "lane": "lane"}
    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")".format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)
    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()
    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data
    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data
    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)
    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])
    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(
            s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report "
                     "due to missing RunInfo -> Instrument field in statusdb. Either rerun "
                     "'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice", "") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"), runp.get("ApplicationVersion"), s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
            s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat,
            s_param["avg_quality_score"], qv_stat))
        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount',
                                                p_con.get_ordered_amount(project_name,
                                                                         samples=p_con.get_entry(project_name, 'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))
        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference
        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)
            # Always normalize submitted id, since module texttable does not support unicode
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP' to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))
        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)
    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out,
                                           'sample_runs': {s["name"]: s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
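# sequencing_success() is not defined in this snippet. Judging from the
# cutoffs dict and the err_stat/qv_stat checks above, a plausible sketch is
# the following; this is an assumption, not the actual implementation:
def sequencing_success(s_param, cutoffs):
    """Pass a sample run if PhiX error rate and average quality meet the cutoffs."""
    try:
        phix_ok = 0 <= float(s_param["phix_error_rate"]) <= cutoffs["phix_err_cutoff"]
        qv_ok = float(s_param["avg_quality_score"]) >= cutoffs["qv_cutoff"]
    except (KeyError, TypeError, ValueError):
        return False
    return phix_ok and qv_ok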
def multiplex_qc(self):
    MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
    EXPECTED_LANE_YIELD = 143000000
    MAX_PHIX_ERROR_RATE = 2.0
    MIN_PHIX_ERROR_RATE = 0.0
    MIN_GTQ30 = 80.0
    read_pairs = True

    out_data = []
    if not self._check_pargs(['flowcell']):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return

    # Construct the short form of the fcid
    sp = os.path.basename(self.pargs.flowcell).split("_")
    fcid = "_".join([sp[0], sp[-1]])

    # Get a connection to the flowcell database and fetch the corresponding document
    self.log.debug("Connecting to flowcell database")
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    self.log.debug("Fetching run metrics entry for flowcell {}".format(fcid))
    fc_doc = fc_con.get_entry(fcid)
    if not fc_doc:
        self.log.warn("Could not fetch run metrics entry for flowcell {}".format(fcid))
        return

    # Adjust the read pairs variable according to the run setup
    read_pairs = fc_con.is_paired_end(fcid)

    # Get the yield per sample from the Demultiplex_Stats
    self.log.debug("Getting yield for flowcell {}".format(fcid))
    sample_yield = self._get_yield_per_sample(fc_doc, read_pairs)

    # Get the yield per lane from the Demultiplex_Stats
    self.log.debug("Getting lane yield for flowcell {}".format(fcid))
    lane_yield = self._get_yield_per_lane(fc_doc, read_pairs)
    lanes = lane_yield.keys()

    # Get the number of samples in the pools from the Demultiplex_Stats
    self.log.debug("Getting lane pool sizes for flowcell {}".format(fcid))
    pool_size = self._get_pool_size(fc_doc)

    # Get the sample information from the csv samplesheet
    self.log.debug("Getting csv samplesheet data for flowcell {}".format(fcid))
    ssheet_samples = self._get_samplesheet_sample_data(fc_doc)
    if len(ssheet_samples) == 0:
        self.log.warn("No samplesheet data available for flowcell {}".format(fcid))

    # Verify that all samples in samplesheet have reported metrics
    for id in ssheet_samples.keys():
        for key in ssheet_samples[id].keys():
            lane, index = key.split("_")
            project = ssheet_samples[id][key][0]
            if id not in sample_yield or key not in sample_yield[id]:
                self.log.warn("Sample {} from project {} is in samplesheet but no yield was reported in "
                              "Demultiplex_Stats.htm for lane {} and index {}".format(id, project, lane, index))
                continue
            sample_yield[id][key].append('verified')

    # Check that all samples in Demultiplex_Stats have entries in Samplesheet
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if "verified" not in sample_yield[id][key] and index != "Undetermined":
                self.log.warn("Sample {} from project {}, with index {} on lane {} is in Demultiplex_Stats "
                              "but no corresponding entry is present in SampleSheet".format(id, sample_yield[id][key][1], index, lane))

    # Check the PhiX error rate for each lane
    self.log.debug("Getting PhiX error rates for flowcell {}".format(fcid))
    for lane in lanes:
        status = "N/A"
        err_rate = fc_con.get_phix_error_rate(fcid, lane)
        if err_rate < 0:
            self.log.warn("Could not get PhiX error rate for lane {} on flowcell {}".format(lane, fcid))
        elif err_rate <= MIN_PHIX_ERROR_RATE or err_rate > MAX_PHIX_ERROR_RATE:
            status = "FAIL"
        else:
            status = "PASS"
        out_data.append([status, "PhiX error rate", lane, err_rate,
                         "{} < PhiX e (%) <= {}".format(MIN_PHIX_ERROR_RATE, MAX_PHIX_ERROR_RATE)])

    # Check the %>=Q30 value for each sample
    sample_quality = self._get_quality_per_sample(fc_doc)
    for id in sample_quality.keys():
        for key in sample_quality[id].keys():
            lane, index = key.split("_")
            status = "FAIL"
            if float(sample_quality[id][key][0]) >= MIN_GTQ30:
                status = "PASS"
            out_data.append([status, "Sample quality", lane, sample_quality[id][key][2], id,
                             sample_quality[id][key][0], "[%>=Q30 >= {}%]".format(MIN_GTQ30)])

    # Check that each lane received the minimum amount of reads
    for lane, reads in lane_yield.items():
        status = "FAIL"
        if reads >= EXPECTED_LANE_YIELD:
            status = "PASS"
        out_data.append([status, "Lane yield", lane, reads, "[Yield >= {}]".format(EXPECTED_LANE_YIELD)])

    # Check that all samples in the pool have received a minimum number of reads
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if index == "Undetermined":
                continue
            status = "FAIL"
            mplx_min = int(0.5 * EXPECTED_LANE_YIELD / pool_size[lane])
            if sample_yield[id][key][0] >= mplx_min:
                status = "PASS"
            out_data.append([status, "Sample yield", lane, sample_yield[id][key][1], id,
                             sample_yield[id][key][0], "[Yield >= {}]".format(mplx_min)])

    # Check that the number of undetermined reads in each lane is below 10% of the total yield for the lane
    for lane, reads in lane_yield.items():
        status = "FAIL"
        key = "_".join([lane, "Undetermined"])
        undetermined = sum([counts.get(key, [0])[0] for counts in sample_yield.values()])
        cutoff = 0.1 * reads
        if undetermined < cutoff:
            status = "PASS"
        out_data.append([status, "Index read", lane, undetermined, "[Undetermined < {}]".format(cutoff)])

    # Check that no overrepresented index sequence exists in undemultiplexed output
    self.log.debug("Fetching undemultiplexed barcode data for flowcell {}".format(fcid))
    undemux_data = self._get_undetermined_index_counts(fc_doc)
    if len(undemux_data) == 0:
        self.log.warn("No undemultiplexed barcode data available for flowcell {}".format(fcid))

    for lane, counts in undemux_data.items():
        mplx_min = int(min(MAX_UNDEMULTIPLEXED_INDEX_COUNT,
                           0.5 * EXPECTED_LANE_YIELD / max(1, pool_size[lane])))
        status = "N/A"
        if len(counts) > 0:
            for i in range(len(counts)):
                status = "FAIL"
                if int(counts[i][0]) < mplx_min:
                    status = "PASS"
                out_data.append([status, "Index", lane, counts[i][1], counts[i][2], counts[i][0],
                                 "[Undetermined index < {}]".format(mplx_min)])
        else:
            out_data.append([status, "Index", lane, "", "", mplx_min, "-"])

    self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
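# A worked example of the per-sample yield cutoff computed above: with the
# default EXPECTED_LANE_YIELD of 143000000 read pairs and a pool of 8 samples
# on a lane, each sample must reach at least half of its even share,
#
#   mplx_min = int(0.5 * 143000000 / 8)  # = 8937500 read pairs
#
# and the same formula, capped at MAX_UNDEMULTIPLEXED_INDEX_COUNT, bounds how
# many reads any single undetermined index sequence may contain.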
def multiplex_qc(self):
    MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
    EXPECTED_LANE_YIELD = 143000000
    MAX_PHIX_ERROR_RATE = 2.0
    MIN_PHIX_ERROR_RATE = 0.0
    MIN_GTQ30 = 80.0
    read_pairs = True
    out_data = []

    if not self._check_pargs(['flowcell']):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return

    # Construct the short form of the fcid
    sp = os.path.basename(self.pargs.flowcell).split("_")
    fcid = "_".join([sp[0], sp[-1]])

    # Get a connection to the flowcell database and fetch the corresponding document
    self.log.debug("Connecting to flowcell database")
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    self.log.debug("Fetching run metrics entry for flowcell {}".format(fcid))
    fc_doc = fc_con.get_entry(fcid)
    if not fc_doc:
        self.log.warn("Could not fetch run metrics entry for flowcell {}".format(fcid))
        return

    # Get the yield per sample from the Demultiplex_Stats
    self.log.debug("Getting yield for flowcell {}".format(fcid))
    sample_yield = self._get_yield_per_sample(fc_doc, read_pairs)

    # Get the yield per lane from the Demultiplex_Stats
    self.log.debug("Getting lane yield for flowcell {}".format(fcid))
    lane_yield = self._get_yield_per_lane(fc_doc, read_pairs)
    lanes = lane_yield.keys()

    # Get the number of samples in the pools from the Demultiplex_Stats
    self.log.debug("Getting lane pool sizes for flowcell {}".format(fcid))
    pool_size = self._get_pool_size(fc_doc)

    # Get the sample information from the csv samplesheet
    self.log.debug("Getting csv samplesheet data for flowcell {}".format(fcid))
    ssheet_samples = self._get_samplesheet_sample_data(fc_doc)
    if len(ssheet_samples) == 0:
        self.log.warn("No samplesheet data available for flowcell {}".format(fcid))

    # Verify that all samples in the samplesheet have reported metrics
    for id in ssheet_samples.keys():
        for key in ssheet_samples[id].keys():
            lane, index = key.split("_")
            project = ssheet_samples[id][key][0]
            if id not in sample_yield or key not in sample_yield[id]:
                self.log.warn("Sample {} from project {} is in samplesheet but no yield was reported in "
                              "Demultiplex_Stats.htm for lane {} and index {}".format(id, project, lane, index))
                continue
            sample_yield[id][key].append('verified')

    # Check that all samples in Demultiplex_Stats have entries in the samplesheet
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if "verified" not in sample_yield[id][key] and index != "Undetermined":
                self.log.warn("Sample {} from project {}, with index {} on lane {} is in Demultiplex_Stats "
                              "but no corresponding entry is present in SampleSheet".format(id, sample_yield[id][key][1], index, lane))

    # Check the PhiX error rate for each lane
    self.log.debug("Getting PhiX error rates for flowcell {}".format(fcid))
    for lane in lanes:
        status = "N/A"
        err_rate = fc_con.get_phix_error_rate(fcid, lane)
        if err_rate < 0:
            self.log.warn("Could not get PhiX error rate for lane {} on flowcell {}".format(lane, fcid))
        elif err_rate <= MIN_PHIX_ERROR_RATE or err_rate > MAX_PHIX_ERROR_RATE:
            status = "FAIL"
        else:
            status = "PASS"
        out_data.append([status, "PhiX error rate", lane, err_rate,
                         "{} < PhiX e (%) <= {}".format(MIN_PHIX_ERROR_RATE, MAX_PHIX_ERROR_RATE)])

    # Check the %>=Q30 value for each sample
    sample_quality = self._get_quality_per_sample(fc_doc)
    for id in sample_quality.keys():
        for key in sample_quality[id].keys():
            lane, index = key.split("_")
            status = "FAIL"
            if float(sample_quality[id][key][0]) >= MIN_GTQ30:
                status = "PASS"
            out_data.append([status, "Sample quality", lane, sample_quality[id][key][2], id,
                             sample_quality[id][key][0], "[%>=Q30 >= {}%]".format(MIN_GTQ30)])

    # Check that each lane received the minimum amount of reads
    for lane, reads in lane_yield.items():
        status = "FAIL"
        if reads >= EXPECTED_LANE_YIELD:
            status = "PASS"
        out_data.append([status, "Lane yield", lane, reads, "[Yield >= {}]".format(EXPECTED_LANE_YIELD)])

    # Check that all samples in the pool have received a minimum number of reads
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if index == "Undetermined":
                continue
            status = "FAIL"
            mplx_min = int(0.5 * EXPECTED_LANE_YIELD / pool_size[lane])
            if sample_yield[id][key][0] >= mplx_min:
                status = "PASS"
            out_data.append([status, "Sample yield", lane, sample_yield[id][key][1], id,
                             sample_yield[id][key][0], "[Yield >= {}]".format(mplx_min)])

    # Check that the number of undetermined reads in each lane is below 10% of the total yield for the lane
    for lane, reads in lane_yield.items():
        status = "FAIL"
        key = "_".join([lane, "Undetermined"])
        undetermined = sum([counts.get(key, [0])[0] for counts in sample_yield.values()])
        cutoff = 0.1 * reads
        if undetermined < cutoff:
            status = "PASS"
        out_data.append([status, "Index read", lane, undetermined, "[Undetermined < {}]".format(cutoff)])

    # Check that no overrepresented index sequence exists in the undemultiplexed output
    self.log.debug("Fetching undemultiplexed barcode data for flowcell {}".format(fcid))
    undemux_data = self._get_undetermined_index_counts(fc_doc)
    if len(undemux_data) == 0:
        self.log.warn("No undemultiplexed barcode data available for flowcell {}".format(fcid))

    for lane, counts in undemux_data.items():
        mplx_min = int(min(MAX_UNDEMULTIPLEXED_INDEX_COUNT,
                           0.5 * EXPECTED_LANE_YIELD / max(1, pool_size[lane])))
        status = "N/A"
        if len(counts) > 0:
            for i in range(len(counts)):
                status = "FAIL"
                if int(counts[i][0]) < mplx_min:
                    status = "PASS"
                out_data.append([status, "Index", lane, counts[i][1], counts[i][2], counts[i][0],
                                 "[Undetermined index < {}]".format(mplx_min)])
        else:
            out_data.append([status, "Index", lane, "", "", mplx_min, "-"])

    self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
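# A minimal, self-contained sketch of the per-sample yield cutoff used in
# multiplex_qc above: each sample in a pool must receive at least half of an
# even per-sample share of the expected lane yield. The numbers below are
# illustrative assumptions, not values read from any flowcell document.

def sample_yield_cutoff(expected_lane_yield, pool_size):
    """Return the minimum read count required per sample in a pool."""
    # Half of the even per-sample share of the lane; guard against empty pools.
    return int(0.5 * expected_lane_yield / max(1, pool_size))

if __name__ == "__main__":
    # With the default of 143M read pairs per lane and a hypothetical
    # 8-sample pool, each sample must yield at least 8937500 read pairs.
    print(sample_yield_cutoff(143000000, 8))  # -> 8937500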
def raw_data(self):
    if not self._check_pargs(["project"]):
        return

    # If necessary, reformat the flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # Get the uid and gid to use for destination files
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to flowcell database")
    f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
    assert f_con, "Could not get connection to flowcell database"
    self.log.debug("Connecting to x_flowcell database")
    x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
    assert x_con, "Could not get connection to x_flowcell database"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return

    # Set up paths and verify parameters
    self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), \
        "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), \
        "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)

    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnex_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'
    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnex_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'

    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), \
        "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # Find uncompressed fastq files
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir=proj_base_dir,
                                                       sample=self.pargs.sample,
                                                       flowcell=self.pargs.flowcell)
    if len(uncompressed) > 0:
        self.log.error("There are uncompressed fastq files for the project; please check that all files are properly compressed before delivery")
        return

    # Extract the list of samples and runs associated with the project and sort them
    samples = self.samples_to_copy(
        pid=p_con.get_entry(self.pargs.project, "project_id"),
        pod=p_con.get_entry(self.pargs.project, "open_date"),
        fc_dict={'HiSeq2500': f_con.proj_list, 'HiSeqX': x_con.proj_list},
        proj_base_dir=proj_base_dir,
        destination_root=destination_root,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)

    # If running interactively, ask which samples to deliver and skip the rest
    if self.pargs.interactive:
        to_process = {}
        for sample in samples:
            if query_yes_no("Deliver sample {} ?".format(sample), default="no"):
                to_process[sample] = samples[sample]
        samples = to_process

    if self.pargs.sample:
        sample = samples.get(self.pargs.sample)
        if not sample:
            self.log.error("There is no such sample {} for project {}".format(self.pargs.sample, self.pargs.project))
            return
        samples = {self.pargs.sample: sample}

    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Make sure that the transfer will be done with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample
    for sample, flowcells in samples.iteritems():
        for fc, files in flowcells.iteritems():
            self.log.info("Processing sample {} and flowcell {}".format(sample, fc))

            # Transfer files
            self.log.debug("Transferring {} fastq files".format(len(files['src'])))
            self._transfer_files(sources=files['src'], targets=files['dst'])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                passed = False
            else:
                # Calculate md5sums on the source side and write them on the destination
                md5 = []
                for s, d in zip(files['src'], files['dst']):
                    m = md5sum(s)
                    mfile = "{}.md5".format(d)
                    md5.append([m, mfile, s])
                    self.log.debug("md5sum for source file {}: {}".format(s, m))

                # Write the md5sum to a file at the destination and verify the transfer
                for m, mfile, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
                    self.log.debug("Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                    if m != dm:
                        self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                        self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

            # Touch the flag to trigger the uppmax inbox permission fix
            self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule", self.pargs.uppmax_project))

            # Log the transfer to statusdb if verification passed
            if passed:
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            os.path.splitext(os.path.basename(srcpath))[0]: {
                                'md5': m,
                                'path': os.path.splitext(mfile)[0],
                                'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                'source_location': srcpath
                            }
                            for m, mfile, srcpath in md5
                        },
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(proj_base_dir, sample, fc,
                                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                self.log.debug("Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                self.log.debug("Saving delivery in StatusDB document for flowcell {}".format(fc))
                if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                    fc_con = x_con
                else:
                    fc_con = f_con
                fc_obj = fc_con.get_entry(fc)
                self.log.info("Logging delivery to StatusDB document {}".format(fc_obj.get('_id')))
                fc_raw_data = fc_obj.get('raw_data_delivery', {})
                fc_raw_data.update(data['raw_data_delivery'])
                fc_obj['raw_data_delivery'] = fc_raw_data
                self._save(fc_con, fc_obj)
                self.log.debug(jsonstr)
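# A minimal sketch of the transfer-verification round trip performed in
# raw_data above: hash the source, write "<md5>  <filename>" beside the
# destination, re-hash the destination and compare. The md5sum helper below is
# a local stand-in for the scilifelab utility of the same name, not its real
# implementation.

import hashlib
import os

def md5sum(path, blocksize=1024 * 1024):
    """Return the hex md5 digest of a file, read in blocks."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for block in iter(lambda: fh.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()

def verify_transfer(src, dst):
    """Write an md5 file beside dst and return True if src and dst match."""
    m = md5sum(src)
    with open("{}.md5".format(dst), "w") as fh:
        fh.write("{} {}".format(m, os.path.basename(dst)))
    return m == md5sum(dst)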
def test_2_make_project_note(self):
    """Make a project note subset by flowcell and project"""
    s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    param = parameters
    project = p_con.get_entry(self.examples["project"])
    if not project:
        print "No project named {}".format(self.examples["project"])
        return
    ordered_amount = p_con.get_ordered_amount(self.examples["project"])

    ## Start collecting the data
    sample_table = []
    sample_list = project['samples']
    param.update({key: project.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    samples = p_con.map_name_to_srm(self.examples["project"], check_consistency=True, use_bc_map=True)
    all_passed = True
    for k, v in samples.items():
        if k == "Unexpected":
            continue
        project_sample = sample_list[k]
        vals = {x: project_sample.get(prjs_to_table[x], None) for x in prjs_to_table.keys()}
        vals['MOrdered'] = ordered_amount
        vals['BarcodeSeq'] = s_con.get_entry(v.keys()[0], "sequence")

        ## Set status
        vals['Status'] = set_status(vals) if vals['Status'] is None else vals['Status']
        vals.update({k: "N/A" for k in vals.keys() if vals[k] is None})
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[k] for k in table_keys])
    if all_passed:
        param["finished"] = 'Project finished.'
    sample_table.sort()
    sample_table = list(sample_table for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}.pdf".format(self.examples["project"]), headers, paragraphs, **param)
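# The sort-then-groupby idiom used above (and again in project_status_note
# below) deduplicates the sample table rows. A minimal sketch with made-up
# rows for illustration:

import itertools

rows = [["P001_101", "A", 1], ["P001_101", "A", 1], ["P001_102", "B", 2]]
rows.sort()
# itertools.groupby collapses adjacent equal rows, so sorting first makes the
# deduplication global; keeping only the group key drops the duplicates.
rows = [key for key, _group in itertools.groupby(rows)]
print(rows)  # -> [['P001_101', 'A', 1], ['P001_102', 'B', 2]]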
def storage_cleanup(self):
    storage_conf = self.app.config.get_section_dict('storage')
    db_info = self.app.config.get_section_dict('db')
    f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                          password=db_info.get('password'),
                                          url=db_info.get('url'))
    servers = [server for server in storage_conf.keys()]
    server = platform.node().split('.')[0].lower()
    if server in servers:
        self.app.log.info("Performing cleanup on production server \"{}\"...".format(server))
        dirs = [d.lstrip() for d in storage_conf.get(server).split(',')]

        # Collect old runs (> 30 days in nosync folder) to remove
        old_runs = []
        for d in dirs:
            nosync_dir = os.path.join(d, 'nosync')
            for fc in glob.iglob(os.path.join(nosync_dir, '1*')):
                if os.path.isdir(fc):
                    fc_name = os.path.basename(fc)
                    # Check that there is no check file indicating not to remove the run
                    if not os.path.exists(os.path.join(fc, 'no_remove.txt')):
                        stats = os.stat(os.path.join(fc, 'RTAComplete.txt'))
                        mod_time = datetime.now() - datetime.fromtimestamp(stats.st_mtime)
                        if mod_time.days >= 30:
                            old_runs.append(fc)
                    else:
                        self.app.log.warn("no_remove.txt file found in {}, skipping run".format(fc_name))

        # NAS servers
        if 'nas' in server:
            # Collect newly finished runs
            fc_list = []
            for d in dirs:
                for fc in glob.glob(os.path.join(d, '1*')):
                    if os.path.exists(os.path.join(fc, 'RTAComplete.txt')):
                        fc_list.append(fc)

            # Move to nosync
            retries = 5
            for fc in fc_list:
                fc_name = os.path.basename(fc)
                while retries:
                    if 'Finished' in last_lines(storage_conf.get('lsyncd_log'), 1)[0]:
                        break
                    retries -= 1
                    time.sleep(3)
                if retries:
                    self.app.log.info("lsyncd process seems to be up to speed, and run {} "
                                      "is finished, moving it to nosync".format(fc_name))
                    shutil.move(fc, os.path.join(os.path.dirname(fc), 'nosync'))
                    # Touch the RTAComplete.txt file so that the modification date is
                    # the date when the run was moved to nosync
                    try:
                        open(os.path.join(os.path.dirname(fc), 'nosync',
                                          os.path.basename(fc), 'RTAComplete.txt'), 'w').close()
                    except IOError:
                        self.app.log.warn("No RTAComplete.txt file was found for run {}."
                                          " Please check".format(fc_name))
                    fc_db_id = f_conn.id_view.get(fc_name)
                    if fc_db_id:
                        f_conn.set_storage_status(fc_db_id, 'NAS_nosync')
                    else:
                        self.app.log.warn("Flowcell {} not found in the database, not changing status.".format(fc_name))
                else:
                    self.app.log.warn("lsyncd process doesn't seem to be finished. "
                                      "Skipping run {}".format(os.path.basename(fc)))

            # Remove old runs
            for fc in old_runs:
                fc_name = os.path.basename(fc)
                # Check that the run has been archived in swestore before removing it permanently
                if fc_name in f_conn.get_storage_status('swestore_archived').keys():
                    self.app.log.info("Run {} has been in nosync for more than 30 days "
                                      "and is archived in swestore. Permanently removing it from the NAS".format(fc_name))
                    shutil.rmtree(fc)
                else:
                    self.app.log.warn("Run {} has been in nosync for more than 30 "
                                      "days, but has not yet been archived in swestore. "
                                      "Not removing, please check it".format(fc_name))

        # Processing servers (b5)
        else:
            # Collect finished runs
            fc_list = []
            for d in dirs:
                for fc in glob.glob(os.path.join(d, '1*')):
                    if os.path.exists(os.path.join(fc, 'second_read_processing_completed.txt')):
                        fc_list.append(fc)

            # Move to nosync
            for fc in fc_list:
                fc_name = os.path.basename(fc)
                self.app.log.info("Moving run {} to nosync".format(fc_name))
                shutil.move(fc, os.path.join(os.path.dirname(fc), 'nosync'))

            # Remove old runs
            for fc in old_runs:
                fc_name = os.path.basename(fc)
                self.app.log.info("Run {} has been in nosync for more than 30 "
                                  "days, permanently removing it from {}".format(fc_name, server))
                shutil.rmtree(fc)
    else:
        self.app.log.warn("You're running the cleanup functionality on {}, but this "
                          "server doesn't seem to be in your pm.conf file. Are you on the correct server?".format(server))
def project_status_note(project_name=None,
                        username=None,
                        password=None,
                        url=None,
                        use_ps_map=True,
                        use_bc_map=False,
                        check_consistency=False,
                        ordered_million_reads=None,
                        uppnex_id=None,
                        customer_reference=None,
                        exclude_sample_ids={},
                        project_alias=None,
                        sample_aliases={},
                        projectdb="projects",
                        samplesdb="samples",
                        flowcelldb="flowcells",
                        include_all_samples=False,
                        **kw):
    """Make a project status note. Used keywords:

    :param project_name: project name
    :param username: db username
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report
    """
    # Parameters
    parameters = {
        "project_name": project_name,
        "finished": "Not finished, or cannot yet assess if finished.",
    }
    # Mapping from project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # Mapping from project sample to table columns
    table_keys = ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status']

    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]]: {'sample': sample_aliases[s["barcode_name"]], 'id': s["_id"]}}
                samples.update(s_d)
            else:
                s_d = {s["name"]: {'sample': s["name"], 'id': s["_id"], 'barcode_name': s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases.
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names.
    sample_dict = prj_summary['samples']
    param.update({key: prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get('customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options were passed at the command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            elif k not in last_library_preps_srm:
                LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(k, v["id"], last_library_preps[v['sample']].values()[0], v['sample']))
                continue
        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                if vals['Status'] == "N/A" or vals['Status'] == "NP":
                    all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])

    if all_passed:
        param["finished"] = 'Project finished.'
    sample_table.sort()
    sample_table = list(sample_table for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers, paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name), sample_table=sample_table, report="project_report", **param)
    param.update({k: "N/A" for k in param.keys() if param[k] is None or param[k] == ""})
    output_data["debug"].write(json.dumps({'param': param, 'table': sample_table}))
    return output_data
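# Options such as ordered_million_reads and exclude_sample_ids arrive as
# strings from the command line and may encode Python literals (a number, or a
# dict keyed by sample name). A minimal sketch of what a helper like
# _literal_eval_option presumably does; the real implementation may differ,
# and the function name below is a hypothetical stand-in.

import ast

def _literal_eval_option_sketch(option, default=None):
    """Parse a CLI option that may be a Python literal; fall back to default."""
    if option is None:
        return default
    try:
        return ast.literal_eval(option)
    except (ValueError, SyntaxError):
        # Not a literal; return the raw string unchanged.
        return option

print(_literal_eval_option_sketch("{'P001_101_index3': 0.1}"))  # -> a dict
print(_literal_eval_option_sketch("0.2"))                       # -> 0.2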
class TestQCUpload(PmFullTest):
    def setUp(self):
        self.app = self.make_app(
            argv=["qc", "upload-qc", flowcells[0], "--mtime", "10000"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P001_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server"""
        self.app = self.make_app(
            argv=["qc", "upload-qc", flowcells[1], "--mtime", "100"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(
            argv=["qc", "update", "--sample_prj", projects[2], "--project_id", "P003", "--debug", "--force"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {"P001_101_index3": "P001_101_index3", "P001_102_index6": "P001_102"}
        self.app = self.make_app(
            argv=["qc", "update", "--sample_prj", projects[0], "--names", "{}".format(sample_map), "--debug", "--force"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count
    :param project_alias: project alias name
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param phix: phix error rate
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }
    # Parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # Key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count"
    }
    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")".format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count the number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument['default'])

        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Calculation of average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options were passed at the command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if the sample run metrics name is present in the project database; if so, verify that the database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names,")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP' to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs': {s["name"]: s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
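# The note filename logic in sample_status_note above keys off how many times
# a barcode occurs in the run list: a barcode sequenced on several lanes gets
# a lane-qualified PDF name. A minimal sketch with made-up sample runs:

from collections import Counter

sample_run_list = [
    {"barcode_name": "P001_101", "date": "120924", "flowcell": "AC003CCCXX", "lane": "1"},
    {"barcode_name": "P001_101", "date": "120924", "flowcell": "AC003CCCXX", "lane": "2"},
    {"barcode_name": "P001_102", "date": "120924", "flowcell": "AC003CCCXX", "lane": "1"},
]
sample_count = Counter(s["barcode_name"] for s in sample_run_list)
for s in sample_run_list:
    if sample_count[s["barcode_name"]] > 1:
        # Run on several lanes: include the lane in the report name
        outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
    else:
        outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
    print(outfile)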
def swestore(self):
    """This function is the entry point for tasks having to do with packaging
    and sending runs to swestore
    """
    db_info = self.app.config.get_section_dict('db')
    f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                          password=db_info.get('password'),
                                          url=db_info.get('url'))
    swestore_paths = set(self.config.get('archive', 'swestore_staging').split(','))
    run = self.pargs.tarball if self.pargs.tarball else self.pargs.flowcell
    swestore_dir = get_path_swestore_staging(run, swestore_paths)

    # Create a tarball out of the run folder
    if self.pargs.package_run:
        # We require a flowcell argument
        if not self._check_pargs(["flowcell"]):
            return
        self.pargs.tarball = package_run(self, swestore_dir, **vars(self.pargs))
        if not self.pargs.tarball:
            self.log.error("No tarball was created, exiting")
            return
        if self.pargs.clean:
            rm_run(self, self.config.get('archive', 'root'), flowcell=self.pargs.flowcell)

    if self.pargs.clean_from_staging:
        # Check that the run has been archived on the NAS before removing it, otherwise it will keep syncing
        if self.pargs.flowcell in f_conn.get_storage_status('NAS_nosync').keys():
            rm_run(self, swestore_dir, flowcell=self.pargs.flowcell)
        else:
            self.log.warn("Run storage status is not NAS_nosync, not removing run from swestore_stage!")

    if not self.pargs.tarball:
        self.log.error("Required argument --tarball was not specified")
        return

    if not os.path.exists(os.path.join(swestore_dir, self.pargs.tarball)):
        self.log.error("Tarball {} does not exist".format(self.pargs.tarball))
        return

    # Upload a tarball to a remote host
    if self.pargs.remote_upload:
        result = upload_tarball(self, **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
        if not result:
            return
        if self.pargs.clean:
            rm_tarball(self, tarball=self.pargs.tarball)

    # Send the tarball to Swestore using irods
    if self.pargs.send_to_swestore:
        result = send_to_swestore(self, **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
        if not result:
            # If archiving failed, we need to give a non-zero exit code in order for a remote instance to detect the failure
            sys.exit(1)
        if self.pargs.clean:
            rm_tarball(self, tarball=self.pargs.tarball)
        # Set the run as archived in StatusDB
        fc_id = self.pargs.flowcell if self.pargs.flowcell else self.pargs.tarball.split('.')[0]
        fc_db_id = f_conn.id_view.get(fc_id)
        if fc_db_id:
            f_conn.set_storage_status(fc_db_id, 'swestore_archived')
        else:
            self.log.warn("Flowcell {} not found in the database, not changing status.".format(fc_id))

    # Log to statusdb
    if self.pargs.log_to_db:
        # implement this
        raise NotImplementedError("logging to db functionality not implemented")
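# The flowcell id logged to StatusDB in swestore above falls back to the
# tarball name with its extension stripped. A minimal sketch of that
# fallback; the names below are illustrative.

def fc_id_from_args(flowcell=None, tarball=None):
    """Prefer an explicit flowcell id; otherwise derive it from the tarball."""
    if flowcell:
        return flowcell
    # "120924_AC003CCCXX.tar.bz2".split('.')[0] -> "120924_AC003CCCXX"
    return tarball.split('.')[0]

print(fc_id_from_args(tarball="120924_AC003CCCXX.tar.bz2"))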
class TestQCUpload(PmFullTest):
    def setUp(self):
        """FIXME: All other tests depend on data being uploaded, so these are
        not real unit tests. The setup to TestQCUpload has to be run prior to
        other tests, else unexpected failures will occur."""
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
            extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
            extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P002_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server. Slightly circular testing here -
        I set up the module with qc update, so by definition the test must 'work'"""
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[1], '--mtime', '100'],
            extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(
            argv=['qc', 'update', '--sample_prj', projects[2], '--project_id', 'P003', '--debug', '--force'],
            extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {'P001_101_index3': 'P001_101_index3', 'P001_102_index6': 'P001_102'}
        self.app = self.make_app(
            argv=['qc', 'update', '--sample_prj', projects[0], '--names', "{}".format(sample_map), '--debug', '--force'],
            extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
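# The qc update test above passes the barcode-to-sample map as a stringified
# Python dict on the command line ("--names", "{}".format(sample_map)); on the
# receiving end such a value can be recovered with ast.literal_eval. A minimal
# sketch of that round trip, assuming the command parses the option this way:

import ast

sample_map = {"P001_101_index3": "P001_101_index3", "P001_102_index6": "P001_102"}
argv_value = "{}".format(sample_map)      # what the test puts on argv
recovered = ast.literal_eval(argv_value)  # what the command can parse back
assert recovered == sample_map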