def bpreport(self):
    """Generate a best-practice report for the project given on the command line.

    Looks for sample configuration files below the controller's project
    directory, optionally builds a sample name map from statusdb, calls
    ``best_practice_note`` and copies its captured streams to the
    application's output buffers.
    """
    if not self._check_pargs(["project"]):
        return
    # NOTE(review): vars() returns the namespace's own __dict__, so the
    # kw.update() call further down also modifies self.pargs -- confirm
    # that this aliasing is intended.
    kw = vars(self.pargs)
    meta = self.app.controller._meta
    basedir = os.path.abspath(os.path.join(meta.root_path, meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if len(flist) == 0:
        self.log.info("No samples/sample configuration files found")
        return

    sample_name_map = None
    if not self.pargs.no_statusdb:
        # A statusdb project name is required to look up customer names.
        if not self._check_pargs(["statusdb_project_name"]):
            return
        p_con = ProjectSummaryConnection(
            dbname=self.app.config.get("db", "projects"),
            **vars(self.app.pargs))
        s_con = SampleRunMetricsConnection(
            dbname=self.app.config.get("db", "samples"),
            **vars(self.app.pargs))
        sample_name_map = get_scilife_to_customer_name(
            self.pargs.statusdb_project_name, p_con, s_con)

    kw.update(project_name=self.pargs.project,
              flist=flist,
              basedir=basedir,
              sample_name_map=sample_name_map)
    out_data = best_practice_note(**kw)
    self.log.info(
        "Wrote report to directory {}; use Makefile to generate pdf report"
        .format(basedir))
    # Forward everything the report function captured, stream by stream.
    for stream in ('stdout', 'stderr', 'debug'):
        self.app._output_data[stream].write(out_data[stream].getvalue())
def setUp(self):
    """FIXME: All other tests depend on data being uploaded, so
    these are not real unit tests. The setup to TestQCUpload has to
    be run prior to other tests, else unexpected failures will
    occur."""
    # Run 'qc upload-qc' once for each of the first two test flowcells.
    for idx in (0, 1):
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[idx], '--mtime', '10000'],
            extensions=[
                'scilifelab.pm.ext.ext_qc',
                'scilifelab.pm.ext.ext_couchdb',
            ])
        self._run_app()

    # Connections to the three test databases used by the test methods.
    credentials = {"username": "******", "password": "******"}
    self.s_con = SampleRunMetricsConnection(dbname="samples-test",
                                            **credentials)
    self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                          **credentials)
    self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                               **credentials)
def setUpClass(cls):
    """Temporarily add new sample run documents for use in exclusion tests.

    Each new document is a deep copy of an existing entry in the
    ``samples-test`` database with its ``_id`` removed and a few fields
    overridden before it is saved.
    """
    s_con = SampleRunMetricsConnection(dbname="samples-test",
                                       username="******",
                                       password="******")

    def _save_variant(template, overrides):
        # Copy the template, drop its id, apply the overrides in order, save.
        fields = copy.deepcopy(template)
        del fields["_id"]
        doc = SampleRunMetricsDocument(**fields)
        for key, value in overrides:
            doc[key] = value
        s_con.save(doc)

    template = s_con.get_entry("1_121015_BB002BBBXX_TGACCA")
    _save_variant(template, [
        ("sequence", "AGTTGA"),
        ("name", "1_121015_BB002BBBXX_AGTTGA"),
    ])
    _save_variant(template, [
        ("sample_prj", "j-doe_00_01"),
        ("sequence", "CGAACG"),
        ("name", "1_121015_BB002BBBXX_CGAACG"),
    ])

    template = s_con.get_entry("3_120924_AC003CCCXX_ACAGTG")
    _save_variant(template, [
        ("sample_prj", "j-doe_00_02"),
        ("sequence", "GGAAGG"),
        ("name", "3_120924_AC003CCCXX_GGAAGG"),
    ])
def test_2_make_note(self):
    """Make a note subset by example flowcell and project

    For every sample run mapped to the example project and flowcell, the
    note parameters are collected from the sample, flowcell and project
    databases and passed to ``make_note``, which is given a file name of
    the form ``<barcode_name>.pdf``.
    """
    s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()
    # The result of this query is superseded by the mapping fetched below;
    # the call itself is kept so that get_samples is still exercised.
    s_con.get_samples(self.examples["flowcell"], self.examples["project"])
    project = p_con.get_entry(self.examples["project"])
    samples = p_con.map_srm_to_name(self.examples["project"],
                                    fc_id=self.examples["flowcell"],
                                    use_bc_map=True)
    for k, v in samples.items():
        # Start every sample from a fresh copy of the defaults. Binding
        # s_param directly to `parameters` would let values (and the "N/A"
        # substitutions below) leak from one sample into the next and into
        # the shared dictionary itself.
        s_param = dict(parameters)
        s = s_con.get_entry(k)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s["date"], s["flowcell"])
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
        # Read count is reported in millions, rounded to one decimal.
        s_param['rounded_read_count'] = round(float(s_param['rounded_read_count']) / 1e6, 1) if s_param['rounded_read_count'] else None
        # NOTE(review): `project` is dereferenced here before the
        # `if project:` guard below, so a missing project fails on this line.
        s_param['customer_name'] = project['samples'][v["sample"]].get('customer_name', None)
        if project:
            s_param['ordered_amount'] = p_con.get_ordered_amount(self.examples["project"])
            s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
            s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
        s_param['success'] = sequencing_success(s_param, cutoffs)
        # Replace unset values with a printable placeholder.
        s_param.update({key: "N/A" for key in s_param.keys() if s_param[key] is None})
        make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
def test_3_get_sample_ids(self):
    """Test getting sample ids given flowcell and sample_prj"""
    sample_con = SampleRunMetricsConnection(username=self.user,
                                            password=self.pw,
                                            url=self.url)
    # First query by flowcell only, then narrow down by project.
    all_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
    print("Number of samples before subsetting: " + str(len(all_ids)))
    subset_ids = sample_con.get_sample_ids(
        fc_id=self.examples["flowcell"],
        sample_prj=self.examples["project"])
    print("Number of samples after subsetting: " + str(len(subset_ids)))
def test_get_flowcell(self):
    """Test getting a flowcell for a given sample"""
    connection = SampleRunMetricsConnection(dbname="samples-test",
                                            username=self.user,
                                            password=self.pw,
                                            url=self.url)
    # Fetch only the "flowcell" field of the example sample entry.
    flowcell = connection.get_entry(self.examples["sample"], "flowcell")
    self.assertEqual(str(flowcell), self.examples["flowcell"])
def test_connection(self):
    """Test database connection"""
    connection = SampleRunMetricsConnection(dbname="samples-test",
                                            username=self.user,
                                            password=self.pw,
                                            url=self.url)
    # The connection is expected to expose its address as http://<url>:5984.
    actual_url = connection.url_string
    self.assertEqual(actual_url, "http://{}:5984".format(self.url))
def upload_qc(self):
    """Collect QC objects for a flowcell directory and save them to the databases.

    Requires the ``flowcell`` argument. The flowcell date decides whether
    the pre-casava or the casava collector is used. Flowcell documents are
    saved through the flowcell connection; sample documents are first
    given a ``project_sample_name`` when a matching project sample is found.
    """
    if not self._check_pargs(['flowcell']):
        return
    url = self.pargs.url or self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn(
            "Path '{}' does not conform to bcbio flowcell directory format; aborting"
            .format(self.pargs.flowcell))
        return

    flowcell_dir = os.path.abspath(self.pargs.flowcell)
    flowcell_id = fc_id(self.pargs.flowcell)
    # These two paths are computed but not used further in this method.
    runinfo_csv = os.path.join(flowcell_dir, "{}.csv".format(flowcell_id))
    runinfo_yaml = os.path.join(flowcell_dir, "run_info.yaml")

    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    # Flowcells dated before 120815 are treated as pre-casava.
    if int(fc_date) < 120815:
        self.log.info(
            "Assuming pre-casava based file structure for {}".format(
                flowcell_id))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(
            flowcell_id))
        qc_objects = self._collect_casava_qc()

    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(flowcell_id))
        return
    self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

    s_con = SampleRunMetricsConnection(
        dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    fc_con = FlowcellRunMetricsConnection(
        dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    p_con = ProjectSummaryConnection(
        dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))

    for qc_obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(qc_obj), qc_obj["_id"]))
        # NOTE(review): in both dry() calls below, save() is evaluated
        # before dry() is invoked, so the object is saved regardless of
        # what dry() does with its second argument -- confirm intent.
        if isinstance(qc_obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(qc_obj)), fc_con.save(qc_obj))
        if isinstance(qc_obj, SampleRunMetricsDocument):
            matched = p_con.get_project_sample(
                qc_obj.get("sample_prj", None),
                qc_obj.get("barcode_name", None),
                self.pargs.extensive_matching)
            if matched:
                qc_obj["project_sample_name"] = matched['sample_name']
            dry("Saving object {}".format(repr(qc_obj)), s_con.save(qc_obj))
def test_5_name_view(self):
    """Test unique ids in name view"""
    s_con = SampleRunMetricsConnection(username=self.user,
                                       password=self.pw,
                                       url=self.url)
    raw_view = s_con.db.view("names/name", reduce=False)
    # Print a sample of the keys, then the total, unique and raw view counts.
    first_keys = s_con.name_view.keys()[0:10]
    print(first_keys)
    total_keys = len(s_con.name_view.keys())
    print(total_keys)
    unique_keys = len(list(set(s_con.name_view.keys())))
    print(unique_keys)
    print(len(raw_view))
def test_4_srm_sample_map(self):
    """Make sample map from sample run metrics"""
    s_con = SampleRunMetricsConnection(username=self.user,
                                       password=self.pw,
                                       url=self.url)
    project_samples = s_con.get_project_samples(self.examples["project"])
    # Map sample run name -> document id; later entries win on duplicates.
    sample_names = {}
    for entry in project_samples:
        sample_names[entry["name"]] = entry["_id"]
    name_keys = sample_names.keys()
    print(len(name_keys))
    print(len(list(set(sample_names.keys()))))
    print(sample_names)
def test_get_samples_wrong_info(self):
    """Test getting samples when either flowcell or project id information is wrong"""
    connection = SampleRunMetricsConnection(dbname="samples-test",
                                            username=self.user,
                                            password=self.pw,
                                            url=self.url)
    # A project name that does not exist must yield no samples.
    found = connection.get_samples(sample_prj="bogusproject",
                                   fc_id=self.examples["flowcell"])
    n_found = len(found)
    LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(n_found))
    self.assertEqual(len(found), 0)
def update(self):
    """Update ``project_id`` and/or ``project_sample_name`` on sample runs.

    Operates on all sample runs whose ``sample_prj`` equals the
    ``sample_prj`` argument. When ``names`` is given (a JSON file path or
    a Python literal mapping barcode name to project sample name) that
    mapping is applied; otherwise extensive matching against the project
    database is attempted.
    """
    if not self._check_pargs(["sample_prj"]):
        return
    url = self.pargs.url or self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return

    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

    if self.pargs.project_id:
        self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
        for sample in samples:
            # Ask before overwriting an existing value.
            if sample.get("project_id", None) is not None:
                question = "'project_id':{} for sample {}; are you sure you want to overwrite?".format(sample["project_id"], sample["name"])
                if not query_yes_no(question, force=self.pargs.force):
                    continue
            sample["project_id"] = self.pargs.project_id
            s_con.save(sample)

    if self.pargs.names:
        self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
        # The names argument is either a path to a JSON file or a literal.
        if os.path.exists(self.pargs.names):
            with open(self.pargs.names) as fh:
                names_d = json.load(fh)
        else:
            names_d = ast.literal_eval(self.pargs.names)

        # Group the sample runs by barcode name, keeping their order.
        groups = {}
        for sample in samples:
            groups.setdefault(sample["barcode_name"], []).append(sample)

        for barcode_name in names_d:
            sample_list = groups.get(barcode_name, None)
            if not sample_list:
                continue
            for sample in sample_list:
                if sample.get("project_sample_name", None) is not None:
                    question = "'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(sample["project_sample_name"], sample["name"])
                    if not query_yes_no(question, force=self.pargs.force):
                        continue
                sample["project_sample_name"] = names_d[barcode_name]
                s_con.save(sample)
    else:
        self.app.log.info("Trying to use extensive matching...")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        # A project alias, when given, replaces the sample_prj as lookup name.
        if self.pargs.project_alias:
            project_name = self.pargs.project_alias
        else:
            project_name = self.pargs.sample_prj
        for sample in samples:
            project_sample = p_con.get_project_sample(project_name, sample["barcode_name"], extensive_matching=True)
            if not project_sample:
                continue
            self.app.log.info("using mapping '{} : {}'...".format(sample["barcode_name"], project_sample["sample_name"]))
            sample["project_sample_name"] = project_sample["sample_name"]
            s_con.save(sample)
def tearDownClass(cls):
    """Delete three named sample run documents from the ``samples-test`` database."""
    s_con = SampleRunMetricsConnection(dbname="samples-test",
                                       username="******",
                                       password="******")
    temporary_names = (
        "1_121015_BB002BBBXX_AGTTGA",
        "1_121015_BB002BBBXX_CGAACG",
        "3_120924_AC003CCCXX_GGAAGG",
    )
    for sample_name in temporary_names:
        # Look up the entry by name, then delete the underlying document.
        entry = s_con.get_entry(sample_name)
        doc = s_con.db.get(entry["_id"])
        s_con.db.delete(doc)
def test_dbcon(self):
    """Test database connection and that we get expected values."""
    credentials = {"username": "******", "password": "******"}

    # Sample run metrics
    s_con = SampleRunMetricsConnection(dbname="samples-test", **credentials)
    samples = [s_con.get_entry(name) for name in s_con.name_view]
    samples_d = {entry["name"]: entry for entry in samples}
    sample_checks = [
        ("1_120924_AC003CCCXX_TGACCA", "date", "120924"),
        ("1_121015_BB002BBBXX_TGACCA", "flowcell", "BB002BBBXX"),
        ("2_120924_AC003CCCXX_ACAGTG", "entity_type", "sample_run_metrics"),
        ("3_120924_AC003CCCXX_ACAGTG", "lane", "3"),
        ("4_120924_AC003CCCXX_CGTTAA", "sequence", "CGTTAA"),
        ("2_121015_BB002BBBXX_TGACCA", "project_id", "P002"),
    ]
    for name, field, expected in sample_checks:
        self.assertEqual(samples_d[name][field], expected)

    # Flowcell run metrics
    fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", **credentials)
    flowcells = [fc_con.get_entry(name) for name in fc_con.name_view]
    flowcells_d = {entry["name"]: entry for entry in flowcells}
    flowcell_checks = [
        ("120924_AC003CCCXX", "name", "120924_AC003CCCXX"),
        ("121015_BB002BBBXX", "name", "121015_BB002BBBXX"),
        ("120924_AC003CCCXX", "entity_type", "flowcell_run_metrics"),
    ]
    for name, field, expected in flowcell_checks:
        self.assertEqual(flowcells_d[name][field], expected)

    # Project summaries
    p_con = ProjectSummaryConnection(dbname="projects-test", **credentials)
    projects = [p_con.get_entry(name) for name in p_con.name_view]
    projects_d = {entry["project_name"]: entry for entry in projects}
    self.assertEqual(
        projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
    self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
    expected_samples = set(["P001_101_index3", "P001_102", "P001_103"])
    self.assertEqual(set(projects_d["J.Doe_00_01"]["samples"].keys()),
                     expected_samples)
    self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"],
                     "GnuGenome")
    self.assertEqual(
        projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
    self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(),
                     ["3_index6"])
    library_prep = projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"]
    self.assertIn("A", library_prep)
def test_get_sample_ids(self):
    """Test getting sample ids given flowcell and sample_prj"""
    connection = SampleRunMetricsConnection(dbname="samples-test",
                                            username=self.user,
                                            password=self.pw,
                                            url=self.url)
    # All sample ids on the example flowcell.
    ids_on_flowcell = connection.get_sample_ids(fc_id=self.examples["flowcell"])
    n_before = len(ids_on_flowcell)
    LOG.info("Number of samples before subsetting: " + str(n_before))
    self.assertEqual(len(ids_on_flowcell), 5)

    # Same flowcell, restricted to the example project.
    ids_in_project = connection.get_sample_ids(
        fc_id=self.examples["flowcell"],
        sample_prj=self.examples["project"])
    n_after = len(ids_in_project)
    LOG.info("Number of samples after subsetting: " + str(n_after))
    self.assertEqual(len(ids_in_project), 2)
def test_get_project_sample_ids(self):
    """Test getting project sample ids"""
    connection = SampleRunMetricsConnection(dbname="samples-test",
                                            username=self.user,
                                            password=self.pw,
                                            url=self.url)
    project_ids = connection.get_sample_ids(sample_prj=self.examples["project"])
    # Resolve each document id to the sample run name stored in the document.
    found_names = []
    for doc_id in project_ids:
        found_names.append(connection.db.get(doc_id)["name"])
    expected_names = set([
        '1_120924_AC003CCCXX_TGACCA',
        '2_120924_AC003CCCXX_ACAGTG',
        '1_121015_BB002BBBXX_TGACCA',
    ])
    self.assertEqual(set(found_names), expected_names)
def fastq_screen(project_name=None, flowcell=None, username=None,
                 password=None, url=None, dbname="samples", **kw):
    """Perform application specific qc on a project.

    For every sample run matching the flowcell and project, the barcode
    name is written to the ``stdout`` buffer, followed by a table of the
    sample's ``fastq_scr`` data when present (one header row, then one row
    per top-level key).

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param username: database username
    :param password: database password
    :param url: database url
    :param dbname: samples database name

    :returns: dictionary with ``stdout`` and ``stderr`` StringIO buffers
    """
    # The original format string had a single placeholder for two
    # arguments, so the flowcell was silently dropped from the message.
    LOG.debug("Running fastq screen summary on project {}, flowcell {}".format(
        project_name, flowcell))
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    s_con = SampleRunMetricsConnection(dbname=dbname, username=username,
                                       password=password, url=url)
    samples = s_con.get_samples(fc_id=flowcell, sample_prj=project_name)
    for s in samples:
        LOG.debug(
            "Checking fastq_screen data for sample {}, id {}, project {}".
            format(s.get("name", None), s.get("_id", None),
                   s.get("sample_prj", None)))
        fqscreen_data = s.get("fastq_scr", {})
        output_data["stdout"].write(s["barcode_name"] + "\n")
        if not fqscreen_data:
            continue
        # Column names are taken from the first entry's keys.
        header = [list(v.keys()) for v in fqscreen_data.values()]
        output_data["stdout"].write(
            "\t\t" + "".join("{:>27}".format(x) for x in header[0]) + "\n")
        for k, v in fqscreen_data.items():
            row = "{:>12}\t{}\n".format(
                k, "".join(["{:>27}".format(x) for x in v.values()]))
            output_data["stdout"].write(row)
    return output_data
def test_2_map_srmseqid_to_srmid(self):
    """Map srm seq id names to srm ids"""
    sample_con = SampleRunMetricsConnection(username=self.user,
                                            password=self.pw,
                                            url=self.url)
    # Maps "<lane>_<date>_<flowcell>_<sequence>" -> list of document ids.
    sample_map = {}
    for doc_id in sample_con.db:
        doc = sample_con.db.get(doc_id)
        id_parts = (doc.get("lane"), doc.get("date"), doc.get("flowcell"),
                    doc.get("sequence", "NoIndex"))
        seq_id = "{}_{}_{}_{}".format(*id_parts)
        if seq_id in sample_map:
            print("WARNING: duplicate for {}".format(seq_id))
            sample_map[seq_id].append(doc_id)
        else:
            sample_map[seq_id] = [doc_id]
    # Report every sequence id that maps to more than one document.
    for seq_id, doc_ids in sample_map.items():
        if len(doc_ids) > 1:
            print("%s %s" % (seq_id, doc_ids))
def test_get_samples(self):
    """Test getting samples given flowcell and sample_prj."""
    connection = SampleRunMetricsConnection(dbname="samples-test",
                                            username=self.user,
                                            password=self.pw,
                                            url=self.url)
    flowcell = self.examples["flowcell"]
    project = self.examples["project"]
    # (log prefix, query keywords, expected number of samples)
    cases = [
        ("Selecting on flowcell: ",
         {"fc_id": flowcell}, 5),
        ("Selecting on flowcell, subsetting on project: ",
         {"fc_id": flowcell, "sample_prj": project}, 2),
        ("Selecting on project: ",
         {"sample_prj": project}, 3),
        ("Selecting on project, subsetting on flowcell: ",
         {"sample_prj": project, "fc_id": flowcell}, 2),
    ]
    for prefix, query, expected in cases:
        samples = connection.get_samples(**query)
        LOG.info(prefix + str(len(samples)))
        self.assertEqual(len(samples), expected)
def _project_status_note_table(project_name=None, username=None, password=None, url=None,
                               use_ps_map=True, use_bc_map=False, check_consistency=False,
                               ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                               exclude_sample_ids={}, project_alias=None, sample_aliases={},
                               projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                               include_all_samples=False, param=None, **kw):
    """Collect the sample table and parameters for a project status note.

    :param project_name: project name used for all database lookups
    :param username: database username
    :param password: database password
    :param url: database url
    :param ordered_million_reads: option passed through ``_literal_eval_option``
    :param uppnex_id: overrides the uppnex project id from the database
    :param customer_reference: overrides the customer reference from the database
    :param exclude_sample_ids: option passed through ``_literal_eval_option``
    :param project_alias: forwarded to ``_set_sample_run_list``
    :param sample_aliases: mapping from barcode name to sample name, used when
        a sample run has no project sample mapping
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcell database name
    :param include_all_samples: if true, skip the latest-library-prep filter
    :param param: parameter dictionary that is updated in place and returned;
        a new dictionary is created when omitted

    ``use_ps_map``, ``use_bc_map``, ``check_consistency`` and ``**kw`` are
    accepted but not referenced in this function.

    :returns: ``None`` if the project is not found, otherwise the tuple
        ``(output_data, sample_table, param)`` where ``sample_table`` is a
        sorted, de-duplicated list of rows preceded by a header row
    """
    # A mutable default would be shared between calls; since param is
    # updated in place below, values such as "ordered_amount" or "finished"
    # would otherwise leak from one project into the next.
    if param is None:
        param = {}

    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name": "scilife_name",
                       "customer_name": "customer_name",
                       "project_name": "project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered']

    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary']['all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]]: {'sample': sample_aliases[s["barcode_name"]], 'id': s["_id"]}}
                samples.update(s_d)
            else:
                # Unmapped sample runs are only reported, not added to samples.
                s_d = {s["name"]: {'sample': s["name"], 'id': s["_id"], 'barcode_name': s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key: prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details']['customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [srm for preps in last_library_preps.values() for srm in preps]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(
                        k, v["id"], ",".join(list(set(last_library_preps[v['sample']].values()))), v['sample']))
                    continue

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        sample_table.append([vals[key] for key in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                sample_table.append([vals[key] for key in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            sample_table.append([vals[key] for key in table_keys])
    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'
    # Sort, then collapse runs of identical rows so each row appears once.
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])
    return output_data, sample_table, param
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery

    Reads ``project_name`` and ``flowcell`` from the keyword arguments; all
    keyword arguments are also forwarded to the database connections.
    Writes one row per delivered file to
    ``<project_name>[_<flowcell>]_data_delivery.csv`` in the current
    directory and a Texttable rendering of the same rows to the ``stdout``
    buffer.

    :returns: dictionary with ``stdout``, ``stderr`` and ``debug`` StringIO buffers
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}

    project_name = kw.get('project_name', None)
    flowcell = kw.get('flowcell', None)
    # Only mention the flowcell when one was given. The conditional used to
    # sit inside the inner format() call, so " and flowcell " was always
    # emitted, even without a flowcell.
    LOG.debug("Generating data delivery note for project {}{}.".format(
        project_name, ' and flowcell {}'.format(flowcell) if flowcell else ''))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    customer_names = {sample_name: sample.get('customer_name', 'N/A')
                      for sample_name, sample in project_samples.items()}

    data = [['SciLifeLab ID', 'Submitted ID', 'Flowcell', 'Lane', 'Barcode',
             'Read', 'Path', 'MD5', 'Size (bytes)', 'Timestamp']]
    for sample in samples:
        sname = sample.get('project_sample_name', 'N/A')
        cname = customer_names.get(sname, 'N/A')
        fc = sample.get('flowcell', 'N/A')
        lane = sample.get('lane', 'N/A')
        barcode = sample.get('sequence', 'N/A')
        # Samples without delivery information get a single, mostly empty row.
        if 'raw_data_delivery' not in sample:
            data.append([sname, cname, '', '', '', '', '', '', '', ''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp', 'N/A')
        for read, file_info in delivery.get('files', {}).items():
            data.append([sname,
                         cname,
                         fc,
                         lane,
                         barcode,
                         read,
                         file_info.get('path', 'N/A'),
                         file_info.get('md5', 'N/A'),
                         file_info.get('size_in_bytes', 'N/A'),
                         tstamp])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(
        project_name, '_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile, "w") as outh:
        csvw = csv.writer(outh)
        csvw.writerows(data)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=[], projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, is_paired=True, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count option; when set it replaces the read count
        stored in the database
    :param project_alias: project alias name
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcell database name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end

    Two entries of ``**kw`` are read: ``instrument_config`` (path handed to
    ``_parse_instrument_config``) and ``force`` (handed to ``query_ok``).

    :returns: dictionary with ``stdout``, ``stderr`` and ``debug`` StringIO
        buffers. It is returned early, before any note is written, when the
        flowcell format is wrong, the project does not exist or no sample
        runs are found.
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    instrument = _parse_instrument_config(os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\\-]+\")".format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(
            s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except Exception:
            # Any lookup failure falls back on the default instrument entry.
            # A bare except would also swallow KeyboardInterrupt/SystemExit.
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])

        # Get run mode
        # The flowcell document is only fetched again when the flowcell changes.
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice", "") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"), runp.get("ApplicationVersion"), s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True

        # NOTE(review): `software_versions` is not defined in this function;
        # presumably a module-level name -- confirm it is in scope.
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)

        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
            s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount',
                                                p_con.get_ordered_amount(project_name,
                                                                         samples=p_con.get_entry(project_name, 'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(
                    s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            # `or {}` keeps the check working if the helper returned None
            # instead of an empty mapping.
            if s["name"] not in (project_sample_d or {}):
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)

            # Always normalize submitted id, since module textttable does not support unicode
            if isinstance(s_param['customer_name'], unicode):
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        # Samples run on several lanes get one lane-specific report per run.
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out,
                                           'sample_runs': {run["name"]: run["barcode_name"] for run in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    # `s` is the last sample run of the loop above; its date and flowcell
    # name the summary files.
    make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
def raw_data(self):
    """Deliver the raw fastq files of a project to its Uppnex delivery folder.

    Reads its options from ``self.pargs`` (``project``, ``flowcell``,
    ``group``, ``uppmax_project``, ``interactive``, ``rsync``, ``dry_run``).
    For every sample run selected for delivery the method

    1. computes the md5sum of each source file,
    2. transfers the files with ``self._transfer_files``,
    3. writes an ``.md5`` file next to each destination file and verifies
       the checksum of the transferred file (removing both on mismatch),
    4. touches the inbox-fix schedule flag for the Uppmax project, and
    5. if every file verified, writes a ``*_raw_data_delivery.json`` file
       next to the source files and saves the delivery record on the
       sample run document via ``self._save``.

    Returns ``None`` in all cases; the method returns early when required
    arguments are missing, when the Uppmax project cannot be determined or
    when the user declines one of the interactive prompts.

    Raises:
        KeyError: if ``self.pargs.group`` names a group that
            ``grp.getgrnam`` cannot find.
        AssertionError: if a database connection is falsy or one of the
            production / project / delivery directories does not exist.
        ValueError: if an id returned by ``get_file_copy_list`` does not
            match exactly one entry in the sample list.
    """
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier (keep the part after the last "_")
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    # NOTE(review): uid/gid are not used further down in this method; the
    # lookup is kept so that an unknown group name still fails early.
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # Look up the group given on the command line (previously this
        # referenced an undefined local name and raised NameError).
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(
            self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error(
                "Uppmax project was not specified and could not be fetched from project database"
            )
            return

    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(
        s_con.get_samples(fc_id=self.pargs.flowcell,
                          sample_prj=self.pargs.project),
        key=lambda k: (k.get('project_sample_name', 'NA'),
                       k.get('flowcell', 'NA'),
                       k.get('lane', 'NA')))

    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(
        self._meta.production_root
    ), "No such directory {}; check your production config".format(
        self._meta.production_root)
    assert os.path.exists(
        proj_base_dir), "No project {} in production path {}".format(
            self.pargs.project, self._meta.root_path)

    # Both delivery settings fall back to a default when the config lookup fails
    try:
        self._meta.uppnex_project_root = self.app.config.get(
            "deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn(
            "{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'

    try:
        self._meta.uppnex_delivery_dir = self.app.config.get(
            "deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn(
            "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                e))
        self._meta.uppnex_delivery_dir = 'INBOX'

    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(
        destination_root
    ), "Delivery destination folder {} does not exist".format(
        destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info(
                "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                .format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process

    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(
        proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn(
            "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
            .format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return

    self.log.info(
        "Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                      samples)

    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no(
                "Do you wish to continue delivering using rsync?",
                default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample run
    for sample_id, files in to_copy.items():
        # get the sample database object; exactly one match is required
        [sample] = [s for s in samples if s.get('_id') == sample_id]
        self.log.info("Processing sample {} and flowcell {}".format(
            sample.get("project_sample_name", "NA"),
            sample.get("flowcell", "NA")))

        # calculate md5sums on the source side and write it on the destination
        # each entry of `files` is indexed as (source, destination, read)
        md5 = []
        for f in files:
            m = md5sum(f[0])
            mfile = "{}.md5".format(f[1])
            md5.append([m, mfile, f[2], f[0]])
            self.log.debug("md5sum for source file {}: {}".format(f[0], m))

        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])

        # write the md5sum to a file at the destination and verify the transfer
        passed = True
        for m, mfile, read, srcpath in md5:
            # the destination file is the md5 file name minus its ".md5" suffix
            dstfile = os.path.splitext(mfile)[0]
            self.log.debug("Writing md5sum to file {}".format(mfile))
            self.app.cmd.write(
                mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
            self.log.debug("Verifying md5sum for file {}".format(dstfile))

            # if dry-run, make sure verification pass
            if self.pargs.dry_run:
                dm = m
            else:
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))

            if m != dm:
                self.log.warn(
                    "md5sum verification FAILED for {}. Source: {}, Target: {}"
                    .format(dstfile, m, dm))
                self.log.warn(
                    "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                    .format(dstfile))
                self.app.cmd.safe_unlink(dstfile)
                self.app.cmd.safe_unlink(mfile)
                passed = False
                continue

            # Modify the permissions to ug+rw
            for f in [dstfile, mfile]:
                self.app.cmd.chmod(
                    f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                    | stat.S_IWGRP)

        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(
            os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                         self.pargs.uppmax_project))

        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info(
                "Logging delivery to StatusDB document {}".format(sample_id))
            data = {
                'raw_data_delivery': {
                    'timestamp': utc_time(),
                    'files': {
                        'R{}'.format(read): {
                            'md5': m,
                            'path': os.path.splitext(mfile)[0],
                            'size_in_bytes':
                            self._getsize(os.path.splitext(mfile)[0]),
                            'source_location': srcpath
                        }
                        for m, mfile, read, srcpath in md5
                    },
                }
            }
            jsonstr = json.dumps(data)
            # the json record is written next to the first source file
            jsonfile = os.path.join(
                os.path.dirname(md5[0][3]),
                "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                    sample.get("date"), sample.get("flowcell"),
                    sample.get("project_sample_name"),
                    sample.get("sequence"), sample.get("lane")))
            self.log.debug(
                "Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug(
                "Saving delivery in StatusDB document {}".format(sample_id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
def test_2_make_project_note(self):
    """Make a project note subset by flowcell and project.

    Builds the sample table for the project named in
    ``self.examples["project"]`` and renders it to ``<project>.pdf`` with
    ``make_note``. Returns early (after printing a message) when the
    project is not found in the project database.
    """
    s_con = SampleRunMetricsConnection(username=self.user,
                                       password=self.pw,
                                       url=self.url)
    # NOTE(review): fc_con is not used below; the connection is still
    # opened so the behaviour of the test is unchanged.
    fc_con = FlowcellRunMetricsConnection(username=self.user,
                                          password=self.pw,
                                          url=self.url)
    p_con = ProjectSummaryConnection(username=self.user,
                                     password=self.pw,
                                     url=self.url)
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Work on a copy so that the updates below do not leak into the shared
    # `parameters` mapping that is defined outside this method.
    param = dict(parameters)

    project = p_con.get_entry(self.examples["project"])
    if not project:
        # Parenthesised single argument prints identically under Python 2 and 3.
        print("No project named {}".format(self.examples["project"]))
        return
    ordered_amount = p_con.get_ordered_amount(self.examples["project"])

    ## Start collecting the data
    sample_table = []
    sample_list = project['samples']
    param.update({
        key: project.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    samples = p_con.map_name_to_srm(self.examples["project"],
                                    check_consistency=True,
                                    use_bc_map=True)
    all_passed = True
    for k, v in samples.items():
        if k == "Unexpected":
            continue
        project_sample = sample_list[k]
        vals = {
            x: project_sample.get(prjs_to_table[x], None)
            for x in prjs_to_table.keys()
        }
        vals['MOrdered'] = ordered_amount
        # barcode sequence is looked up via the first key of the mapped value
        vals['BarcodeSeq'] = s_con.get_entry(list(v.keys())[0], "sequence")

        ## Set status
        if vals['Status'] is None:
            vals['Status'] = set_status(vals)
        vals.update(
            {name: "N/A" for name in vals.keys() if vals[name] is None})
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[name] for name in table_keys])

    if all_passed:
        param["finished"] = 'Project finished.'

    # Sort, then collapse consecutive identical rows
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}.pdf".format(self.examples["project"]), headers,
              paragraphs, **param)
def project_status_note(project_name=None,
                        username=None,
                        password=None,
                        url=None,
                        use_ps_map=True,
                        use_bc_map=False,
                        check_consistency=False,
                        ordered_million_reads=None,
                        uppnex_id=None,
                        customer_reference=None,
                        exclude_sample_ids={},
                        project_alias=None,
                        sample_aliases={},
                        projectdb="projects",
                        samplesdb="samples",
                        flowcelldb="flowcells",
                        include_all_samples=False,
                        **kw):
    """Make a project status note.

    Writes ``<project_name>_project_summary.pdf`` and
    ``<project_name>_project_summary.rst`` and returns the output buffers.

    Used keywords:

    :param project_name: project name
    :param username: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping (not referenced in this function body)
    :param use_bc_map: use project to barcode name mapping (not referenced in this function body)
    :param check_consistency: check consistency between mappings (not referenced in this function body)
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO
        buffers; the 'debug' buffer receives a json dump of the note
        parameters and sample table. The (empty) buffers are also returned
        when the project is not found.
    """
    # NOTE: the mutable defaults for exclude_sample_ids / sample_aliases are
    # kept for interface compatibility; both names are rebound below and the
    # default objects are never modified in this function.

    # parameters
    parameters = {
        "project_name": project_name,
        "finished": "Not finished, or cannot yet assess if finished.",
    }
    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    # NOTE(review): fc_con is not used below; the connection is still opened
    # so that the function's side effects are unchanged.
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        # Return the empty buffers rather than None, matching the early
        # exits of sample_status_note and application_qc.
        return output_data
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                # Unmapped sample runs are only reported, not added to the map
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get(
        "ordered_amount", p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get(
        'customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    # Flattened names of all sample runs that belong to a latest library prep
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(
                            k, v["id"],
                            list(last_library_preps[v['sample']].values())[0],
                            v['sample']))
                    continue

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        # distinct comprehension variable: a Python 2 list comprehension
        # would otherwise overwrite the loop variable k
        sample_table.append([vals[key] for key in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set(
        [x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            # one table row per sample run name; the mapped ids are not needed
            for srm_name in project_sample_d:
                barcode_seq = s_con.get_entry(srm_name, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                if vals['Status'] == "N/A" or vals['Status'] == "NP":
                    all_passed = False
                sample_table.append([vals[key] for key in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq,
                                            ordered_million_reads, param)
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[key] for key in table_keys])

    if all_passed:
        param["finished"] = 'Project finished.'

    # Sort, collapse consecutive identical rows, then prepend the header row
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, list(table_keys))
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers,
              paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name),
                   sample_table=sample_table,
                   report="project_report",
                   **param)
    # The "N/A" substitution happens after the notes are written, so it only
    # affects the debug dump below.
    param.update(
        {k: "N/A" for k in param.keys() if param[k] is None or param[k] == ""})
    output_data["debug"].write(
        json.dumps({
            'param': param,
            'table': sample_table
        }))
    return output_data
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       **kw):
    """Make a sample status note.

    Produces one note per sample run of the project on the flowcell plus a
    concatenated ``<project>_<date>_<flowcell>_sample_summary`` pdf and rst
    file, and returns the output buffers.

    Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode read counts overriding the database values
    :param project_alias: project alias name
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param phix: phix error rate
    :param kw: extra keywords; only ``force`` is read here (passed to ``query_ok``)

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO
        buffers. The buffers are returned early when the flowcell format is
        wrong, the project is missing or no sample runs are found.
    """
    # NOTE: the mutable default for project_alias is kept for interface
    # compatibility; it is only passed on to _set_sample_run_list here.

    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }
    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count"
    }
    LOG.debug("got parameters {}".format(parameters))

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument; fall back on the default entry when the lookup fails.
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate.
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument['default'])

        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        # A phix option given by the caller overrides the database value
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Calculation of average quality failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        # Samples run more than once on the flowcell get lane-specific file names
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {run["name"]: run["barcode_name"]
             for run in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    # `s` is still the last sample run of the loop above; its date and
    # flowcell name the summary files.
    make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name,
                                             s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes,
        "{}_{}_{}_sample_summary.pdf".format(project_name,
                                             s.get("date", None),
                                             s.get("flowcell", None)))
    return output_data
def application_qc(project_name=None,
                   flowcell=None,
                   application=None,
                   username=None,
                   password=None,
                   url=None,
                   sampledb="samples",
                   projectdb="projects",
                   **kw):
    """Perform application specific qc on a project.

    When the project exists in the project summary database, the qc data is
    fetched with ``get_qc_data`` and the application is taken from
    ``APPLICATION_MAP`` (or from the ``application`` option if the project's
    application is not in the map). When the project is missing, the
    ``application`` option is required and the qc data is fetched with
    ``_get_sample_qc_data``.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param application: application for which to perform qc
    :param username: database username
    :param password: database password
    :param url: database url
    :param sampledb: samples database name
    :param projectdb: project database name

    :returns: dictionary with 'stdout' and 'stderr' StringIO buffers; one
        line per qc entry is written to 'stdout'. The buffers are returned
        unchanged when no application can be determined.
    """
    LOG.debug("Doing application qc for project {}, flowcell {}".format(
        project_name, flowcell))

    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)
    s_con = SampleRunMetricsConnection(dbname=sampledb,
                                       username=username,
                                       password=password,
                                       url=url)
    prj_summary = p_con.get_entry(project_name)

    if prj_summary is not None:
        # The qc data is fetched only here; the former additional call made
        # before the project check produced a result that was never used.
        qc_data = get_qc_data(project_name, p_con, s_con, flowcell)
        prj_application = prj_summary.get("application")
        if prj_application not in APPLICATION_MAP:
            if not application:
                # Report the unmapped application of the project; the
                # `application` option is known to be empty in this branch.
                LOG.warn(
                    "No such application {}. Please use the application option (available choices {})"
                    .format(prj_application, ",".join(QC_CUTOFF.keys())))
                return output_data
            # otherwise the application option given by the caller is used
        else:
            application = APPLICATION_MAP[prj_application]
    else:
        LOG.info(
            "No such project {} in project summary. Trying to get qc data anyway."
            .format(project_name))
        if not application:
            LOG.warn(
                "No application provided. Please use the application option (available choices {})"
                .format(",".join(QC_CUTOFF.keys())))
            return output_data
        qc_data = _get_sample_qc_data(project_name, application, s_con,
                                      flowcell)

    output_data = _qc_info_header(project_name, application, output_data)
    # Entries are written in sorted key order
    for _, v in sorted(qc_data.items()):
        y = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(y) + "\n")
    return output_data