def setUpModule():
    """Populate the local CouchDB server with test databases, views and projects.

    Does nothing when ``has_couchdb`` is false.
    """
    if not has_couchdb:
        return
    server = couchdb.Server()

    # Make sure every test database exists
    for name in DATABASES:
        if name in server:
            continue
        LOG.info("Creating database {}".format(name))
        server.create(name)

    # Sync the view definitions into each database; VIEWS is looked up by
    # the database name with "-test" removed
    for name in DATABASES:
        label = name.replace("-test", "")
        db = server[name]
        for design, views in VIEWS[label].items():
            for title, view in views.items():
                ViewDefinition(design, title, view).sync(db)

    # Load the project summaries from the yaml fixture and save them
    summary_file = os.path.join(filedir, "data", "config", "project_summary.yaml")
    with open(summary_file) as fh:
        prj_sum = yaml.load(fh)
    db = server["samples-test"]
    p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
    for entry in prj_sum:
        p_con.save(ProjectSummaryDocument(**entry), key="project_name")
def test_dbcon(self):
    """Test database connection and that we get expected values.

    Reads every document listed in the name views of the sample, flowcell
    and project test databases and checks a selection of their fields.
    """
    # Sample run metrics
    s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
    samples = [s_con.get_entry(x) for x in s_con.name_view]
    samples_d = {x["name"]: x for x in samples}
    expected_samples = [
        ("1_120924_AC003CCCXX_TGACCA", "date", "120924"),
        ("1_121015_BB002BBBXX_TGACCA", "flowcell", "BB002BBBXX"),
        ("2_120924_AC003CCCXX_ACAGTG", "entity_type", "sample_run_metrics"),
        ("3_120924_AC003CCCXX_ACAGTG", "lane", "3"),
        ("4_120924_AC003CCCXX_CGTTAA", "sequence", "CGTTAA"),
        ("2_121015_BB002BBBXX_TGACCA", "project_id", "P002"),
    ]
    for name, field, value in expected_samples:
        self.assertEqual(samples_d[name][field], value)

    # Flowcell run metrics
    fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
    flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
    flowcells_d = {x["name"]: x for x in flowcells}
    expected_flowcells = [
        ("120924_AC003CCCXX", "name", "120924_AC003CCCXX"),
        ("121015_BB002BBBXX", "name", "121015_BB002BBBXX"),
        ("120924_AC003CCCXX", "entity_type", "flowcell_run_metrics"),
    ]
    for name, field, value in expected_flowcells:
        self.assertEqual(flowcells_d[name][field], value)

    # Project summaries
    p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
    projects = [p_con.get_entry(x) for x in p_con.name_view]
    projects_d = {x["project_name"]: x for x in projects}
    first = projects_d["J.Doe_00_01"]
    self.assertEqual(first["min_m_reads_per_sample_ordered"], 0.1)
    self.assertEqual(first["no_of_samples"], 2)
    self.assertEqual(set(first["samples"].keys()), set(["P001_101_index3", "P001_102", "P001_103"]))
    self.assertEqual(first["customer_reference"], "GnuGenome")
    self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
    third_samples = projects_d["J.Doe_00_03"]["samples"]
    # keys() is only a list on Python 2; convert so the comparison does not
    # depend on the return type of keys()
    self.assertEqual(list(third_samples.keys()), ["3_index6"])
    self.assertIn("A", third_samples["3_index6"]["library_prep"])
def test_2_make_note(self):
    """Make a note subset by example flowcell and project

    One pdf note is written per sample run mapped to the example project
    and flowcell, named after the sample's ``barcode_name``.
    """
    s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()
    project_name = self.examples["project"]
    flowcell = self.examples["flowcell"]

    # The result is not used below; the call is kept so the query still runs
    s_con.get_samples(flowcell, project_name)
    project = p_con.get_entry(project_name)
    samples = p_con.map_srm_to_name(project_name, fc_id=flowcell, use_bc_map=True)
    for srm_name, mapping in samples.items():
        # Work on a copy: updating the shared module-level ``parameters``
        # dict in place would carry values (including the "N/A"
        # placeholders set below) over from one sample to the next
        s_param = dict(parameters)
        s = s_con.get_entry(srm_name)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s["date"], s["flowcell"])
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
        # Read count is reported in millions, rounded to one decimal
        if s_param['rounded_read_count']:
            s_param['rounded_read_count'] = round(float(s_param['rounded_read_count']) / 1e6, 1)
        else:
            s_param['rounded_read_count'] = None
        # NOTE(review): ``project`` is indexed here before the ``if project``
        # guard below, so a missing project fails on this line
        s_param['customer_name'] = project['samples'][mapping["sample"]].get('customer_name', None)
        if project:
            s_param['ordered_amount'] = p_con.get_ordered_amount(project_name)
            s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
            s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
        s_param['success'] = sequencing_success(s_param, cutoffs)
        # Replace missing values with a printable placeholder
        s_param.update({name: "N/A" for name in s_param.keys() if s_param[name] is None})
        make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
def list_projects(self):
    """List the projects on a flowcell together with their application.

    Writes tab-separated rows to the application's stdout buffer: first the
    flowcell with its instrument type and run mode, then one row per
    project found in the samplesheet data of the flowcell document.
    """
    if not self._check_pargs(["flowcell"]):
        return
    url = self.pargs.url or self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    flowcell = self.pargs.flowcell
    if not validate_fc_directory_format(flowcell):
        self.app.log.warn(
            "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(flowcell)
        )
        return

    # The flowcell id is built from the first and last "_"-separated parts
    parts = flowcell.split("_")
    fcid = "_".join([parts[0], parts[-1]])

    db_args = vars(self.app.pargs)
    self.log.debug("Establishing FlowcellRunMetricsConnection")
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **db_args)
    self.log.debug("Establishing ProjectSummaryConnection")
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **db_args)

    self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
    fc = fc_con.get_entry(fcid)
    if fc is None:
        self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
        return

    self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
    ssheet_data = self._get_samplesheet_sample_data(fc)
    if not len(ssheet_data):
        self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
        return

    self.log.debug("Fetch runParameter data for flowcell {}".format(fcid))
    run_data = self._get_run_parameter_data(fc)
    if not len(run_data):
        # Not fatal: the defaults below are reported instead
        self.log.warn("No runParameter data for flowcell {}".format(fcid))
    header = [flowcell, run_data.get("InstrumentType", "HiSeq2000"), run_data.get("RunMode", "High Output")]
    out_data = [header]

    # Extract the project names
    projects = set(
        proj[0].replace("__", ".")
        for data in ssheet_data.values()
        for proj in data.values()
    )

    # Extract application for each project
    for project in projects:
        self.log.debug("Fetching project data document for project {}".format(project))
        pdoc = p_con.get_entry(project)
        if pdoc is None:
            self.log.warn("No project data document for project {}".format(project))
            pdoc = {}
        out_data.append([project, pdoc.get("application", "N/A")])

    lines = ["\t".join([str(r) for r in row]) for row in out_data]
    self.app._output_data["stdout"].write("\n".join(lines))
def upload_qc(self):
    """Collect qc objects for a flowcell and save them to the databases.

    Flowcells dated before 120815 are collected with the pre-casava
    collector, all others with the casava collector. Sample documents get
    their ``project_sample_name`` set when a matching project sample is
    found before they are saved.
    """
    if not self._check_pargs(['flowcell']):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn(
            "Path '{}' does not conform to bcbio flowcell directory format; aborting"
            .format(self.pargs.flowcell))
        return

    flowcell_id = fc_id(self.pargs.flowcell)
    fc_date, _ = fc_parts(self.pargs.flowcell)
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(flowcell_id))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(flowcell_id))
        qc_objects = self._collect_casava_qc()

    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(flowcell_id))
        return
    self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

    db_args = vars(self.app.pargs)
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **db_args)
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **db_args)
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **db_args)

    # NOTE(review): in both dry(...) calls below the save is evaluated
    # before dry() is invoked, so dry() receives the result of the save
    # rather than a callable - confirm this is intended.
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(
                obj.get("sample_prj", None),
                obj.get("barcode_name", None),
                self.pargs.extensive_matching)
            if project_sample:
                obj["project_sample_name"] = project_sample['sample_name']
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def bcbb_configuration_from_samplesheet(csv_samplesheet, couch_credentials):
    """Parse an illumina csv-samplesheet and return a dictionary suitable for the bcbb-pipeline

    :param csv_samplesheet: csv samplesheet passed to ``csv2yaml``
    :param couch_credentials: keyword arguments for ProjectSummaryConnection

    :returns: the configuration loaded from the converted samplesheet, with
        the per-application settings applied to every multiplex entry
    """
    tfh, yaml_file = tempfile.mkstemp('.yaml', 'samplesheet')
    os.close(tfh)
    try:
        yaml_file = bcbio.solexa.samplesheet.csv2yaml(csv_samplesheet, yaml_file)
        # NOTE(review): yaml.load can construct arbitrary objects; the input
        # here is the file just written by csv2yaml - consider safe_load
        with open(yaml_file) as fh:
            config = yaml.load(fh)
    finally:
        # Remove the yaml file (also on failure), we will write a new one later
        os.remove(yaml_file)

    application_setup = {
        'Amplicon': {'analysis': 'Align_standard'},
        'ChIP-seq': {'analysis': 'RNA-seq'},
        'Custom capture': {'analysis': 'Align_standard_seqcap'},
        'de novo': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'Exome capture': {'analysis': 'Align_standard_seqcap'},
        'Finished library': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'Mate-pair': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'Metagenome': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'miRNA-seq': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'RNA-seq (mRNA)': {'analysis': 'RNA-seq'},
        'RNA-seq (total RNA)': {'analysis': 'RNA-seq'},
        'WG re-seq': {'analysis': 'Align_standard'},
        'default': {'analysis': 'Align_standard'},
    }

    # Connect to maggie to get project application; the lookup is best
    # effort, so without a connection every entry gets the default setup
    try:
        p_con = ProjectSummaryConnection(**couch_credentials)
    except Exception:
        print("Can't connect to maggie to get application")
        p_con = None

    # Replace the default analysis
    ## TODO: This is an ugly hack, should be replaced by a custom config
    for lane in config:
        for plex in lane.get('multiplex', []):
            application = 'default'
            if p_con is not None:
                try:
                    project_name = plex.get('sample_prj', '')
                    project = p_con.get_entry(project_name)
                    if project is not None:
                        application = project.get("application", 'default').strip()
                except Exception:
                    # Any failed lookup falls back to the default setup
                    application = 'default'
            setup = application_setup.get(application, application_setup['default'])
            for key, val in setup.items():
                plex[key] = val

    return config
def list_projects(self):
    """List the projects on a flowcell with their application and type.

    Writes tab-separated rows to the application's stdout buffer: first the
    flowcell with its instrument type and run mode, then one row per
    project found in the samplesheet data of the flowcell document.
    """
    if not self._check_pargs(["flowcell"]):
        return
    url = self.pargs.url or self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    flowcell = self.pargs.flowcell
    if not validate_fc_directory_format(flowcell):
        self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(flowcell))
        return

    # The flowcell id is built from the first and last "_"-separated parts
    parts = flowcell.split("_")
    fcid = "_".join([parts[0], parts[-1]])

    db_args = vars(self.app.pargs)
    self.log.debug("Establishing FlowcellRunMetricsConnection")
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **db_args)
    self.log.debug("Establishing ProjectSummaryConnection")
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **db_args)

    self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
    fc = fc_con.get_entry(fcid)
    if fc is None:
        self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
        return

    self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
    ssheet_data = self._get_samplesheet_sample_data(fc)
    if not len(ssheet_data):
        self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
        return

    self.log.debug("Fetch runParameter data for flowcell {}".format(fcid))
    run_data = self._get_run_parameter_data(fc)
    if not len(run_data):
        # Not fatal: the defaults below are reported instead
        self.log.warn("No runParameter data for flowcell {}".format(fcid))
    header = [flowcell, run_data.get("InstrumentType", "HiSeq2000"), run_data.get("RunMode", "High Output")]
    out_data = [header]

    # Extract the project names
    projects = set(
        proj[0].replace("__", ".")
        for data in ssheet_data.values()
        for proj in data.values()
    )

    # Extract application for each project
    for project in projects:
        self.log.debug("Fetching project data document for project {}".format(project))
        pdoc = p_con.get_entry(project)
        if pdoc is None:
            self.log.warn("No project data document for project {}".format(project))
            pdoc = {}
        application = pdoc.get("application", "N/A")
        project_type = pdoc.get("type", "Check GPL")
        out_data.append([project, application, project_type])

    lines = ["\t".join([str(r) for r in row]) for row in out_data]
    self.app._output_data['stdout'].write("\n".join(lines))
def test_3_sample_map(self):
    """Fetch and print the name-to-srm sample mapping for the example project"""
    project_name = self.examples["project"]
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    sample_map = p_con.map_name_to_srm(project_name, use_ps_map=False, check_consistency=True)
    print(sample_map)
def test_4_srm_map(self):
    """Fetch and print the srm-to-project-sample mapping for the example flowcell"""
    project_name = self.examples["project"]
    flowcell = self.examples["flowcell"]
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    samples = p_con.map_srm_to_name(project_name, fc_id=flowcell, use_ps_map=False, check_consistency=True)
    print(samples)
def update(self):
    """Update fields on the sample run documents of a project.

    With ``project_id`` set, that value is written to every sample run.
    With ``names`` set (a json file path or a python literal mapping
    barcode names to sample names), ``project_sample_name`` is written
    from that mapping; otherwise the name is looked up in the project
    database using extensive matching. Existing values are only
    overwritten after confirmation through ``query_yes_no``.
    """
    if not self._check_pargs(["sample_prj"]):
        return
    url = self.pargs.url or self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    samples = s_con.get_samples(sample_prj=self.pargs.sample_prj)

    if self.pargs.project_id:
        self.app.log.debug("Going to update 'project_id' to {} for sample runs with 'sample_prj' == {}".format(self.pargs.project_id, self.pargs.sample_prj))
        for s in samples:
            has_value = s.get("project_id", None) is not None
            if has_value and not query_yes_no("'project_id':{} for sample {}; are you sure you want to overwrite?".format(s["project_id"], s["name"]), force=self.pargs.force):
                continue
            s["project_id"] = self.pargs.project_id
            s_con.save(s)

    if self.pargs.names:
        self.app.log.debug("Going to update 'project_sample_name' for sample runs with 'sample_prj' == {}".format(self.pargs.sample_prj))
        # ``names`` is either the path of a json file or a python literal
        if os.path.exists(self.pargs.names):
            with open(self.pargs.names) as fh:
                names_d = json.load(fh)
        else:
            names_d = ast.literal_eval(self.pargs.names)
        # Group the sample runs by barcode name (groupby needs sorted input)
        by_barcode = sorted(samples, key=lambda s: s["barcode_name"])
        groups = {
            barcode: list(members)
            for barcode, members in itertools.groupby(by_barcode, key=lambda x: x["barcode_name"])
        }
        for barcode_name in names_d:
            sample_list = groups.get(barcode_name, None)
            if not sample_list:
                continue
            for s in sample_list:
                has_value = s.get("project_sample_name", None) is not None
                if has_value and not query_yes_no("'project_sample_name':{} for sample {}; are you sure you want to overwrite?".format(s["project_sample_name"], s["name"]), force=self.pargs.force):
                    continue
                s["project_sample_name"] = names_d[barcode_name]
                s_con.save(s)
    else:
        self.app.log.info("Trying to use extensive matching...")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        # A project alias, when given, replaces the sample project name
        project_name = self.pargs.project_alias if self.pargs.project_alias else self.pargs.sample_prj
        for s in samples:
            project_sample = p_con.get_project_sample(project_name, s["barcode_name"], extensive_matching=True)
            if not project_sample:
                continue
            self.app.log.info("using mapping '{} : {}'...".format(s["barcode_name"], project_sample["sample_name"]))
            s["project_sample_name"] = project_sample["sample_name"]
            s_con.save(s)
def setUp(self):
    """Set the credentials, example identifiers and project connection used by the tests"""
    self.user, self.pw, self.url = "******", "pw", "localhost"
    self.examples = dict(
        sample="1_120924_AC003CCCXX_TGACCA",
        flowcell="AC003CCCXX",
        project="J.Doe_00_01",
    )
    self.p_con = ProjectSummaryConnection(
        dbname="projects-test",
        username=self.user,
        password=self.pw,
        url=self.url,
    )
def test_dbcon(self):
    """Test database connection and that we get expected values.

    Reads every document listed in the name views of the sample, flowcell
    and project test databases and checks a selection of their fields.
    """
    # Sample run metrics
    s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
    samples = [s_con.get_entry(x) for x in s_con.name_view]
    samples_d = {x["name"]: x for x in samples}
    expected_samples = [
        ("1_120924_AC003CCCXX_TGACCA", "date", "120924"),
        ("1_121015_BB002BBBXX_TGACCA", "flowcell", "BB002BBBXX"),
        ("2_120924_AC003CCCXX_ACAGTG", "entity_type", "sample_run_metrics"),
        ("3_120924_AC003CCCXX_ACAGTG", "lane", "3"),
        ("4_120924_AC003CCCXX_CGTTAA", "sequence", "CGTTAA"),
        ("2_121015_BB002BBBXX_TGACCA", "project_id", "P002"),
    ]
    for name, field, value in expected_samples:
        self.assertEqual(samples_d[name][field], value)

    # Flowcell run metrics
    fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
    flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
    flowcells_d = {x["name"]: x for x in flowcells}
    expected_flowcells = [
        ("120924_AC003CCCXX", "name", "120924_AC003CCCXX"),
        ("121015_BB002BBBXX", "name", "121015_BB002BBBXX"),
        ("120924_AC003CCCXX", "entity_type", "flowcell_run_metrics"),
    ]
    for name, field, value in expected_flowcells:
        self.assertEqual(flowcells_d[name][field], value)

    # Project summaries
    p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
    projects = [p_con.get_entry(x) for x in p_con.name_view]
    projects_d = {x["project_name"]: x for x in projects}
    first = projects_d["J.Doe_00_01"]
    self.assertEqual(first["min_m_reads_per_sample_ordered"], 0.1)
    self.assertEqual(first["no_of_samples"], 2)
    self.assertEqual(set(first["samples"].keys()), set(["P001_101_index3", "P001_102", "P001_103"]))
    self.assertEqual(first["customer_reference"], "GnuGenome")
    self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
    third_samples = projects_d["J.Doe_00_03"]["samples"]
    # keys() is only a list on Python 2; convert so the comparison does not
    # depend on the return type of keys()
    self.assertEqual(list(third_samples.keys()), ["3_index6"])
    self.assertIn("A", third_samples["3_index6"]["library_prep"])
def bpreport(self):
    """Write a best practice report for the samples of a project.

    Sample files are searched below the controller's root path and path
    id. Unless ``no_statusdb`` is set, a sample name map is fetched from
    the project and sample databases. The stdout, stderr and debug output
    of ``best_practice_note`` is copied to the application's output
    buffers.
    """
    if not self._check_pargs(["project"]):
        return
    # Copy the parsed arguments: vars() returns the namespace's own dict,
    # so updating it in place would add the report keywords to self.pargs
    kw = dict(vars(self.pargs))
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if not flist:
        self.log.info("No samples/sample configuration files found")
        return

    if self.pargs.no_statusdb:
        sample_name_map = None
    else:
        if not self._check_pargs(["statusdb_project_name"]):
            return
        db_args = vars(self.app.pargs)
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **db_args)
        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **db_args)
        sample_name_map = get_scilife_to_customer_name(
            self.pargs.statusdb_project_name, p_con, s_con)

    kw.update(project_name=self.pargs.project,
              flist=flist,
              basedir=basedir,
              sample_name_map=sample_name_map)
    out_data = best_practice_note(**kw)
    self.log.info(
        "Wrote report to directory {}; use Makefile to generate pdf report"
        .format(basedir))
    for stream in ('stdout', 'stderr', 'debug'):
        self.app._output_data[stream].write(out_data[stream].getvalue())
def __init__(self, config):
    """Connect to the Google Docs QC checklist and the StatusDB project database.

    Asserts that the credentials file, the QC checklist spreadsheet and
    its three worksheets can be found. A failed StatusDB connection is
    tolerated and leaves ``self.pcon`` as None.
    """
    super(GDocsUpdater, self).__init__(config)

    # Google Docs: credentials and the QC checklist spreadsheet
    gdocs_conf = self.config.get("gdocs", {})
    credentials_path = os.path.expanduser(gdocs_conf.get("credentials_file", ""))
    assert os.path.exists(credentials_path), "Supplied GDocs credentials file does not exist"
    self.gdcon = SpreadSheet(get_credentials(credentials_path))
    assert self.gdcon, "Could not get a SpreadSheet object, please verify gdocs credentials"

    checklist = gdocs_conf.get("qc_checklist", None)
    assert checklist, "No QC checklist specified in configuration, please specify"
    ssheet = self.gdcon.get_spreadsheet(checklist)
    assert ssheet, "Could not locate QC checklist '{}' on Google Docs. Please make sure it exists".format(checklist)
    self.gdcon.ssheet = ssheet

    # The three worksheets of the checklist
    self.ongoing = self.gdcon.get_worksheet("Ongoing")
    self.coming = self.gdcon.get_worksheet("Coming")
    self.finished = self.gdcon.get_worksheet("Finished")
    assert self.ongoing and self.coming and self.finished, "Could not get 'Ongoing', 'Finished' and 'Coming' worksheets from '{}'. Please make sure that they exist".format(checklist)

    # StatusDB project database
    statusdb_conf = self.config.get("statusdb", {})
    db_url = statusdb_conf.get("url", "localhost")
    db_user = statusdb_conf.get("user", "user")
    db_password = statusdb_conf.get("password", "pass")
    try:
        self.pcon = ProjectSummaryConnection(url=db_url, username=db_user, password=db_password)
    except ConnectionError:
        self.pcon = None
def setUp(self):
    """Upload qc data for the first two flowcells and connect to the test databases.

    FIXME: All other tests depend on data being uploaded, so these are
    not real unit tests. The setup to TestQCUpload has to be run prior to
    other tests, else unexpected failures will occur.
    """
    for index in (0, 1):
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[index], '--mtime', '10000'],
            extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()

    credentials = dict(username="******", password="******")
    self.s_con = SampleRunMetricsConnection(dbname="samples-test", **credentials)
    self.p_con = ProjectSummaryConnection(dbname="projects-test", **credentials)
    self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", **credentials)
def closed_projects(report, from_date=None, to_date=None, **kw):
    """Return the projects whose close date falls within a date range.

    :param report: object whose ``log.error`` reports a failed database connection
    :param from_date: if given, projects closed before this datetime are skipped
    :param to_date: if given, projects closed after this datetime are skipped
    :param kw: keyword arguments passed on to ProjectSummaryConnection

    :returns: list of dicts with the keys 'name' and 'closed' (the close
        date formatted as "%Y-%m-%d"), or False when no connection was obtained
    """
    # Get a connection to the database
    pcon = ProjectSummaryConnection(**kw)
    if not pcon:
        # No project name exists at this point, so none is formatted into
        # the message
        report.log.error("Could not get connection to database")
        return False

    finished = []
    # Loop over the entries in the project_dates view
    for item in pcon.db.view("project/project_dates", reduce=False):
        try:
            closed = datetime.datetime.strptime(
                item.value.get("close_date", "0000-00-00"), "%Y-%m-%d")
            # Skip the entry if the date is outside the range we're interested in
            if from_date is not None and closed < from_date:
                continue
            if to_date is not None and closed > to_date:
                continue
            finished.append({
                'name': item.key,
                'closed': closed.strftime("%Y-%m-%d"),
            })
        except ValueError:
            # A missing close date falls back to "0000-00-00", which does
            # not parse; such entries and malformed dates are skipped
            continue
    return finished
def setUp(self):
    """Set the credentials, example identifiers and project connection used by the tests"""
    self.user, self.pw, self.url = "******", "pw", "localhost"
    self.examples = dict(
        sample="1_120924_AC003CCCXX_TGACCA",
        flowcell="AC003CCCXX",
        project="J.Doe_00_01",
    )
    connection_args = dict(username=self.user, password=self.pw, url=self.url)
    self.p_con = ProjectSummaryConnection(dbname="projects-test", **connection_args)
def upload_qc(self):
    """Collect qc objects for a flowcell and save them to the databases.

    Flowcells dated before 120815 are collected with the pre-casava
    collector, all others with the casava collector. Sample documents get
    their ``project_sample_name`` set when a matching project sample is
    found before they are saved.
    """
    if not self._check_pargs(["flowcell"]):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn(
            "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
        )
        return

    flowcell_id = fc_id(self.pargs.flowcell)
    fc_date, _ = fc_parts(self.pargs.flowcell)
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(flowcell_id))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(flowcell_id))
        qc_objects = self._collect_casava_qc()

    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(flowcell_id))
        return
    self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

    db_args = vars(self.app.pargs)
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **db_args)
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **db_args)
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **db_args)

    # NOTE(review): in both dry(...) calls below the save is evaluated
    # before dry() is invoked, so dry() receives the result of the save
    # rather than a callable - confirm this is intended.
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(
                obj.get("sample_prj", None),
                obj.get("barcode_name", None),
                self.pargs.extensive_matching,
            )
            if project_sample:
                obj["project_sample_name"] = project_sample["sample_name"]
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def application_qc(project_name=None, flowcell=None, application=None,
                   username=None, password=None, url=None,
                   sampledb="samples", projectdb="projects", **kw):
    """Perform application specific qc on a project.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param application: application for which to perform qc
    :param username: database username
    :param password: database password
    :param url: database url
    :param sampledb: samples database name
    :param projectdb: project database name

    :returns: dict with 'stdout' and 'stderr' StringIO buffers; the qc
        assessment of each sample is written to 'stdout'
    """
    LOG.debug("Doing application qc for project {}, flowcell {}".format(project_name, flowcell))

    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)
    s_con = SampleRunMetricsConnection(dbname=sampledb, username=username, password=password, url=url)
    prj_summary = p_con.get_entry(project_name)
    if prj_summary is not None:
        # The project-based qc data is only fetched when the project exists
        qc_data = get_qc_data(project_name, p_con, s_con, flowcell)
        prj_application = prj_summary.get("application")
        if prj_application in APPLICATION_MAP:
            application = APPLICATION_MAP[prj_application]
        elif not application:
            # Report the application found in the project summary; the
            # ``application`` argument is empty in this branch
            LOG.warn("No such application {}. Please use the application option (available choices {})".format(prj_application, ",".join(QC_CUTOFF.keys())))
            return output_data
    else:
        LOG.info("No such project {} in project summary. Trying to get qc data anyway.".format(project_name))
        if not application:
            LOG.warn("No application provided. Please use the application option (available choices {})".format(",".join(QC_CUTOFF.keys())))
            return output_data
        qc_data = _get_sample_qc_data(project_name, application, s_con, flowcell)

    output_data = _qc_info_header(project_name, application, output_data)
    for k, v in sorted(qc_data.items()):
        y = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(y) + "\n")
    return output_data
def test_2_make_project_note(self):
    """Make a project note subset by flowcell and project

    Builds a table with one row per project sample and writes it, with
    the project parameters, to a pdf named after the example project.
    Returns early when the example project does not exist.
    """
    s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
    p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Work on a copy so the shared module-level ``parameters`` dict is not
    # modified by this test
    param = dict(parameters)
    project_name = self.examples["project"]
    project = p_con.get_entry(project_name)
    if not project:
        print("No project named {}".format(project_name))
        return
    ordered_amount = p_con.get_ordered_amount(project_name)

    ## Start collecting the data
    sample_table = []
    sample_list = project['samples']
    param.update({key: project.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    samples = p_con.map_name_to_srm(project_name, check_consistency=True, use_bc_map=True)
    all_passed = True
    for sample_name, srm_map in samples.items():
        if sample_name == "Unexpected":
            continue
        project_sample = sample_list[sample_name]
        vals = {x: project_sample.get(prjs_to_table[x], None) for x in prjs_to_table.keys()}
        vals['MOrdered'] = ordered_amount
        # The sequence is read from the first sample run of the mapping
        vals['BarcodeSeq'] = s_con.get_entry(next(iter(srm_map)), "sequence")

        ## Set status
        if vals['Status'] is None:
            vals['Status'] = set_status(vals)
        vals.update({key: "N/A" for key in vals.keys() if vals[key] is None})
        if vals['Status'] in ("N/A", "NP"):
            all_passed = False
        sample_table.append([vals[key] for key in table_keys])
    if all_passed:
        param["finished"] = 'Project finished.'

    # Sort the rows and drop consecutive duplicates
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}.pdf".format(project_name), headers, paragraphs, **param)
def bcbb_configuration_from_samplesheet(csv_samplesheet, couch_credentials):
    """Parse an illumina csv-samplesheet and return a dictionary suitable for the bcbb-pipeline

    :param csv_samplesheet: csv samplesheet passed to ``csv2yaml``
    :param couch_credentials: keyword arguments for ProjectSummaryConnection

    :returns: the configuration loaded from the converted samplesheet, with
        the per-application settings applied to every multiplex entry
    """
    tfh, yaml_file = tempfile.mkstemp('.yaml', 'samplesheet')
    os.close(tfh)
    try:
        yaml_file = bcbio.solexa.samplesheet.csv2yaml(csv_samplesheet, yaml_file)
        # NOTE(review): yaml.load can construct arbitrary objects; the input
        # here is the file just written by csv2yaml - consider safe_load
        with open(yaml_file) as fh:
            config = yaml.load(fh)
    finally:
        # Remove the yaml file (also on failure), we will write a new one later
        os.remove(yaml_file)

    application_setup = {
        'Amplicon': {'analysis': 'Align_standard'},
        'ChIP-seq': {'analysis': 'Align_standard', 'genome_build': 'phix'},
        'Custom capture': {'analysis': 'Align_standard_seqcap'},
        'de novo': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'Exome capture': {'analysis': 'Align_standard_seqcap'},
        'Finished library': {'analysis': 'Align_standard', 'genome_build': 'phix'},
        'Mate-pair': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'Metagenome': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'miRNA-seq': {'analysis': 'Align_standard', 'genome_build': 'unknown'},
        'RNA-seq (mRNA)': {'analysis': 'Align_standard', 'genome_build': 'phix'},
        'RNA-seq (total RNA)': {'analysis': 'Align_standard', 'genome_build': 'phix'},
        'WG re-seq': {'analysis': 'Align_standard'},
        'default': {'analysis': 'Align_standard'},
    }

    # Connect to maggie to get project application; the lookup is best
    # effort, so without a connection every entry gets the default setup
    try:
        p_con = ProjectSummaryConnection(**couch_credentials)
    except Exception:
        print("Can't connect to maggie to get application")
        p_con = None

    # Replace the default analysis
    ## TODO: This is an ugly hack, should be replaced by a custom config
    for lane in config:
        for plex in lane.get('multiplex', []):
            application = 'default'
            if p_con is not None:
                try:
                    project_name = plex.get('sample_prj', '')
                    project = p_con.get_entry(project_name)
                    if project is not None:
                        application = project.get("application", 'default').strip()
                except Exception:
                    # Any failed lookup falls back to the default setup
                    application = 'default'
            setup = application_setup.get(application, application_setup['default'])
            for key, val in setup.items():
                plex[key] = val

    return config
def raw_data(self):
    """Deliver the raw fastq files of a project to its Uppnex delivery folder.

    Steps performed, in order:

    1. Connect to the project and flowcell databases and resolve the Uppnex
       project to deliver to (from ``--uppmax_project`` or the project
       database).
    2. Verify the production and destination paths and refuse to continue if
       uncompressed fastq files are found.
    3. Collect the samples to copy (optionally filtered interactively or by
       ``--sample``) and ask for confirmation.
    4. For every sample and flowcell: transfer the files, write and verify md5
       sums at the destination, fix permissions, touch the inbox-fix flag and,
       if everything verified, log the delivery to a json file and to the
       flowcell document in StatusDB.

    Nothing is returned; problems are reported through ``self.log`` and the
    method returns early.
    """
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    # NOTE(review): uid/gid are not consumed further down in this method
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group:
        # The original looked up an undefined name `group` (NameError)
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project database"
    self.log.debug("Connecting to flowcell database")
    f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
    assert f_con, "Could not get connection to flowcell database"
    self.log.debug("Connecting to x_flowcell database")
    x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
    assert x_con, "Could not get connection to x_flowcell database"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project,
                                                    "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error(
                "Uppmax project was not specified and could not be fetched from project database"
            )
            return

    # Setup paths and verify parameters
    self._meta.production_root = (self.pargs.root if self.pargs.root else
                                  self.app.config.get("production", "root"))
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), \
        "No such directory {}; check your production config".format(
            self._meta.production_root)
    assert os.path.exists(proj_base_dir), \
        "No project {} in production path {}".format(
            self.pargs.project, self._meta.root_path)

    try:
        self._meta.uppnex_project_root = self.app.config.get(
            "deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn(
            "{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'

    try:
        self._meta.uppnex_delivery_dir = self.app.config.get(
            "deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn(
            "{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'

    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), \
        "Delivery destination folder {} does not exist".format(
            destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(
        proj_base_dir=proj_base_dir,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)
    if len(uncompressed) > 0:
        self.log.error(
            "There are uncompressed fastq file for project, kindly check all files are compressed properly before delivery"
        )
        return

    # Extract the list of samples and runs associated with the project and sort them
    samples = self.samples_to_copy(
        pid=p_con.get_entry(self.pargs.project, "project_id"),
        pod=p_con.get_entry(self.pargs.project, "open_date"),
        fc_dict={
            'HiSeq2500': f_con.proj_list,
            'HiSeqX': x_con.proj_list
        },
        proj_base_dir=proj_base_dir,
        destination_root=destination_root,
        sample=self.pargs.sample,
        flowcell=self.pargs.flowcell)

    # If interactively select, build a list of samples to skip
    if self.pargs.interactive:
        to_process = {}
        for sample in samples:
            if query_yes_no("Deliver sample {} ?".format(sample),
                            default="no"):
                to_process[sample] = samples[sample]
        samples = to_process

    if self.pargs.sample:
        sample = samples.get(self.pargs.sample)
        if not sample:
            self.log.error("There is no such sample {} for project {}".format(
                self.pargs.sample, self.pargs.project))
            return
        samples = {self.pargs.sample: sample}

    self.log.info(
        "Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?",
                            default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample
    for sample, flowcells in samples.items():
        for fc, files in flowcells.items():
            self.log.info("Processing sample {} and flowcell {}".format(
                sample, fc))

            # transfer files
            self.log.debug("Transferring {} fastq files".format(
                len(files['src'])))
            self._transfer_files(sources=files['src'], targets=files['dst'])

            passed = True
            if self.pargs.link or self.pargs.dry_run:
                # Nothing was really copied, so there is nothing to verify or log
                passed = False
            else:
                # calculate md5sums on the source side and write it on the destination
                md5 = []
                for s, d in zip(files['src'], files['dst']):
                    m = md5sum(s)
                    mfile = "{}.md5".format(d)
                    md5.append([m, mfile, s])
                    self.log.debug("md5sum for source file {}: {}".format(
                        s, m))

                # write the md5sum to a file at the destination and verify the transfer
                for m, mfile, srcpath in md5:
                    dstfile = os.path.splitext(mfile)[0]
                    self.log.debug("Writing md5sum to file {}".format(mfile))
                    self.app.cmd.write(
                        mfile, "{} {}".format(m, os.path.basename(dstfile)),
                        True)
                    self.log.debug(
                        "Verifying md5sum for file {}".format(dstfile))
                    dm = md5sum(dstfile)
                    self.log.debug(
                        "md5sum for destination file {}: {}".format(
                            dstfile, dm))
                    if m != dm:
                        self.log.warn(
                            "md5sum verification FAILED for {}. Source: {}, Target: {}"
                            .format(dstfile, m, dm))
                        self.log.warn(
                            "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                            .format(dstfile))
                        self.app.cmd.safe_unlink(dstfile)
                        self.app.cmd.safe_unlink(mfile)
                        passed = False
                        continue

                    # Modify the permissions to ug+rw
                    for f in [dstfile, mfile]:
                        self.app.cmd.chmod(
                            f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                            | stat.S_IWGRP)

            # touch the flag to trigger uppmax inbox permission fix
            self.app.cmd.safe_touchfile(
                os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                             self.pargs.uppmax_project))

            # log the transfer to statusdb if verification passed
            if passed:
                data = {
                    'raw_data_delivery': {
                        'timestamp': utc_time(),
                        'files': {
                            os.path.splitext(os.path.basename(srcpath))[0]: {
                                'md5': m,
                                'path': os.path.splitext(mfile)[0],
                                'size_in_bytes':
                                self._getsize(os.path.splitext(mfile)[0]),
                                'source_location': srcpath
                            }
                            for m, mfile, srcpath in md5
                        }
                    }
                }
                jsonstr = json.dumps(data)
                jsonfile = os.path.join(
                    proj_base_dir, sample, fc,
                    "{}_{}_raw_data_delivery.json".format(sample, fc))
                self.log.debug(
                    "Writing delivery to json file {}".format(jsonfile))
                self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)

                if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                    fc_con = x_con
                else:
                    fc_con = f_con
                fc_obj = fc_con.get_entry(fc)
                if fc_obj is None:
                    # The files are already delivered; do not crash, but make
                    # the missing database record visible to the operator.
                    self.log.warn(
                        "No StatusDB document found for flowcell {}, delivery was not logged"
                        .format(fc))
                    continue
                # The original formatted the builtin `id` into this message
                self.log.debug(
                    "Saving delivery in StatusDB document {}".format(
                        fc_obj.get('_id')))
                self.log.info(
                    "Logging delivery to StatusDB document {}".format(
                        fc_obj.get('_id')))
                fc_raw_data = fc_obj.get('raw_data_delivery', {})
                fc_raw_data.update(data['raw_data_delivery'])
                fc_obj['raw_data_delivery'] = fc_raw_data
                self._save(fc_con, fc_obj)
                self.log.debug(jsonstr)
def test_2_make_project_note(self):
    """Make a project note subset by flowcell and project"""
    s_con = SampleRunMetricsConnection(username=self.user,
                                       password=self.pw,
                                       url=self.url)
    # Constructed like the other connections but not used further below
    fc_con = FlowcellRunMetricsConnection(username=self.user,
                                          password=self.pw,
                                          url=self.url)
    p_con = ProjectSummaryConnection(username=self.user,
                                     password=self.pw,
                                     url=self.url)
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Work on a copy so the module-level defaults are not modified by the
    # updates below and leaked into other tests.
    param = dict(parameters)

    project = p_con.get_entry(self.examples["project"])
    if not project:
        print("No project named {}".format(self.examples["project"]))
        return
    ordered_amount = p_con.get_ordered_amount(self.examples["project"])

    ## Start collecting the data
    sample_table = []
    sample_list = project['samples']
    param.update({
        key: project.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    samples = p_con.map_name_to_srm(self.examples["project"],
                                    check_consistency=True,
                                    use_bc_map=True)
    all_passed = True
    for k, v in samples.items():
        if k == "Unexpected":
            continue
        project_sample = sample_list[k]
        vals = {
            x: project_sample.get(prjs_to_table[x], None)
            for x in prjs_to_table.keys()
        }
        vals['MOrdered'] = ordered_amount
        # First sample run metrics name mapped to this project sample
        vals['BarcodeSeq'] = s_con.get_entry(list(v.keys())[0], "sequence")

        ## Set status
        vals['Status'] = set_status(
            vals) if vals['Status'] is None else vals['Status']
        vals.update({key: "N/A" for key in vals.keys() if vals[key] is None})
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[key] for key in table_keys])

    if all_passed:
        param["finished"] = 'Project finished.'

    # Sort, then drop consecutive duplicate rows
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}.pdf".format(self.examples["project"]), headers, paragraphs,
              **param)
class TestDbConnection(unittest.TestCase):
    """Tests for the sample and project database connections.

    The tests read the ``samples-test`` and ``projects-test`` databases and
    compare the results with fixed expected values.
    """

    def setUp(self):
        self.user = "******"
        self.pw = "pw"
        self.url = "localhost"
        self.examples = {"sample": "1_120924_AC003CCCXX_TGACCA",
                         "flowcell": "AC003CCCXX",
                         "project": "J.Doe_00_01"}
        self.p_con = ProjectSummaryConnection(
            dbname="projects-test", username=self.user, password=self.pw, url=self.url
        )

    def _sample_con(self):
        """Return a new connection to the ``samples-test`` database."""
        return SampleRunMetricsConnection(
            dbname="samples-test", username=self.user, password=self.pw, url=self.url
        )

    def test_connection(self):
        """Test database connection"""
        sample_con = self._sample_con()
        self.assertEqual(sample_con.url_string, "http://{}:5984".format(self.url))

    def test_get_flowcell(self):
        """Test getting a flowcell for a given sample"""
        sample_con = self._sample_con()
        fc = sample_con.get_entry(self.examples["sample"], "flowcell")
        self.assertEqual(str(fc), self.examples["flowcell"])

    def test_get_sample_ids(self):
        """Test getting sample ids given flowcell and sample_prj"""
        sample_con = self._sample_con()
        sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
        LOG.info("Number of samples before subsetting: " + str(len(sample_ids)))
        self.assertEqual(len(sample_ids), 4)
        sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"],
                                               sample_prj=self.examples["project"])
        LOG.info("Number of samples after subsetting: " + str(len(sample_ids)))
        self.assertEqual(len(sample_ids), 2)

    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        sample_con = self._sample_con()

        samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 4)

        samples = sample_con.get_samples(fc_id=self.examples["flowcell"],
                                         sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 2)

        samples = sample_con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 3)

        samples = sample_con.get_samples(sample_prj=self.examples["project"],
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 2)

    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        sample_con = self._sample_con()
        samples = sample_con.get_samples(sample_prj="bogusproject",
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 0)

    def test_get_project_sample_ids(self):
        """Test getting project sample ids"""
        sample_con = self._sample_con()
        sample_ids = sample_con.get_sample_ids(sample_prj=self.examples["project"])
        sample_names = [sample_con.db.get(x)["name"] for x in sample_ids]
        self.assertEqual(
            set(sample_names),
            set(["1_120924_AC003CCCXX_TGACCA",
                 "2_120924_AC003CCCXX_ACAGTG",
                 "1_121015_BB002BBBXX_TGACCA"]),
        )

    def test_get_latest_library_prep(self):
        """Test getting latest library prep"""
        # Temporarily add a second library prep "B" to sample P001_102
        prj = self.p_con.get_entry("J.Doe_00_01")
        prj["samples"]["P001_102"]["library_prep"]["B"] = {
            "sample_run_metrics": {"2_120924_AC003CCCXX_TTGGAA": None}
        }
        self.p_con.save(prj)
        try:
            preps = self.p_con.get_latest_library_prep(project_name=self.examples["project"])
            srm = [x for l in preps.values() for x in l]
            # Make sure A prep not in list
            self.assertNotIn("2_120924_AC003CCCXX_ACAGTG", srm)
            # Make sure B prep in list
            self.assertIn("2_120924_AC003CCCXX_TTGGAA", srm)
        finally:
            # Reset data even if an assertion failed, so the modified project
            # document does not affect the other tests
            prj = self.p_con.get_entry("J.Doe_00_01")
            del prj["samples"]["P001_102"]["library_prep"]["B"]
            self.p_con.save(prj)
def _project_status_note_table(project_name=None, username=None, password=None,
                               url=None, use_ps_map=True, use_bc_map=False,
                               check_consistency=False,
                               ordered_million_reads=None, uppnex_id=None,
                               customer_reference=None, exclude_sample_ids={},
                               project_alias=None, sample_aliases={},
                               projectdb="projects", samplesdb="samples",
                               flowcelldb="flowcells",
                               include_all_samples=False, param=None, **kw):
    """Collect the sample table and parameters for a project status note.

    :param project_name: project name
    :param username: db user name
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id; overrides the database value
    :param customer_reference: customer project name; overrides the database
        value
    :param exclude_sample_ids: sample ids to exclude from the table
    :param project_alias: project alias name
    :param sample_aliases: mapping from barcode name to sample name
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: if False, sample runs that are not the latest
        library prep of their project sample are left out
    :param param: dict of report parameters; it is updated in place and
        returned. A new dict is created when omitted.

    ``use_ps_map``, ``use_bc_map``, ``check_consistency`` and ``**kw`` are
    accepted but not used by this function.

    :returns: ``None`` if the project is not found, otherwise a tuple
        ``(output_data, sample_table, param)`` where ``sample_table`` is a
        sorted list of rows preceded by a header row
    """
    # The previous default `param={}` was one shared dict that this function
    # updates, so values leaked from one call into the next.
    if param is None:
        param = {}

    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name": "scilife_name",
                       "customer_name": "customer_name",
                       "project_name": "project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced',
                  'MOrdered']

    output_data = {'stdout': StringIO(), 'stderr': StringIO(),
                   'debug': StringIO()}

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username,
                                       password=password, url=url)
    # Not used below; kept so that connection set-up behaves as before
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username,
                                          password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username,
                                     password=password, url=url)

    # Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary']['all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            samples[s["name"]] = {'sample': sample_name, 'id': s["_id"]}
        elif s["barcode_name"] in sample_aliases:
            alias = sample_aliases[s["barcode_name"]]
            samples[alias] = {'sample': alias, 'id': s["_id"]}
        else:
            s_d = {s["name"]: {'sample': s["name"], 'id': s["_id"],
                               'barcode_name': s["barcode_name"]}}
            LOG.warn("No mapping found for sample run:\n '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key: prj_summary.get(ps_to_parameter[key], None)
                  for key in ps_to_parameter.keys()})
    if "ordered_amount" not in param:
        param["ordered_amount"] = p_con.get_ordered_amount(
            project_name, samples=sample_dict)

    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details']['customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            elif k not in last_library_preps_srm:
                LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(
                    k, v["id"],
                    ",".join(list(set(last_library_preps[v['sample']].values()))),
                    v['sample']))
                continue

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        sample_table.append([vals[key] for key in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for srm_name in project_sample_d:
                barcode_seq = s_con.get_entry(srm_name, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                sample_table.append([vals[key] for key in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq,
                                            ordered_million_reads, param)
            sample_table.append([vals[key] for key in table_keys])

    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'

    # Sort, then drop consecutive duplicate rows
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, ['ScilifeID', 'SubmittedID', 'BarcodeSeq',
                            'MSequenced', 'MOrdered'])
    return output_data, sample_table, param
def initiate_survey(report, project, **kw):
    """Send the user survey for a closed project, unless it was already sent.

    The project document is looked up in the project database and the
    corresponding LIMS project is fetched. The survey is only sent when the
    project is not aborted, is closed, has no survey registered as sent and
    has at least one associated email address.

    :param report: controller giving access to ``log``, ``pargs`` and
        ``_meta.date_format``
    :param project: name of the project
    :param kw: passed on to ``ProjectSummaryConnection``; ``sender``,
        ``smtphost`` and ``smtpport`` are also handed to ``send_survey``
    :returns: ``False`` when a precondition is not met, otherwise the result
        of ``send_survey``
    """
    # Get a connection to the database
    db_connection = ProjectSummaryConnection(**kw)
    if not db_connection:
        report.log.error("Could not get connection to database")
        return False

    # Get the document for the project
    project_doc = db_connection.get_entry(project)
    if not project_doc:
        report.log.error("No such project: {} in database".format(project))
        return False

    # get a project instance from lims
    lims_proj = lims_project(report, project_doc.get("project_id"))
    if not lims_proj:
        report.log.error(
            "Could not initiate LIMS object for project {}".format(project))
        return False

    # Check if project is aborted
    if project_aborted(lims_proj):
        report.log.warn("Project {} has been aborted".format(project))
        return False

    # check if project is closed
    close_date = project_closed(lims_proj)
    if close_date is None:
        report.log.warn("Project {} is not closed".format(project))
        return False
    parsed_close_date = datetime.datetime.strptime(close_date,
                                                   report._meta.date_format)
    report.log.debug("Project {} closed on {}".format(project,
                                                      parsed_close_date))

    # check if a user survey has already been sent
    if survey_sent(lims_proj):
        report.log.info("Survey already sent for project {}".format(project))
        return False
    report.log.debug("No previous survey sent for {}".format(project))

    # get email addresses for persons connected to the project
    addresses = project_email(report, lims_proj)
    if len(addresses) == 0:
        report.log.warn(
            "No email addresses found associated with project {}".format(
                project))
        return False

    # verify the format of the email address
    recipients = []
    for address in addresses:
        if address is not None and re.match(r'[^@]+@[^@]+\.[^@]+', address):
            recipients.append(address)
        else:
            report.log.warn("Illegal email format: {}".format(address))

    # send the survey email to each recipient
    sent = send_survey(report,
                       project,
                       recipients,
                       sender=kw.get("sender"),
                       smtphost=kw.get("smtphost"),
                       smtpport=kw.get("smtpport"),
                       dryrun=report.pargs.dry_run)

    recipient_list = ",".join(recipients)
    if not sent:
        report.log.warn(
            "Sending survey to recipients {} failed".format(recipient_list))
        return sent

    # update the project udf to indicate that we have sent out the survey
    report.log.info(
        "Survey sent to recipients {} successfully".format(recipient_list))
    lims_proj.udf['Survey sent'] = datetime.datetime.now().date()
    if not report.pargs.dry_run:
        lims_proj.put()
    return sent
def initiate_survey(report, project, **kw):
    """Send the user survey for a closed project, unless it was already sent.

    :param report: controller giving access to ``log``, ``pargs`` and
        ``_meta.date_format``
    :param project: name of the project
    :param kw: passed on to ``ProjectSummaryConnection``; ``sender``,
        ``smtphost`` and ``smtpport`` are also handed to ``send_survey``
    :returns: ``False`` when the project is missing, not closed, already
        surveyed or has no email addresses; otherwise the result of
        ``send_survey``
    """
    # Get a connection to the database
    pcon = ProjectSummaryConnection(**kw)
    if not pcon:
        report.log.error("Could not get connection to database")
        return False

    # Get the document for the project
    pdoc = pcon.get_entry(project)
    if not pdoc:
        report.log.error("No such project: {} in database".format(project))
        return False

    # get a project instance from lims
    lproj = lims_project(report, pdoc.get("project_id"))
    if not lproj:
        report.log.error("Could not initiate LIMS object for project {}".format(project))
        return False

    # check if project is closed
    closed = project_closed(lproj)
    if closed is None:
        report.log.warn("Project {} is not closed".format(project))
        return False
    report.log.debug("Project {} closed on {}".format(
        project, datetime.datetime.strptime(closed, report._meta.date_format)))

    # check if a user survey has already been sent
    if survey_sent(lproj):
        report.log.info("Survey already sent for project {}".format(project))
        return False
    report.log.debug("No previous survey sent for {}".format(project))

    # get email addresses for persons connected to the project
    emails = project_email(report, lproj)
    if len(emails) == 0:
        report.log.warn("No email addresses found associated with project {}".format(project))
        return False

    # verify the format of the email address
    # The previous pattern was case sensitive and limited the top-level domain
    # to 2-3 letters, so valid addresses (upper-case letters, '+', '.info')
    # were dropped. Only require the basic local@domain.tld shape instead.
    email_pattern = re.compile(r'^[^@\s]+@[^@\s]+\.[^@\s]+$')
    recipients = []
    for email in emails:
        if email is None or not email_pattern.match(email):
            report.log.warn("Illegal email format: {}".format(email))
            continue
        recipients.append(email)

    # send the survey email to each recipient
    sent = send_survey(report, project, recipients,
                       sender=kw.get("sender"),
                       smtphost=kw.get("smtphost"),
                       smtpport=kw.get("smtpport"),
                       dryrun=report.pargs.dry_run)

    # update the project udf to indicate that we have sent out the survey
    if sent:
        report.log.info("Survey sent to recipients {} successfully".format(",".join(recipients)))
        lproj.udf['Survey sent'] = datetime.datetime.now().date()
        # In a dry run the udf is only changed locally, never uploaded
        if not report.pargs.dry_run:
            lproj.put()
    else:
        report.log.warn("Sending survey to recipients {} failed".format(",".join(recipients)))

    return sent
def project_status_note(project_name=None,
                        username=None,
                        password=None,
                        url=None,
                        use_ps_map=True,
                        use_bc_map=False,
                        check_consistency=False,
                        ordered_million_reads=None,
                        uppnex_id=None,
                        customer_reference=None,
                        exclude_sample_ids={},
                        project_alias=None,
                        sample_aliases={},
                        projectdb="projects",
                        samplesdb="samples",
                        flowcelldb="flowcells",
                        include_all_samples=False,
                        **kw):
    """Write the status note (pdf and rst) for a project.

    A table with one row per sample run is built from the sample and project
    databases, written to ``<project_name>_project_summary.pdf`` and
    ``<project_name>_project_summary.rst``, and dumped together with the
    report parameters as json to the ``debug`` stream of the return value.

    :param project_name: project name
    :param username: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report

    :returns: ``None`` if the project is not found, otherwise a dict with
        ``stdout``, ``stderr`` and ``debug`` ``StringIO`` streams
    """
    # report parameters, filled in further down
    param = {
        "project_name": project_name,
        "finished": "Not finished, or cannot yet assess if finished.",
    }
    # project summary key for each parameter key
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # column order of a table row
    table_keys = [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ]
    # status values that mark a sample as not passed
    not_passed = ("N/A", "NP")

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }

    # Set up the database connections
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Report layout
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()

    # Fetch the project summary document
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Map every sample run (or its alias) to its project sample
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for run in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, run.get("project_sample_name", None))
        if prj_sample:
            scilife_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            samples[run["name"]] = {'sample': scilife_name, 'id': run["_id"]}
        elif run["barcode_name"] in sample_aliases:
            alias = sample_aliases[run["barcode_name"]]
            samples[alias] = {'sample': alias, 'id': run["_id"]}
        else:
            unmapped = {
                run["name"]: {
                    'sample': run["name"],
                    'id': run["_id"],
                    'barcode_name': run["barcode_name"]
                }
            }
            LOG.warn(
                "No mapping found for sample run:\n '{}'".format(unmapped))

    # Fill the parameters from the project summary
    sample_dict = prj_summary['samples']
    for key in ps_to_parameter.keys():
        param[key] = prj_summary.get(ps_to_parameter[key], None)
    param.setdefault("ordered_amount", p_con.get_ordered_amount(project_name))
    param.setdefault('customer_reference',
                     prj_summary.get('customer_reference'))
    param.setdefault('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Options passed at the command line take precedence over the database
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Collect the table rows
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = []
    for prep in last_library_preps.values():
        last_library_preps_srm.extend(prep)

    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for run_name, run_info in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(
            run_name, run_info))
        sample_name = run_info['sample']
        if not include_all_samples:
            if sample_name not in last_library_preps:
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(sample_name))
            elif run_name not in last_library_preps_srm:
                LOG.info(
                    "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                    .format(run_name, run_info["id"],
                            list(last_library_preps[sample_name].values())[0],
                            sample_name))
                continue

        if re.search("Unexpected", run_name):
            continue
        barcode_seq = s_con.get_entry(run_name, "sequence")
        # Leave out samples the caller asked to exclude
        if _exclude_sample_id(exclude_sample_ids, sample_name, barcode_seq):
            samples_excluded.append(sample_name)
            continue
        # Table values for the project sample of this sample run
        project_sample = sample_dict[sample_name]
        vals = _set_sample_table_values(sample_name, project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        if vals['Status'] in not_passed:
            all_passed = False
        sample_table.append([vals[column] for column in table_keys])

    # Project samples that have not ended up in the table or been excluded
    handled = set(row[0] for row in sample_table)
    handled.update(samples_excluded)
    for sample in set(sample_dict.keys()) - handled:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            barcode_seqs = (s_con.get_entry(srm_name, "sequence")
                            for srm_name in project_sample_d)
        else:
            barcode_seqs = [None]
        for barcode_seq in barcode_seqs:
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq,
                                            ordered_million_reads, param)
            if vals['Status'] in not_passed:
                all_passed = False
            sample_table.append([vals[column] for column in table_keys])

    if all_passed:
        param["finished"] = 'Project finished.'

    # Sort the rows, drop consecutive duplicates and add the header row
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(0, [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ])

    # Write the notes
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers,
              paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name),
                   sample_table=sample_table,
                   report="project_report",
                   **param)

    # Replace missing and empty parameters before dumping them
    blank_keys = [
        key for key, value in param.items() if value is None or value == ""
    ]
    for key in blank_keys:
        param[key] = "N/A"
    output_data["debug"].write(
        json.dumps({
            'param': param,
            'table': sample_table
        }))
    return output_data
from scilifelab.db.statusdb import ProjectSummaryConnection

# Print the SciLife name and the customer name of every sample in a project,
# one tab separated line per sample.
pcon = ProjectSummaryConnection(url="tools.scilifelab.se", username="******", password="******")
# Fall back to an empty document so a missing project prints nothing instead
# of failing on project.get
project = pcon.get_entry('C.Dixelius_13_01') or {}
for sample in project.get("samples", {}).values():
    # A sample without one of the names would make "\t".join fail on None
    print("\t".join([sample.get('scilife_name') or 'N/A',
                     sample.get('customer_name') or 'N/A']))
def data_delivery_note(**kw):
    """Create an easily parseable information file describing a data delivery.

    One row is produced per delivered file (or one mostly-empty row for a
    sample run without a ``raw_data_delivery`` entry). The rows are written to
    ``<project_name>[_<flowcell>]_data_delivery.csv`` in the current working
    directory and also rendered as a text table.

    :param kw: keyword arguments; ``project_name`` and ``flowcell`` are read
        here and the complete dictionary is forwarded to the database
        connection constructors

    :returns: dictionary with ``stdout``, ``stderr`` and ``debug`` StringIO
        objects; the text table is written to ``stdout``
    """
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }

    project_name = kw.get('project_name', None)
    flowcell = kw.get('flowcell', None)
    # Only mention the flowcell in the message when one was actually given
    LOG.debug("Generating data delivery note for project {}{}.".format(
        project_name,
        ' and flowcell {}'.format(flowcell) if flowcell else ''))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    # Fall back to an empty mapping so a project without a samples entry
    # yields 'N/A' customer names instead of failing on `.items()`
    project_samples = p_con.get_entry(project_name, "samples") or {}
    customer_names = {
        sample_name: sample.get('customer_name', 'N/A')
        for sample_name, sample in project_samples.items()
    }

    # First row is the table header
    data = [[
        'SciLifeLab ID', 'Submitted ID', 'Flowcell', 'Lane', 'Barcode', 'Read',
        'Path', 'MD5', 'Size (bytes)', 'Timestamp'
    ]]
    for sample in samples:
        sname = sample.get('project_sample_name', 'N/A')
        cname = customer_names.get(sname, 'N/A')
        fc = sample.get('flowcell', 'N/A')
        lane = sample.get('lane', 'N/A')
        barcode = sample.get('sequence', 'N/A')
        # Sample runs that have not been delivered get a placeholder row
        if 'raw_data_delivery' not in sample:
            data.append([sname, cname, '', '', '', '', '', '', '', ''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp', 'N/A')
        for read, file_info in delivery.get('files', {}).items():
            data.append([
                sname,
                cname,
                fc,
                lane,
                barcode,
                read,
                file_info.get('path', 'N/A'),
                file_info.get('md5', 'N/A'),
                file_info.get('size_in_bytes', 'N/A'),
                tstamp,
            ])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(
        project_name, '_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile, "w") as outh:
        csvw = csv.writer(outh)
        csvw.writerows(data)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
def raw_data(self):
    """Deliver the raw fastq files of a project to its Uppnex delivery folder.

    The samples to deliver are looked up in the sample run metrics database
    for ``self.pargs.project`` (optionally restricted to
    ``self.pargs.flowcell``). For each sample run the files are transferred,
    an md5 checksum file is written next to each destination file and the
    transfer is verified against the source checksum. Verified deliveries are
    written to a json file next to the source data and saved in the sample's
    StatusDB document. Nothing is logged to StatusDB when linking or doing a
    dry run.

    Returns None; returns early when required arguments are missing, the
    Uppmax project cannot be determined, or the user declines a prompt.
    """
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    # NOTE(review): uid and gid are not used further down in this method
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # Look up the group passed on the command line
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error("Uppmax project was not specified and could not be fetched from project database")
            return

    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(
        s_con.get_samples(fc_id=self.pargs.flowcell, sample_prj=self.pargs.project),
        key=lambda k: (k.get('project_sample_name', 'NA'), k.get('flowcell', 'NA'), k.get('lane', 'NA')))

    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(self._meta.production_root), "No such directory {}; check your production config".format(self._meta.production_root)
    assert os.path.exists(proj_base_dir), "No project {} in production path {}".format(self.pargs.project, self._meta.root_path)

    # Fall back to default locations when the deliver config lacks an entry
    try:
        self._meta.uppnex_project_root = self.app.config.get("deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn("{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'

    try:
        self._meta.uppnex_delivery_dir = self.app.config.get("deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn("{}, will use 'INBOX' as uppnext_project_delivery_path".format(e))
        self._meta.uppnex_delivery_dir = 'INBOX'

    destination_root = os.path.join(self._meta.uppnex_project_root, self.pargs.uppmax_project, self._meta.uppnex_delivery_dir)
    assert os.path.exists(destination_root), "Delivery destination folder {} does not exist".format(destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # If selecting interactively, keep only the samples the user confirms
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info("Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}".format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process

    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn("The following samples have uncompressed *.fastq files that cannot be delivered: {}".format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return

    self.log.info("Will deliver data for {} samples from project {} to {}".format(len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root, samples)

    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no("Do you wish to continue delivering using rsync?", default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample run
    for sample_id, files in to_copy.items():
        # get the sample database object; exactly one sample must match
        [sample] = [s for s in samples if s.get('_id') == sample_id]
        self.log.info("Processing sample {} and flowcell {}".format(sample.get("project_sample_name", "NA"), sample.get("flowcell", "NA")))

        # transfer files; each entry is indexed as (source, destination, read)
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])

        passed = True
        if self.pargs.link or self.pargs.dry_run:
            # Links and dry runs are neither verified nor logged
            passed = False
        else:
            # calculate md5sums on the source side and write it on the destination
            md5 = []
            for f in files:
                m = md5sum(f[0])
                mfile = "{}.md5".format(f[1])
                md5.append([m, mfile, f[2], f[0]])
                self.log.debug("md5sum for source file {}: {}".format(f[0], m))

            # write the md5sum to a file at the destination and verify the transfer
            for m, mfile, read, srcpath in md5:
                dstfile = os.path.splitext(mfile)[0]
                self.log.debug("Writing md5sum to file {}".format(mfile))
                self.app.cmd.write(mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
                self.log.debug("Verifying md5sum for file {}".format(dstfile))
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(dstfile, dm))
                if m != dm:
                    self.log.warn("md5sum verification FAILED for {}. Source: {}, Target: {}".format(dstfile, m, dm))
                    self.log.warn("Improperly transferred file {} is removed from destination, please retry transfer of this file".format(dstfile))
                    self.app.cmd.safe_unlink(dstfile)
                    self.app.cmd.safe_unlink(mfile)
                    passed = False
                    continue

                # Modify the permissions to ug+rw
                for f in [dstfile, mfile]:
                    self.app.cmd.chmod(f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)

        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule", self.pargs.uppmax_project))

        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info("Logging delivery to StatusDB document {}".format(sample_id))
            data = {'raw_data_delivery': {'timestamp': utc_time(),
                                          'files': {'R{}'.format(read): {'md5': m,
                                                                         'path': os.path.splitext(mfile)[0],
                                                                         'size_in_bytes': self._getsize(os.path.splitext(mfile)[0]),
                                                                         'source_location': srcpath}
                                                    for m, mfile, read, srcpath in md5},
                                          }
                    }
            jsonstr = json.dumps(data)
            # The json log is written next to the first source file
            jsonfile = os.path.join(os.path.dirname(md5[0][3]),
                                    "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(sample.get("date"),
                                                                                    sample.get("flowcell"),
                                                                                    sample.get("project_sample_name"),
                                                                                    sample.get("sequence"),
                                                                                    sample.get("lane")))
            self.log.debug("Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug("Saving delivery in StatusDB document {}".format(sample_id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
def raw_data(self):
    """Deliver the raw fastq files of a project to its Uppnex delivery folder.

    The samples to deliver are looked up in the sample run metrics database
    for ``self.pargs.project`` (optionally restricted to
    ``self.pargs.flowcell``). For each sample run the source md5 checksums
    are calculated, the files are transferred, a checksum file is written
    next to each destination file and the transfer is verified. In a dry run
    the verification is made to pass without reading the destination file.
    Verified deliveries are written to a json file next to the source data
    and saved in the sample's StatusDB document.

    Returns None; returns early when required arguments are missing, the
    Uppmax project cannot be determined, or the user declines a prompt.
    """
    if not self._check_pargs(["project"]):
        return

    # if necessary, reformat flowcell identifier
    if self.pargs.flowcell:
        self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

    # get the uid and gid to use for destination files
    # NOTE(review): uid and gid are not used further down in this method
    uid = os.getuid()
    gid = os.getgid()
    if self.pargs.group is not None and len(self.pargs.group) > 0:
        # Look up the group passed on the command line
        gid = grp.getgrnam(self.pargs.group).gr_gid

    self.log.debug("Connecting to project database")
    p_con = ProjectSummaryConnection(**vars(self.pargs))
    assert p_con, "Could not get connection to project databse"
    self.log.debug("Connecting to samples database")
    s_con = SampleRunMetricsConnection(**vars(self.pargs))
    assert s_con, "Could not get connection to samples databse"

    # Fetch the Uppnex project to deliver to
    if not self.pargs.uppmax_project:
        self.pargs.uppmax_project = p_con.get_entry(
            self.pargs.project, "uppnex_id")
        if not self.pargs.uppmax_project:
            self.log.error(
                "Uppmax project was not specified and could not be fetched from project database"
            )
            return

    # Extract the list of samples and runs associated with the project and sort them
    samples = sorted(
        s_con.get_samples(fc_id=self.pargs.flowcell,
                          sample_prj=self.pargs.project),
        key=lambda k: (k.get('project_sample_name', 'NA'),
                       k.get('flowcell', 'NA'),
                       k.get('lane', 'NA')))

    # Setup paths and verify parameters
    self._meta.production_root = self.app.config.get("production", "root")
    self._meta.root_path = self._meta.production_root
    proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
    assert os.path.exists(
        self._meta.production_root
    ), "No such directory {}; check your production config".format(
        self._meta.production_root)
    assert os.path.exists(
        proj_base_dir), "No project {} in production path {}".format(
            self.pargs.project, self._meta.root_path)

    # Fall back to default locations when the deliver config lacks an entry
    try:
        self._meta.uppnex_project_root = self.app.config.get(
            "deliver", "uppnex_project_root")
    except Exception as e:
        self.log.warn(
            "{}, will use '/proj' as uppnext_project_root".format(e))
        self._meta.uppnex_project_root = '/proj'

    try:
        self._meta.uppnex_delivery_dir = self.app.config.get(
            "deliver", "uppnex_project_delivery_path")
    except Exception as e:
        self.log.warn(
            "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                e))
        self._meta.uppnex_delivery_dir = 'INBOX'

    destination_root = os.path.join(self._meta.uppnex_project_root,
                                    self.pargs.uppmax_project,
                                    self._meta.uppnex_delivery_dir)
    assert os.path.exists(
        destination_root
    ), "Delivery destination folder {} does not exist".format(
        destination_root)
    destination_root = os.path.join(destination_root, self.pargs.project)

    # If selecting interactively, keep only the samples the user confirms
    if self.pargs.interactive:
        to_process = []
        for sample in samples:
            sname = sample.get("project_sample_name")
            index = sample.get("sequence")
            fcid = sample.get("flowcell")
            lane = sample.get("lane")
            date = sample.get("date")
            self.log.info(
                "Sample: {}, Barcode: {}, Flowcell: {}, Lane: {}, Started on: {}"
                .format(sname, index, fcid, lane, date))
            if query_yes_no("Deliver sample?", default="no"):
                to_process.append(sample)
        samples = to_process

    # Find uncompressed fastq
    uncompressed = self._find_uncompressed_fastq_files(
        proj_base_dir, samples)
    if len(uncompressed) > 0:
        self.log.warn(
            "The following samples have uncompressed *.fastq files that cannot be delivered: {}"
            .format(",".join(uncompressed)))
        if not query_yes_no("Continue anyway?", default="no"):
            return

    self.log.info(
        "Will deliver data for {} samples from project {} to {}".format(
            len(samples), self.pargs.project, destination_root))
    if not query_yes_no("Continue?"):
        return

    # Get the list of files to transfer and the destination
    self.log.debug("Gathering list of files to copy")
    to_copy = self.get_file_copy_list(proj_base_dir, destination_root,
                                      samples)

    # Make sure that transfer will be with rsync
    if not self.pargs.rsync:
        self.log.warn("Files must be transferred using rsync")
        if not query_yes_no(
                "Do you wish to continue delivering using rsync?",
                default="yes"):
            return
        self.pargs.rsync = True

    # Process each sample run
    for sample_id, files in to_copy.items():
        # get the sample database object; exactly one sample must match
        [sample] = [s for s in samples if s.get('_id') == sample_id]
        self.log.info("Processing sample {} and flowcell {}".format(
            sample.get("project_sample_name", "NA"),
            sample.get("flowcell", "NA")))

        # calculate md5sums on the source side and write it on the destination
        # each entry of files is indexed as (source, destination, read)
        md5 = []
        for f in files:
            m = md5sum(f[0])
            mfile = "{}.md5".format(f[1])
            md5.append([m, mfile, f[2], f[0]])
            self.log.debug("md5sum for source file {}: {}".format(f[0], m))

        # transfer files
        self.log.debug("Transferring {} fastq files".format(len(files)))
        self._transfer_files([f[0] for f in files], [f[1] for f in files])

        # write the md5sum to a file at the destination and verify the transfer
        passed = True
        for m, mfile, read, srcpath in md5:
            dstfile = os.path.splitext(mfile)[0]
            self.log.debug("Writing md5sum to file {}".format(mfile))
            self.app.cmd.write(
                mfile, "{} {}".format(m, os.path.basename(dstfile)), True)
            self.log.debug("Verifying md5sum for file {}".format(dstfile))
            # if dry-run, make sure verification pass
            if self.pargs.dry_run:
                dm = m
            else:
                dm = md5sum(dstfile)
                self.log.debug("md5sum for destination file {}: {}".format(
                    dstfile, dm))
            if m != dm:
                self.log.warn(
                    "md5sum verification FAILED for {}. Source: {}, Target: {}"
                    .format(dstfile, m, dm))
                self.log.warn(
                    "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                    .format(dstfile))
                self.app.cmd.safe_unlink(dstfile)
                self.app.cmd.safe_unlink(mfile)
                passed = False
                continue

            # Modify the permissions to ug+rw
            for f in [dstfile, mfile]:
                self.app.cmd.chmod(
                    f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                    | stat.S_IWGRP)

        # touch the flag to trigger uppmax inbox permission fix
        self.app.cmd.safe_touchfile(
            os.path.join("/sw", "uppmax", "var", "inboxfix", "schedule",
                         self.pargs.uppmax_project))

        # log the transfer to statusdb if verification passed
        if passed:
            self.log.info(
                "Logging delivery to StatusDB document {}".format(sample_id))
            data = {
                'raw_data_delivery': {
                    'timestamp': utc_time(),
                    'files': {
                        'R{}'.format(read): {
                            'md5': m,
                            'path': os.path.splitext(mfile)[0],
                            'size_in_bytes':
                            self._getsize(os.path.splitext(mfile)[0]),
                            'source_location': srcpath
                        }
                        for m, mfile, read, srcpath in md5
                    },
                }
            }
            jsonstr = json.dumps(data)
            # The json log is written next to the first source file
            jsonfile = os.path.join(
                os.path.dirname(md5[0][3]),
                "{}_{}_{}_{}_L{}_raw_data_delivery.json".format(
                    sample.get("date"), sample.get("flowcell"),
                    sample.get("project_sample_name"),
                    sample.get("sequence"), sample.get("lane")))
            self.log.debug(
                "Writing delivery to json file {}".format(jsonfile))
            self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
            self.log.debug(
                "Saving delivery in StatusDB document {}".format(sample_id))
            sample.update(data)
            self._save(s_con, sample)
            self.log.debug(jsonstr)
class GDocsUpdater(rm.RunMonitor):
    """Keep the QC checklist spreadsheet on Google Docs in sync with the
    runs listed on the Trello board.
    """

    def __init__(self, config):
        """Connect to the QC checklist spreadsheet and to StatusDB.

        Reads the ``gdocs`` section of the configuration for the credentials
        file and the checklist name and the ``statusdb`` section for the
        project database connection. Raises AssertionError when the
        credentials, the spreadsheet or one of the worksheets is missing.
        """
        super(GDocsUpdater, self).__init__(config)

        # Connect to the Google Docs api
        gdconf = self.config.get("gdocs", {})
        creds = os.path.expanduser(gdconf.get("credentials_file", ""))
        assert os.path.exists(creds), "Supplied GDocs credentials file does not exist"
        self.gdcon = SpreadSheet(get_credentials(creds))
        assert self.gdcon, "Could not get a SpreadSheet object, please verify gdocs credentials"
        doc = gdconf.get("qc_checklist", None)
        assert doc, "No QC checklist specified in configuration, please specify"
        ssheet = self.gdcon.get_spreadsheet(doc)
        assert ssheet, "Could not locate QC checklist '{}' on Google Docs. Please make sure it exists".format(doc)
        self.gdcon.ssheet = ssheet

        # Get the Ongoing, Finished and Coming worksheets
        self.ongoing = self.gdcon.get_worksheet("Ongoing")
        self.coming = self.gdcon.get_worksheet("Coming")
        self.finished = self.gdcon.get_worksheet("Finished")
        assert self.ongoing and self.coming and self.finished, "Could not get 'Ongoing', 'Finished' and 'Coming' worksheets from '{}'. Please make sure that they exist".format(doc)

        # Get a connection to the StatusDB project database
        dbconf = self.config.get("statusdb", {})
        try:
            self.pcon = ProjectSummaryConnection(url=dbconf.get("url", "localhost"),
                                                 username=dbconf.get("user", "user"),
                                                 password=dbconf.get("password", "pass"))
        except ConnectionError:
            # Without a connection, project lookups return empty values
            self.pcon = None

    def _list_runs(self, lists):
        """Return a dictionary mapping card name to the card description
        (converted to a dictionary) for all cards in the given Trello lists.
        """
        # Loop over the lists and fetch the cards
        runs = {}
        for tlist in lists:
            list_obj = self.trello.get_list(self.trello_board, tlist, True)
            if not list_obj:
                continue

            # Loop over the cards in the list
            for card in list_obj.list_cards():
                # Get the description and convert it to a dictionary
                runs[card.name] = self.description_to_dict(card.description)

        return runs

    def coming_runs(self):
        """Return a dictionary with runs that are currently in process, i.e.
        not handed over to the processing pipeline on Uppmax. The key in the
        dictionary is the run id and the value is a metadata dictionary.
        """
        # Runs in these lists are to be considered "coming"
        lists = [rm.FIRSTREAD,
                 rm.INDEXREAD,
                 rm.SECONDREAD,
                 rm.PROCESSING,
                 rm.UPPMAX,
                 rm.STALLED]
        return self._list_runs(lists)

    def ongoing_runs(self):
        """Return a dictionary with runs that have finished and have been
        handed over to the processing pipeline on Uppmax. The key in the
        dictionary is the run id and the value is a metadata dictionary.
        """
        # Runs in these lists are to be considered "ongoing"
        lists = [rm.COMPLETED]
        return self._list_runs(lists)

    def reshape_run_info(self, runs, skiplist=None):
        """Take the dictionary of runs and convert it to a list of lists with
        elements corresponding to the columns in the checklist.

        One row is created per run and project. Rows whose "<run>_<project>"
        key is found in skiplist are left out.

        :param runs: dictionary mapping run id to a metadata dictionary
        :param skiplist: iterable of "<run>_<project>" keys to exclude

        :returns: list of [run id, project, application, type, '', run mode]
        """
        # A set gives constant-time membership tests and avoids sharing a
        # mutable default between calls
        skipped = set(skiplist) if skiplist is not None else set()
        run_projects = []
        for run_id, data in runs.items():
            p = data.get('Projects', [''])
            # A single project may be given as a plain value
            if not isinstance(p, list):
                p = [p]
            for project in p:
                if len(project) == 0:
                    project = 'Unknown, please check!'
                if "{}_{}".format(run_id, project) not in skipped:
                    # Application and type are filled in later by update_gdocs
                    application, tp = '', ''  # self.lookup_project(project)
                    run_projects.append([run_id, project, application, tp, '', data.get('Run mode', [''])[0]])

        return run_projects

    def lookup_project(self, project):
        """Lookup project application and type in StatusDB.

        :returns: tuple (application, type); both are empty strings when
            there is no database connection or no project document
        """
        application = ""
        project_type = ""
        if self.pcon:
            pdoc = self.pcon.get_entry(project)
            if pdoc:
                application = str(pdoc.get("application", ""))
                # The type is read from the top level or else from "details"
                project_type = str(pdoc.get("type", pdoc.get("details", {}).get("type", "")))
        return application, project_type

    def get_skiplist(self):
        """Get the runs and projects already listed in the finished worksheet
        of the GDocs spreadsheet as a list of "<run>_<project>" keys.
        """
        return ["{}_{}".format(run_project[0], run_project[1])
                for run_project in self.gdocs_finished_runs()]

    def gdocs_coming_runs(self):
        """Return the unique run/project rows of the Coming worksheet."""
        return self._get_gdocs_run_projects(self.coming, COMING_HEADER_OFFSET)

    def gdocs_ongoing_runs(self):
        """Return the unique run/project rows of the Ongoing worksheet."""
        return self._get_gdocs_run_projects(self.ongoing, ONGOING_HEADER_OFFSET)

    def gdocs_finished_runs(self):
        """Return the unique run/project rows of the Finished worksheet."""
        return self._get_gdocs_run_projects(self.finished, FINISHED_HEADER_OFFSET)

    def _get_gdocs_run_projects(self, wsheet, header_offset):
        """Return a list with the rows of the worksheet, as lists of strings,
        keeping only the first row for each combination of the two first
        columns and skipping rows with an empty first column.
        """
        # Get the cell data
        run_projects = {}
        rows = self.gdcon.get_cell_content(wsheet, header_offset, 1, 0, 6)
        for row in rows:
            if len(str(row[0])) == 0:
                continue
            data = [str(r) for r in row]
            key = "{}{}".format(data[0], data[1])
            if key in run_projects:
                continue
            run_projects[key] = data

        # Only return unique rows; a real list so callers can concatenate
        return list(run_projects.values())

    def update_gdocs(self):
        """Add new runs from Trello to the Coming and Ongoing worksheets, move
        runs that have become ongoing from the Coming to the Ongoing
        worksheet, fill in empty application and type fields from StatusDB
        and print an overview of both worksheets to stdout.
        """
        # Get the coming runs from Trello but exclude runs that are already in gdocs
        gdocs_finished = self.gdocs_finished_runs()
        gdocs_ongoing = self.gdocs_ongoing_runs()
        gdocs_coming = self.gdocs_coming_runs()
        trello_coming = self.reshape_run_info(
            self.coming_runs(),
            ["{}_{}".format(r[0], r[1]) for r in gdocs_finished + gdocs_ongoing + gdocs_coming])

        # Get the ongoing runs from Trello but exclude runs that are already in the finished or ongoing tab
        trello_ongoing = self.reshape_run_info(
            self.ongoing_runs(),
            ["{}_{}".format(r[0], r[1]) for r in gdocs_finished + gdocs_ongoing])

        # Add each coming run to the next empty row
        for run in trello_coming:
            self.update_empty_row(self.coming, run, COMING_HEADER_OFFSET)

        # Move each run from coming if it exists there to the ongoing tab or just add it
        for run in trello_ongoing:
            status = self.run_project_match(run, gdocs_coming)
            if status == 0:
                self.update_empty_row(self.ongoing, run, ONGOING_HEADER_OFFSET)
                continue

            # Find the row index of the run in the coming tab
            row_index = self.gdcon.get_row_index(self.coming, run[0:2], COMING_HEADER_OFFSET)
            # Get the data from the coming tab, add it to an empty row in the ongoing tab and replace it with empty values
            row_data = self.gdcon.get_cell_content(self.coming, row_index, 0, row_index, 0)
            self.update_empty_row(self.ongoing, row_data[0], ONGOING_HEADER_OFFSET)
            self.gdcon.update_row(self.coming, row_index, [""] * len(row_data[0]))

        def last_name(data):
            # Sort key: the part of the project name after the first '.',
            # or the whole name when it contains no '.'
            pcs = data[1].split('.')
            if len(pcs) == 1:
                return pcs[0]
            return "".join(pcs[1:])

        # Lastly, update the application and type fields in gdocs if they are empty
        for wsheet, offset in [(self.coming, COMING_HEADER_OFFSET), (self.ongoing, ONGOING_HEADER_OFFSET)]:
            # Print a reader-friendly text to stdout
            print("{}\n{}\n".format(wsheet.title.text, '-' * len(wsheet.title.text)))
            for run in sorted(self._get_gdocs_run_projects(wsheet, offset), key=last_name):
                if len(run) < 4:
                    continue
                if run[2] == "" or run[3] == "":
                    app, tp = self.lookup_project(run[1])
                    if run[2] == "":
                        run[2] = app
                    if run[3] == "":
                        run[3] = tp
                    row_index = self.gdcon.get_row_index(wsheet, run[0:2], offset)
                    self.gdcon.update_row(wsheet, row_index, run[0:4])

                print("{} - {}{}".format(run[1], "{} - ".format(run[3]) if len(run[3]) > 0 else "", run[4]))
                print("{}{}\n".format("{}\n".format(run[2]) if len(run[2]) > 0 else "", run[0]))

    def update_empty_row(self, wsheet, data, offset, merged=False):
        """Update the next empty row after the specified offset with the
        supplied data.

        :param wsheet: the worksheet to update
        :param data: list with the cell values to write
        :param offset: row index to start searching from
        :param merged: if True, require two consecutive empty rows and write
            the data to both

        :returns: the result of updating the first row
        """
        updated = False
        empty_row = [""] * len(data)

        # Require two empty rows in succession
        row_index = offset
        r2 = row_index
        while r2 - row_index != 1:
            row_index = self.gdcon.get_row_index(wsheet, empty_row, r2)
            # If we're writing a merged row, we need two consecutive empty rows
            if merged:
                r2 = self.gdcon.get_row_index(wsheet, empty_row, row_index + 1)
            else:
                r2 = row_index + 1

        assert row_index > 0, "***ERROR*** No more rows left in spreadsheet"
        updated = self.gdcon.update_row(wsheet, row_index, data)
        # FIXME: do this better.. if the row is merged, write the same data to the second "hidden" row
        if merged:
            self.gdcon.update_row(wsheet, row_index + 1, data)

        return updated

    def run_project_match(self, needle, haystack):
        """Check if a run and project exist in a list of lists.

        Identity is determined by the two first columns in each list; the
        third and fourth are compared to determine if they need updating.
        Lists with fewer than four elements never match.

        :returns: 0 for no match, 1 for a match that needs updating and 2 for
            a match that does not need updating
        """
        if len(needle) < 4:
            return 0

        for straw in haystack:
            if len(straw) < 4:
                continue
            if needle[0] != straw[0] or needle[1] != straw[1]:
                continue
            if needle[2] != straw[2] or needle[3] != straw[3]:
                return 1
            return 2

        return 0
class TestDbConnection(unittest.TestCase):
    """Tests of the StatusDB connection classes against the local test
    databases ("samples-test", "projects-test" and "flowcells-test").
    """

    def setUp(self):
        self.user = "******"
        self.pw = "pw"
        self.url = "localhost"
        # Names of documents in the test databases that the tests query for
        self.examples = {
            "sample": "1_120924_AC003CCCXX_TGACCA",
            "flowcell": "AC003CCCXX",
            "project": "J.Doe_00_01"
        }
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username=self.user,
                                              password=self.pw,
                                              url=self.url)

    def _sample_con(self):
        """Return a new connection to the sample run metrics test database."""
        return SampleRunMetricsConnection(dbname="samples-test",
                                          username=self.user,
                                          password=self.pw,
                                          url=self.url)

    def test_connection(self):
        """Test database connection"""
        sample_con = self._sample_con()
        self.assertEqual(sample_con.url_string,
                         "http://{}:5984".format(self.url))

    def test_get_flowcell(self):
        """Test getting a flowcell for a given sample"""
        sample_con = self._sample_con()
        fc = sample_con.get_entry(self.examples["sample"], "flowcell")
        self.assertEqual(str(fc), self.examples["flowcell"])

    def test_get_sample_ids(self):
        """Test getting sample ids given flowcell and sample_prj"""
        sample_con = self._sample_con()
        sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
        LOG.info("Number of samples before subsetting: " +
                 str(len(sample_ids)))
        self.assertEqual(len(sample_ids), 5)
        sample_ids = sample_con.get_sample_ids(
            fc_id=self.examples["flowcell"],
            sample_prj=self.examples["project"])
        LOG.info("Number of samples after subsetting: " +
                 str(len(sample_ids)))
        self.assertEqual(len(sample_ids), 2)

    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        sample_con = self._sample_con()

        samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 5)
        samples = sample_con.get_samples(fc_id=self.examples["flowcell"],
                                         sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " +
                 str(len(samples)))
        self.assertEqual(len(samples), 2)

        samples = sample_con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 3)
        samples = sample_con.get_samples(sample_prj=self.examples["project"],
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " +
                 str(len(samples)))
        self.assertEqual(len(samples), 2)

    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        sample_con = self._sample_con()

        samples = sample_con.get_samples(sample_prj="bogusproject",
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " +
                 str(len(samples)))
        self.assertEqual(len(samples), 0)

    def test_get_project_sample_ids(self):
        """Test getting project sample ids"""
        sample_con = self._sample_con()
        sample_ids = sample_con.get_sample_ids(
            sample_prj=self.examples["project"])
        sample_names = [sample_con.db.get(x)["name"] for x in sample_ids]
        self.assertEqual(
            set(sample_names),
            set([
                '1_120924_AC003CCCXX_TGACCA', '2_120924_AC003CCCXX_ACAGTG',
                '1_121015_BB002BBBXX_TGACCA'
            ]))

    def test_get_latest_library_prep(self):
        """Test getting latest library prep"""
        # Temporarily add a prep "B" to sample P001_102 in the test database
        prj = self.p_con.get_entry("J.Doe_00_01")
        prj['samples']['P001_102']['library_prep']['B'] = {
            'sample_run_metrics': {
                '2_120924_AC003CCCXX_TTGGAA': None
            }
        }
        self.p_con.save(prj)
        try:
            preps = self.p_con.get_latest_library_prep(
                project_name=self.examples["project"])
            srm = [x for l in preps.values() for x in l]
            # Make sure A prep not in list
            self.assertNotIn('2_120924_AC003CCCXX_ACAGTG', srm)
            # Make sure B prep in list
            self.assertIn('2_120924_AC003CCCXX_TTGGAA', srm)
        finally:
            # Reset data even when an assertion fails, so that the extra
            # prep does not leak into other tests using the same database
            prj = self.p_con.get_entry("J.Doe_00_01")
            del prj['samples']['P001_102']['library_prep']['B']
            self.p_con.save(prj)

    def test_get_barcode_lane_statistics(self):
        """Test getting barcode lane statistics from flowcell database"""
        fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                              username="******",
                                              password="******")
        # Try getting wrong sample name, should return a pair of None values
        data = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                                  "P001_101_index6",
                                                  "120924_AC003CCCXX", "1")
        self.assertEqual(data, (None, None))
        data = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                                  "P001_101_index3",
                                                  "120924_AC003CCCXX", "1")
        self.assertEqual(data, (u'35.22', u'90.05'))
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=[], projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, is_paired=True, **kw):
    """Make a sample status note.

    One parameter dictionary is collected per sample run on the flowcell; a
    note is rendered for each of them and the notes are concatenated into a
    project/flowcell summary (pdf and rst).

    Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode read count override, passed through _literal_eval_option
    :param project_alias: project alias name; only forwarded to _set_sample_run_list
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end
        (this argument is not read below; the value reported per sample comes
        from fc_con.is_paired_end)
    :param kw: additional options; "instrument_config" (path to the instrument
        configuration) and "force" (forwarded to query_ok) are read here

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO buffers
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    instrument = _parse_instrument_config(os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane",
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    if not _assert_flowcell_format(flowcell):
        # The backslash is escaped explicitly; the emitted text is unchanged.
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\\-]+\")".format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_key] for key, srm_key in srm_to_parameter.items()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument. Only ordinary errors trigger the fallback to the
        # 'default' instrument entry; KeyboardInterrupt/SystemExit propagate.
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])

        # Get run mode; the flowcell document is only fetched again when the
        # flowcell differs from the one used for the previous sample
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice", "") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"), runp.get("ApplicationVersion"), s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True
        # software_versions is not defined in this function (presumably module level)
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)

        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount', p_con.get_ordered_amount(project_name, samples=p_con.get_entry(project_name, 'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)
            # Always normalize submitted id, since module textttable does not support unicode
            if isinstance(s_param['customer_name'], unicode):
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        # Lane-specific file name only when the barcode name occurs more than once
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs': {run["name"]: run["barcode_name"] for run in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    # `s` is the last sample run of the loop above; its date and flowcell name the summary files
    make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=[], projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, **kw):
    """Make a sample status note.

    One parameter dictionary is collected per sample run on the flowcell; a
    note is rendered for each of them and the notes are concatenated into a
    project/flowcell summary (pdf and rst).

    Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode read count override, passed through _literal_eval_option
    :param project_alias: project alias name; only forwarded to _set_sample_run_list
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param phix: phix error rate
    :param kw: additional options; "force" (forwarded to query_ok) is read here

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO buffers
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")".format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_key] for key, srm_key in srm_to_parameter.items()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument. `instrument` is not defined in this function
        # (presumably a module-level mapping). Only ordinary errors trigger
        # the fallback to its 'default' entry; KeyboardInterrupt/SystemExit
        # propagate.
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument['default'])

        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        # software_versions is not defined in this function (presumably module level)
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Calculation of average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        # Lane-specific file name only when the barcode name occurs more than once
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs': {run["name"]: run["barcode_name"] for run in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    # `s` is the last sample run of the loop above; its date and flowcell name the summary files
    make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
def project_status_note(project_name=None, username=None, password=None, url=None, use_ps_map=True, use_bc_map=False, check_consistency=False, ordered_million_reads=None, uppnex_id=None, customer_reference=None, exclude_sample_ids={}, project_alias=None, sample_aliases={}, projectdb="projects", samplesdb="samples", flowcelldb="flowcells", include_all_samples=False, **kw):
    """Make a project status note.

    Builds a per-sample table (ScilifeID, CustomerID, BarcodeSeq, MSequenced,
    MOrdered, Status) for the project, renders it as a pdf note and an rst
    note, and writes the collected parameters and table as JSON to the
    'debug' buffer.

    Used keywords:

    :param project_name: project name
    :param username: db user name
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping (not read in this function body)
    :param use_bc_map: use project to barcode name mapping (not read in this function body)
    :param check_consistency: check consistency between mappings (not read in this function body)
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO
        buffers, or None if the project is not found in the project database
    """
    # parameters
    parameters = {
        "project_name" : project_name,
        "finished" : "Not finished, or cannot yet assess if finished.",
        }
    # mapping project_summary to parameter keys
    ps_to_parameter = {"scilife_name":"scilife_name", "customer_name":"customer_name", "project_name":"project_name"}
    # mapping project sample to table
    table_keys = ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status']

    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable (an alias of `parameters`, not a copy)
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        # NOTE: returns None here, not output_data
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None, project_alias=project_alias, s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get("scilife_name", None)
            s_d = {s["name"] : {'sample':sample_name, 'id':s["_id"]}}
            samples.update(s_d)
        else:
            # No project sample: fall back on a user-supplied alias for the barcode name
            if s["barcode_name"] in sample_aliases:
                s_d = {sample_aliases[s["barcode_name"]] : {'sample':sample_aliases[s["barcode_name"]], 'id':s["_id"]}}
                samples.update(s_d)
            else:
                # Unmapped sample runs are only reported; they are not added to `samples`
                s_d = {s["name"]:{'sample':s["name"], 'id':s["_id"], 'barcode_name':s["barcode_name"]}}
                LOG.warn("No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({key:prj_summary.get(ps_to_parameter[key], None) for key in ps_to_parameter.keys()})
    param["ordered_amount"] = param.get("ordered_amount", p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get('customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id', prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    # Flatten the per-sample entries into one list of sample run metrics names
    last_library_preps_srm = [x for l in last_library_preps.values() for x in l]
    LOG.debug("Looping through sample map that maps project sample names to sample run metrics ids")
    for k,v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info("No library prep information for sample {}; keeping in report".format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info("Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report".format(k, v["id"], last_library_preps[v['sample']].values()[0], v['sample']))
                    continue
                else:
                    pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample, barcode_seq, ordered_million_reads, param)
        # A single sample with status "N/A" or "NP" keeps the project from being reported as finished
        if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            # One table row per sample run metrics entry registered for the sample
            for k,v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
                if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            # No sample run metrics at all: add a row without barcode sequence
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample, barcode_seq, ordered_million_reads, param)
            if vals['Status']=="N/A" or vals['Status']=="NP": all_passed = False
            sample_table.append([vals[k] for k in table_keys])
    if all_passed: param["finished"] = 'Project finished.'
    # Sort, then collapse runs of identical rows (groupby only merges adjacent duplicates)
    sample_table.sort()
    sample_table = list(sample_table for sample_table,_ in itertools.groupby(sample_table))
    # Header row goes first
    sample_table.insert(0, ['ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered', 'Status'])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers, paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name), sample_table=sample_table, report="project_report", **param)
    # "N/A" substitution happens after the notes are made, so it only affects the debug output below
    param.update({k:"N/A" for k in param.keys() if param[k] is None or param[k] == ""})
    output_data["debug"].write(json.dumps({'param':param, 'table':sample_table}))
    return output_data
def data_delivery_note(**kw):
    """Create an easily parseable information file with information about the data delivery.

    One row per delivered file is collected for every sample returned by the
    sample database for the project (optionally restricted to a flowcell).
    The rows are written to "<project>[_<flowcell>]_data_delivery.csv" in the
    current working directory and drawn as a Texttable to the 'stdout' buffer.

    :param kw: keyword arguments; 'project_name' and 'flowcell' are read here,
        and the complete dictionary is forwarded to ProjectSummaryConnection
        and SampleRunMetricsConnection

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO buffers
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO(), 'debug': StringIO()}

    project_name = kw.get('project_name', None)
    flowcell = kw.get('flowcell', None)
    # Only mention the flowcell when one was given (same pattern as the
    # output file name below).
    flowcell_label = ' and flowcell {}'.format(flowcell) if flowcell else ''
    LOG.debug("Generating data delivery note for project {}{}.".format(project_name, flowcell_label))

    # Get a connection to the project and sample databases
    p_con = ProjectSummaryConnection(**kw)
    assert p_con, "Could not connect to project database"
    s_con = SampleRunMetricsConnection(**kw)
    assert s_con, "Could not connect to sample database"

    # Get the entry for the project and samples from the database
    LOG.debug("Fetching samples from sample database")
    samples = s_con.get_samples(sample_prj=project_name, fc_id=flowcell)
    LOG.debug("Got {} samples from database".format(len(samples)))

    # Get the customer sample names from the project database
    LOG.debug("Fetching samples from project database")
    project_samples = p_con.get_entry(project_name, "samples")
    if not project_samples:
        # Without project samples every submitted id is reported as 'N/A'
        # instead of failing on a missing entry.
        LOG.warn("No samples found for project '{}' in project database".format(project_name))
        project_samples = {}
    customer_names = {sample_name: sample.get('customer_name', 'N/A') for sample_name, sample in project_samples.items()}

    data = [['SciLifeLab ID', 'Submitted ID', 'Flowcell', 'Lane', 'Barcode', 'Read', 'Path', 'MD5', 'Size (bytes)', 'Timestamp']]
    for sample in samples:
        sname = sample.get('project_sample_name', 'N/A')
        cname = customer_names.get(sname, 'N/A')
        fc = sample.get('flowcell', 'N/A')
        lane = sample.get('lane', 'N/A')
        barcode = sample.get('sequence', 'N/A')
        if 'raw_data_delivery' not in sample:
            # No delivery recorded: keep the sample in the table with empty columns
            data.append([sname, cname, '', '', '', '', '', '', '', ''])
            continue
        delivery = sample['raw_data_delivery']
        tstamp = delivery.get('timestamp', 'N/A')
        for read, file_info in delivery.get('files', {}).items():
            data.append([sname,
                         cname,
                         fc,
                         lane,
                         barcode,
                         read,
                         file_info.get('path', 'N/A'),
                         file_info.get('md5', 'N/A'),
                         file_info.get('size_in_bytes', 'N/A'),
                         tstamp])

    # Write the data to a csv file
    outfile = "{}{}_data_delivery.csv".format(project_name, '_{}'.format(flowcell) if flowcell else '')
    LOG.debug("Writing delivery data to {}".format(outfile))
    with open(outfile, "w") as outh:
        csv.writer(outh).writerows(data)

    # Write Texttable formatted output to stdout
    tt = texttable.Texttable(180)
    tt.add_rows(data)
    output_data['stdout'].write(tt.draw())

    return output_data
def application_qc(project_name=None, flowcell=None, application=None,
                   username=None, password=None, url=None,
                   sampledb="samples", projectdb="projects", **kw):
    """Perform application specific qc on a project.

    When the project has a project summary entry, qc data is fetched with
    get_qc_data and the project's own application, if it is a key of
    APPLICATION_MAP, takes precedence over the *application* argument. When
    there is no project summary entry, *application* is required and qc data
    is fetched with _get_sample_qc_data.

    :param project_name: project name
    :param flowcell: flowcell identifier
    :param application: application for which to perform qc
    :param username: database username
    :param password: database password
    :param url: database url
    :param sampledb: samples database name
    :param projectdb: project database name

    :returns: dictionary with 'stdout' and 'stderr' StringIO buffers; one line
        per qc data entry is written to 'stdout'
    """
    LOG.debug("Doing application qc for project {}, flowcell {}".format(project_name, flowcell))

    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)
    s_con = SampleRunMetricsConnection(dbname=sampledb, username=username, password=password, url=url)
    prj_summary = p_con.get_entry(project_name)

    if prj_summary is not None:
        # Fetched once, and only when the project exists
        qc_data = get_qc_data(project_name, p_con, s_con, flowcell)
        prj_application = prj_summary.get("application")
        if prj_application not in APPLICATION_MAP.keys():
            if not application:
                # Report the project's unrecognised application; the
                # `application` argument is empty in this branch.
                LOG.warn("No such application {}. Please use the application option (available choices {})".format(prj_application, ",".join(QC_CUTOFF.keys())))
                return output_data
        else:
            application = APPLICATION_MAP[prj_application]
    else:
        LOG.info("No such project {} in project summary. Trying to get qc data anyway.".format(project_name))
        if not application:
            LOG.warn("No application provided. Please use the application option (available choices {})".format(",".join(QC_CUTOFF.keys())))
            return output_data
        qc_data = _get_sample_qc_data(project_name, application, s_con, flowcell)

    output_data = _qc_info_header(project_name, application, output_data)
    # Entries are emitted in sorted key order
    for k, v in sorted(qc_data.iteritems()):
        y = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(y) + "\n")
    return output_data
def sample_status_note(project_name=None, flowcell=None, username=None,
                       password=None, url=None, ordered_million_reads=None,
                       uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=[], projectdb="projects",
                       samplesdb="samples", flowcelldb="flowcells",
                       phix=None, is_paired=True, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: read count override; evaluated with
        _literal_eval_option and, when set, used instead of the database
        value
    :param project_alias: project alias name (only passed on to
        _set_sample_run_list; the default list is never modified here)
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcell database name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end.
        NOTE(review): this argument is not read in the function body; the
        value is taken from fc_con.is_paired_end() with True as fallback.
    :param kw: extra options; "instrument_config" and "force" are read

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO
        buffers. Returned early (without notes being made) if the flowcell
        format is wrong, the project is missing or no sample runs are found.
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    instrument = _parse_instrument_config(
        os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\\-]+\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username,
                                       password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username,
                                     password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if not sample_run_list:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several,
    # make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None  # last fetched flowcell document, reused between samples
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'"
            .format(s.get("barcode_name", None), s.get("name", None),
                    s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_key]
                        for key, srm_key in srm_to_parameter.items()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument. Best effort: any lookup failure falls back on the
        # 'default' instrument entry, but interpreter-exit signals
        # (KeyboardInterrupt/SystemExit) are no longer swallowed.
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument_dict['default'])

        # Get run mode; only refetch the flowcell document when the
        # flowcell differs from the previous sample's
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        s_param["sequencing_platform"] = (
            "MiSeq" if "MCSVersion" in runp else "HiSeq2500")
        s_param["clustering_method"] = (
            "onboard clustering"
            if runp.get("ClusteringChoice", "") == "OnBoardClustering"
            or s_param["sequencing_platform"] == "MiSeq"
            else "cBot")
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(
            runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(
                runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(
                runp.get("ApplicationName"), runp.get("ApplicationVersion"),
                s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn(
                "Could not determine run setup for flowcell {}. Will assume paired-end."
                .format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)

        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score,
         pct_q30_bases) = fc_con.get_barcode_lane_statistics(
             project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = (
            avg_quality_score if avg_quality_score else calc_avg_qv(s))
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Setting average quality failed for sample {}, id {}".format(
                    s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn(
                "Setting % of >= Q30 Bases (PF) failed for sample {}, id {}"
                .format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project
        # defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount',
            p_con.get_ordered_amount(
                project_name,
                samples=p_con.get_entry(project_name, 'samples')))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get(
            'uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run
            # metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(
                project_sample_item, source)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database:
            # if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

            # Always normalize submitted id, since module textttable does
            # not support unicode
            if isinstance(s_param['customer_name'], unicode):
                s_param['customer_name'] = unicodedata.normalize(
                    'NFKD', s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        # Replace missing values (None, "" or -1.0) with "N/A"
        s_param.update({
            k: "N/A"
            for k, val in s_param.items()
            if val is None or val == "" or val == -1.0
        })
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files.
    # NOTE: `s` below is the last sample run of the loop above; its date and
    # flowcell name the summary files.
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs': {
                run["name"]: run["barcode_name"] for run in sample_run_list
            }
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name,
                                             s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes,
        "{}_{}_{}_sample_summary.pdf".format(project_name,
                                             s.get("date", None),
                                             s.get("flowcell", None)))
    return output_data
def sample_status_note(project_name=None, flowcell=None, username=None,
                       password=None, url=None, ordered_million_reads=None,
                       uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=[], projectdb="projects",
                       samplesdb="samples", flowcelldb="flowcells",
                       phix=None, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: read count override; evaluated with
        _literal_eval_option and, when set, used instead of the database
        value
    :param project_alias: project alias name (only passed on to
        _set_sample_run_list; the default list is never modified here)
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcell database name
    :param phix: phix error rate
    :param kw: extra options; "force" is read

    :returns: dictionary with 'stdout', 'stderr' and 'debug' StringIO
        buffers. Returned early (without notes being made) if the flowcell
        format is wrong, the project is missing or no sample runs are found.
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username,
                                       password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username,
                                     password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if not sample_run_list:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several,
    # make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'"
            .format(s.get("barcode_name", None), s.get("name", None),
                    s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_key]
                        for key, srm_key in srm_to_parameter.items()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))

        # Get instrument. Best effort: any lookup failure falls back on the
        # 'default' instrument entry, but interpreter-exit signals
        # (KeyboardInterrupt/SystemExit) are no longer swallowed.
        # `instrument` is not defined in this function; it is looked up in
        # the enclosing (module) scope.
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument['default'])

        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Calculation of average quality failed for sample {}, id {}"
                .format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project
        # defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get(
            'uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run
            # metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database:
            # if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        # Replace missing values (None, "" or -1.0) with "N/A"
        s_param.update({
            k: "N/A"
            for k, val in s_param.items()
            if val is None or val == "" or val == -1.0
        })
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files.
    # NOTE: `s` below is the last sample run of the loop above; its date and
    # flowcell name the summary files.
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs': {
                run["name"]: run["barcode_name"] for run in sample_run_list
            }
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name,
                                             s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes,
        "{}_{}_{}_sample_summary.pdf".format(project_name,
                                             s.get("date", None),
                                             s.get("flowcell", None)))
    return output_data
def _project_status_note_table(project_name=None, username=None,
                               password=None, url=None, use_ps_map=True,
                               use_bc_map=False, check_consistency=False,
                               ordered_million_reads=None, uppnex_id=None,
                               customer_reference=None, exclude_sample_ids={},
                               project_alias=None, sample_aliases={},
                               projectdb="projects", samplesdb="samples",
                               flowcelldb="flowcells",
                               include_all_samples=False, param=None, **kw):
    """Collect the sample table and parameters for a project status note.

    :param project_name: project name
    :param username: db username
    :param password: db password
    :param url: db url
    :param use_ps_map: not read in this function body
    :param use_bc_map: not read in this function body
    :param check_consistency: not read in this function body
    :param ordered_million_reads: ordered reads option; evaluated with
        _literal_eval_option and passed to _set_sample_table_values
    :param uppnex_id: overrides the uppnex id from the database if set
    :param customer_reference: overrides the customer reference from the
        database if set
    :param exclude_sample_ids: evaluated with _literal_eval_option and
        passed to _exclude_sample_id
    :param project_alias: passed on to _set_sample_run_list
    :param sample_aliases: mapping from barcode name to sample name, used
        for sample runs that have no project sample
    :param projectdb: project database name
    :param samplesdb: samples database name
    :param flowcelldb: flowcell database name
    :param include_all_samples: if False, sample runs that are not the
        latest library prep for their project sample are left out
    :param param: dictionary of note parameters, updated in place. A new
        dictionary is created for each call when omitted, so values no
        longer carry over between calls.

    :returns: None if the project is not found; otherwise the tuple
        (output_data, sample_table, param), where sample_table is a sorted
        list of unique rows preceded by a header row
    """
    if param is None:
        param = {}

    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username,
                                       password=password, url=url)
    # fc_con is not used further below; the connection is still opened so
    # that connection behaviour is unchanged.
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username,
                                     password=password, url=url)

    # Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary'][
            'all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping
    # sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name, flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                # Unmapped runs are only reported, not added to `samples`
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(summary_key, None)
        for key, summary_key in ps_to_parameter.items()
    })
    param["ordered_amount"] = param.get(
        "ordered_amount",
        p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details'][
                'customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get(
                'customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(
                            k, v["id"], ",".join(
                                list(
                                    set(last_library_preps[
                                        v['sample']].values()))),
                            v['sample']))
                    continue

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        sample_table.append([vals[key] for key in table_keys])

    # Loop through samples in sample_dict for which there is no sample run
    # information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table
                                             ])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics
        # name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                sample_table.append([vals[key] for key in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq,
                                            ordered_million_reads, param)
            sample_table.append([vals[key] for key in table_keys])
    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'
    # Sort, then drop duplicate rows (groupby collapses equal neighbours)
    sample_table.sort()
    sample_table = [row for row, _ in itertools.groupby(sample_table)]
    sample_table.insert(
        0, ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])
    return output_data, sample_table, param
class TestDbConnection(unittest.TestCase):
    """Tests of the database connection classes.

    The tests connect to the "samples-test", "projects-test" and
    "flowcells-test" databases and compare query results with the example
    sample, flowcell and project defined in setUp.
    """

    def setUp(self):
        """Set credentials, example identifiers and a project connection."""
        self.user = "******"
        self.pw = "pw"
        self.url = "localhost"
        self.examples = {"sample": "1_120924_AC003CCCXX_TGACCA",
                         "flowcell": "AC003CCCXX",
                         "project": "J.Doe_00_01"}
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username=self.user,
                                              password=self.pw,
                                              url=self.url)

    def test_connection(self):
        """Test database connection"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)
        self.assertEqual(sample_con.url_string,
                         "http://{}:5984".format(self.url))

    def test_get_flowcell(self):
        """Test getting a flowcell for a given sample"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)
        fc = sample_con.get_entry(self.examples["sample"], "flowcell")
        self.assertEqual(str(fc), self.examples["flowcell"])

    def test_get_sample_ids(self):
        """Test getting sample ids given flowcell and sample_prj"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)
        sample_ids = sample_con.get_sample_ids(fc_id=self.examples["flowcell"])
        LOG.info("Number of samples before subsetting: " + str(len(sample_ids)))
        self.assertEqual(len(sample_ids), 5)
        sample_ids = sample_con.get_sample_ids(
            fc_id=self.examples["flowcell"],
            sample_prj=self.examples["project"])
        LOG.info("Number of samples after subsetting: " + str(len(sample_ids)))
        self.assertEqual(len(sample_ids), 2)

    def test_get_samples(self):
        """Test getting samples given flowcell and sample_prj."""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)

        samples = sample_con.get_samples(fc_id=self.examples["flowcell"])
        LOG.info("Selecting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 5)
        samples = sample_con.get_samples(fc_id=self.examples["flowcell"],
                                         sample_prj=self.examples["project"])
        LOG.info("Selecting on flowcell, subsetting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 2)

        samples = sample_con.get_samples(sample_prj=self.examples["project"])
        LOG.info("Selecting on project: " + str(len(samples)))
        self.assertEqual(len(samples), 3)
        samples = sample_con.get_samples(sample_prj=self.examples["project"],
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on project, subsetting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 2)

    def test_get_samples_wrong_info(self):
        """Test getting samples when either flowcell or project id information is wrong"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)

        samples = sample_con.get_samples(sample_prj="bogusproject",
                                         fc_id=self.examples["flowcell"])
        LOG.info("Selecting on bogus project, subsetting on flowcell: " + str(len(samples)))
        self.assertEqual(len(samples), 0)

    def test_get_project_sample_ids(self):
        """Test getting project sample ids"""
        sample_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username=self.user,
                                                password=self.pw,
                                                url=self.url)
        sample_ids = sample_con.get_sample_ids(
            sample_prj=self.examples["project"])
        sample_names = [sample_con.db.get(x)["name"] for x in sample_ids]
        self.assertEqual(set(sample_names),
                         set(['1_120924_AC003CCCXX_TGACCA',
                              '2_120924_AC003CCCXX_ACAGTG',
                              '1_121015_BB002BBBXX_TGACCA']))

    def test_get_latest_library_prep(self):
        """Test getting latest library prep"""
        # Temporarily add a 'B' library prep to the stored project document
        prj = self.p_con.get_entry("J.Doe_00_01")
        prj['samples']['P001_102']['library_prep']['B'] = {
            'sample_run_metrics': {'2_120924_AC003CCCXX_TTGGAA': None}}
        self.p_con.save(prj)
        try:
            preps = self.p_con.get_latest_library_prep(
                project_name=self.examples["project"])
            srm = [x for l in preps.values() for x in l]
            # Make sure A prep not in list
            self.assertNotIn('2_120924_AC003CCCXX_ACAGTG', srm)
            # Make sure B prep in list
            self.assertIn('2_120924_AC003CCCXX_TTGGAA', srm)
        finally:
            # Reset data, also when an assertion above fails, so that the
            # modified document does not leak into other tests
            prj = self.p_con.get_entry("J.Doe_00_01")
            del prj['samples']['P001_102']['library_prep']['B']
            self.p_con.save(prj)

    def test_get_barcode_lane_statistics(self):
        """Test getting barcode lane statistics from flowcell database"""
        fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                              username="******",
                                              password="******")
        # Try getting wrong sample name, should return None
        data = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                                  "P001_101_index6",
                                                  "120924_AC003CCCXX", "1")
        self.assertEqual(data, (None, None))
        data = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                                  "P001_101_index3",
                                                  "120924_AC003CCCXX", "1")
        self.assertEqual(data, (u'35.22', u'90.05'))