Example #1
 def test_dbcon(self):
     """Test database connection and that we get expected values."""
     s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     samples = [s_con.get_entry(x) for x in s_con.name_view]
     samples_d = {x["name"]: x for x in samples}
     self.assertEqual(samples_d["1_120924_AC003CCCXX_TGACCA"]["date"], "120924")
     self.assertEqual(samples_d["1_121015_BB002BBBXX_TGACCA"]["flowcell"], "BB002BBBXX")
     self.assertEqual(samples_d["2_120924_AC003CCCXX_ACAGTG"]["entity_type"], "sample_run_metrics")
     self.assertEqual(samples_d["3_120924_AC003CCCXX_ACAGTG"]["lane"], "3")
     self.assertEqual(samples_d["4_120924_AC003CCCXX_CGTTAA"]["sequence"], "CGTTAA")
     self.assertEqual(samples_d["2_121015_BB002BBBXX_TGACCA"]["project_id"], "P002")
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
     flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
     flowcells_d = {x["name"]: x for x in flowcells}
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["name"], "120924_AC003CCCXX")
     self.assertEqual(flowcells_d["121015_BB002BBBXX"]["name"], "121015_BB002BBBXX")
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["entity_type"], "flowcell_run_metrics")
     p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     projects = [p_con.get_entry(x) for x in p_con.name_view]
     projects_d = {x["project_name"]: x for x in projects}
     self.assertEqual(projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
     self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
     self.assertEqual(
         set(projects_d["J.Doe_00_01"]["samples"].keys()), set(["P001_101_index3", "P001_102", "P001_103"])
     )
     self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"], "GnuGenome")
     self.assertEqual(projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
     self.assertEqual(projects_d["J.Doe_00_03"]["samples"].keys(), ["3_index6"])
     self.assertIn("A", projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
Example #2
    def test_2_make_note(self):
        """Make a note subset by example flowcell and project"""
        s_con = SampleRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
        fc_con = FlowcellRunMetricsConnection(username=self.user, password=self.pw, url=self.url)
        p_con = ProjectSummaryConnection(username=self.user, password=self.pw, url=self.url)
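        # Open connections to the sample, flowcell and project databases in statusdb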
        paragraphs = sample_note_paragraphs()
        headers = sample_note_headers()
        samples = s_con.get_samples(self.examples["flowcell"], self.examples["project"])
        project = p_con.get_entry(self.examples["project"])
        samples = p_con.map_srm_to_name(self.examples["project"], fc_id=self.examples["flowcell"], use_bc_map=True)
        for k, v in samples.items():
            # Copy the default parameters so each sample run gets a fresh dict
            s_param = dict(parameters)
            s = s_con.get_entry(k)
            s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter})
            fc = "{}_{}".format(s["date"], s["flowcell"])
            s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
            s_param['avg_quality_score'] = s_con.calc_avg_qv(s["name"])
            s_param['rounded_read_count'] = (round(float(s_param['rounded_read_count']) / 1e6, 1)
                                             if s_param['rounded_read_count'] else None)

            if project:
                # Guard the project lookups: get_entry returns None for unknown projects
                s_param['customer_name'] = project['samples'][v["sample"]].get('customer_name', None)
                s_param['ordered_amount'] = p_con.get_ordered_amount(self.examples["project"])
                s_param['customer_reference'] = s_param.get('customer_reference', project['customer_reference'])
                s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project['uppnex_id'])
            s_param['success'] = sequencing_success(s_param, cutoffs)
            s_param.update({k: "N/A" for k in s_param if s_param[k] is None})
            make_note("{}.pdf".format(s["barcode_name"]), headers, paragraphs, **s_param)
Example #3
    def swestore(self):
        """This function is the entry point for tasks having to do with packaging and sending runs to swestore
        """
        db_info = self.app.config.get_section_dict('db')
        f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                              password=db_info.get('password'),
                                              url=db_info.get('url'))
        # Create a tarball out of the run folder
        if self.pargs.package_run:

            # We require a flowcell argument
            if not self._check_pargs(["flowcell"]):
                return

            self.pargs.tarball = package_run(self, self.config.get('archive', 'swestore_staging'), **vars(self.pargs))
            if not self.pargs.tarball:
                self.log.error("No tarball was created, exiting")
                return
            if self.pargs.clean:
                rm_run(self, self.config.get('archive', 'root'), flowcell=self.pargs.flowcell)

            if self.pargs.clean_from_staging:
                # Check that the run has been archived on the NAS before removing it, otherwise it will keep syncing
                if self.pargs.flowcell in f_conn.get_storage_status('NAS_nosync').keys():
                    rm_run(self, self.config.get('archive', 'swestore_staging'), flowcell=self.pargs.flowcell)
                else:
                    self.log.warn("Run storage status is not NAS_nosync, not removing run from swestore_staging!")

        if not self.pargs.tarball:
            self.log.error("Required argument --tarball was not specified")
            return

        if not os.path.exists(self.pargs.tarball):
            self.log.error("Tarball {} does not exist".format(self.pargs.tarball))
            return

        # Upload a tarball to a remote host
        if self.pargs.remote_upload:
            result = upload_tarball(self,
                                    **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
            if not result:
                return
            if self.pargs.clean:
                rm_tarball(self, tarball=self.pargs.tarball)

        # Send the tarball to Swestore using irods
        if self.pargs.send_to_swestore:
            result = send_to_swestore(self, **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
            if not result:
                # If archiving failed, we need to give a non-zero exit code in order for a remote instance to detect the failure
                sys.exit(1)
            if self.pargs.clean:
                rm_tarball(self, tarball=self.pargs.tarball)
            # Set the run as archived in StatusDB
            fc_db_id = f_conn.id_view.get(self.pargs.flowcell)
            f_conn.set_storage_status(fc_db_id, 'swestore_archived')
            # Log to statusdb
            if self.pargs.log_to_db:
                # implement this
                raise NotImplementedError("logging to db functionality not implemented")
Example #4
 def setUp(self):
     """FIXME: All other tests depend on data being uploaded, so
     these are not real unit tests. The setup to TestQCUpload has to
     be run prior to other tests, else unexpected failures will
     occur."""
     self.app = self.make_app(
         argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
         extensions=[
             'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
         ])
     self._run_app()
     self.app = self.make_app(
         argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
         extensions=[
             'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
         ])
     self._run_app()
     self.s_con = SampleRunMetricsConnection(dbname="samples-test",
                                             username="******",
                                             password="******")
     self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                           username="******",
                                           password="******")
     self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                                username="******",
                                                password="******")
Example #5
    def list_projects(self):
        if not self._check_pargs(["flowcell"]):
            return

        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        out_data = [[self.pargs.flowcell]]
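        # Construct the short form of the flowcell id from the run folder name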
        s = self.pargs.flowcell.split("_")
        fcid = "_".join([s[0], s[-1]])

        self.log.debug("Establishing FlowcellRunMetricsConnection")
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
        self.log.debug("Establishing ProjectSummaryConnection")
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))

        self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
        fc = fc_con.get_entry(fcid)
        if fc is None:
            self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
            return

        self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
        ssheet_data = self._get_samplesheet_sample_data(fc)
        if len(ssheet_data) == 0:
            self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
            return

        self.log.debug("Fetch runParameter data for flowcell {}".format(fcid))
        run_data = self._get_run_parameter_data(fc)
        if len(run_data) == 0:
            self.log.warn("No runParameter data for flowcell {}".format(fcid))

        out_data = [
            [self.pargs.flowcell, run_data.get("InstrumentType", "HiSeq2000"), run_data.get("RunMode", "High Output")]
        ]

        # Extract the project names
        projects = set([proj[0].replace("__", ".") for data in ssheet_data.values() for proj in data.values()])

        # Extract application for each project
        for project in projects:
            self.log.debug("Fetching project data document for project {}".format(project))
            pdoc = p_con.get_entry(project)
            if pdoc is None:
                self.log.warn("No project data document for project {}".format(project))
                pdoc = {}

            application = pdoc.get("application", "N/A")
            out_data.append([project, application])

        self.app._output_data["stdout"].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
Example #6
 def test_get_barcode_lane_statistics(self):
     """Test getting barcode lane statistics from flowcell database"""
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
     # Try getting wrong sample name, should return None
     data = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index6", "120924_AC003CCCXX", "1")
     self.assertEqual(data, (None, None))
     data = fc_con.get_barcode_lane_statistics("J.Doe_00_01", "P001_101_index3", "120924_AC003CCCXX", "1")
     self.assertEqual(data, (u'35.22', u'90.05'))
Example #7
    def upload_qc(self):
        if not self._check_pargs(['flowcell']):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get(
            "db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting"
                .format(self.pargs.flowcell))
            return

        runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell),
                                   "{}.csv".format(fc_id(self.pargs.flowcell)))
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell),
                                    "run_info.yaml")
        (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info(
                "Assuming pre-casava based file structure for {}".format(
                    fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(
                fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if len(qc_objects) == 0:
            self.log.info("No out-of-date qc objects for {}".format(
                fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(
                len(qc_objects)))

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get(
            "db", "samples"),
                                           **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get(
            "db", "flowcells"),
                                              **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get(
            "db", "projects"),
                                         **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None),
                    self.pargs.extensive_matching)
                if project_sample:
                    obj["project_sample_name"] = project_sample['sample_name']
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Example #8
 def list_projects(self):
     if not self._check_pargs(["flowcell"]):
         return
     
     url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
     if not url:
         self.app.log.warn("Please provide a valid url: got {}".format(url))
         return
     if not validate_fc_directory_format(self.pargs.flowcell):
         self.app.log.warn("Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell))
         return
     
     out_data = [[self.pargs.flowcell]]
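     # Construct the short form of the flowcell id from the run folder name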
     s = self.pargs.flowcell.split("_")
     fcid = "_".join([s[0],s[-1]])
     
     self.log.debug("Establishing FlowcellRunMetricsConnection")
     fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
     self.log.debug("Establishing ProjectSummaryConnection")
     p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
 
     self.log.debug("Fetching flowcell metric document for flowcell {}".format(fcid))
     fc = fc_con.get_entry(fcid)
     if fc is None:
         self.log.warn("No flowcell metric document for flowcell {}".format(fcid))
         return
 
     self.log.debug("Fetching csv samplesheet data for flowcell {}".format(fcid))
     ssheet_data = self._get_samplesheet_sample_data(fc)
     if len(ssheet_data) == 0:
         self.log.warn("No csv samplesheet data for flowcell {}".format(fcid))
         return
     
     self.log.debug("Fetch runParameter data for flowcell {}".format(fcid))
     run_data = self._get_run_parameter_data(fc)
     if len(run_data) == 0:
         self.log.warn("No runParameter data for flowcell {}".format(fcid))
     
     out_data = [[self.pargs.flowcell,
                  run_data.get("InstrumentType", "HiSeq2000"),
                  run_data.get("RunMode", "High Output")]]
     
     # Extract the project names
     projects = set([proj[0].replace("__",".") for data in ssheet_data.values() for proj in data.values()])
 
     # Extract application for each project
     for project in projects:
         self.log.debug("Fetching project data document for project {}".format(project))
         pdoc = p_con.get_entry(project)
         if pdoc is None:
             self.log.warn("No project data document for project {}".format(project))
             pdoc = {}
     
         application = pdoc.get("application", "N/A")
         ptype = pdoc.get("type", "Check GPL")
         out_data.append([project, application, ptype])
     
     self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
Example #9
 def test_get_barcode_lane_statistics(self):
     """Test getting barcode lane statistics from flowcell database"""
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                           username="******",
                                           password="******")
     # Try getting wrong sample name, should return None
     data = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                               "P001_101_index6",
                                               "120924_AC003CCCXX", "1")
     self.assertEqual(data, (None, None))
     data = fc_con.get_barcode_lane_statistics("J.Doe_00_01",
                                               "P001_101_index3",
                                               "120924_AC003CCCXX", "1")
     self.assertEqual(data, (u'35.22', u'90.05'))
Example #10
 def test_dbcon(self):
     """Test database connection and that we get expected values."""
     s_con = SampleRunMetricsConnection(dbname="samples-test",
                                        username="******",
                                        password="******")
     samples = [s_con.get_entry(x) for x in s_con.name_view]
     samples_d = {x["name"]: x for x in samples}
     self.assertEqual(samples_d["1_120924_AC003CCCXX_TGACCA"]["date"],
                      "120924")
     self.assertEqual(samples_d["1_121015_BB002BBBXX_TGACCA"]["flowcell"],
                      "BB002BBBXX")
     self.assertEqual(
         samples_d["2_120924_AC003CCCXX_ACAGTG"]["entity_type"],
         "sample_run_metrics")
     self.assertEqual(samples_d["3_120924_AC003CCCXX_ACAGTG"]["lane"], "3")
     self.assertEqual(samples_d["4_120924_AC003CCCXX_CGTTAA"]["sequence"],
                      "CGTTAA")
     self.assertEqual(samples_d["2_121015_BB002BBBXX_TGACCA"]["project_id"],
                      "P002")
     fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                           username="******",
                                           password="******")
     flowcells = [fc_con.get_entry(x) for x in fc_con.name_view]
     flowcells_d = {x["name"]: x for x in flowcells}
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["name"],
                      "120924_AC003CCCXX")
     self.assertEqual(flowcells_d["121015_BB002BBBXX"]["name"],
                      "121015_BB002BBBXX")
     self.assertEqual(flowcells_d["120924_AC003CCCXX"]["entity_type"],
                      "flowcell_run_metrics")
     p_con = ProjectSummaryConnection(dbname="projects-test",
                                      username="******",
                                      password="******")
     projects = [p_con.get_entry(x) for x in p_con.name_view]
     projects_d = {x["project_name"]: x for x in projects}
     self.assertEqual(
         projects_d["J.Doe_00_01"]["min_m_reads_per_sample_ordered"], 0.1)
     self.assertEqual(projects_d["J.Doe_00_01"]["no_of_samples"], 2)
     self.assertEqual(set(projects_d["J.Doe_00_01"]["samples"].keys()),
                      set(["P001_101_index3", "P001_102", "P001_103"]))
     self.assertEqual(projects_d["J.Doe_00_01"]["customer_reference"],
                      "GnuGenome")
     self.assertEqual(
         projects_d["J.Doe_00_02"]["min_m_reads_per_sample_ordered"], 0.2)
     self.assertEqual(list(projects_d["J.Doe_00_03"]["samples"].keys()),
                      ["3_index6"])
     self.assertIn(
         "A",
         projects_d["J.Doe_00_03"]["samples"]["3_index6"]["library_prep"])
Example #11
 def setUp(self):
     self.app = self.make_app(
         argv=["qc", "upload-qc", flowcells[0], "--mtime", "10000"],
         extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
     )
     self._run_app()
     self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
Example #12
    def upload_qc(self):
        if not self._check_pargs(["flowcell"]):
            return
        url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        runinfo_csv = os.path.join(os.path.abspath(self.pargs.flowcell), "{}.csv".format(fc_id(self.pargs.flowcell)))
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
        (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
            qc_objects = self._collect_casava_qc()

        if len(qc_objects) == 0:
            self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
            return
        else:
            self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching
                )
                if project_sample:
                    obj["project_sample_name"] = project_sample["sample_name"]
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Example #13
 def setUp(self):
     """FIXME: All other tests depend on data being uploaded, so
     these are not real unit tests. The setup to TestQCUpload has to
     be run prior to other tests, else unexpected failures will
     occur."""
     self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
                              extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
     self._run_app()
     self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
                              extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
     self._run_app()
     self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
     self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
     self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")
Example #14
def _project_status_note_table(project_name=None,
                               username=None,
                               password=None,
                               url=None,
                               use_ps_map=True,
                               use_bc_map=False,
                               check_consistency=False,
                               ordered_million_reads=None,
                               uppnex_id=None,
                               customer_reference=None,
                               exclude_sample_ids={},
                               project_alias=None,
                               sample_aliases={},
                               projectdb="projects",
                               samplesdb="samples",
                               flowcelldb="flowcells",
                               include_all_samples=False,
                               param={},
                               **kw):
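    """Build the table data for a project status note.

    Connects to the samples, flowcells and projects databases, maps
    sample runs to project samples, and returns a tuple
    (output_data, sample_table, param).
    """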

    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Get the information source for this project
    source = p_con.get_info_source(project_name)

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Determine if project is finished by getting all samples sequenced date
    try:
        all_samples_sequenced = prj_summary['project_summary'][
            'all_samples_sequenced']
    except (TypeError, KeyError):
        all_samples_sequenced = False

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get(
        "ordered_amount",
        p_con.get_ordered_amount(project_name, samples=sample_dict))

    if not param.get('customer_reference'):
        try:
            param['customer_reference'] = prj_summary['details'][
                'customer_project_reference']
        except (TypeError, KeyError):
            param['customer_reference'] = prj_summary.get('customer_reference')
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    # Start collecting the data
    sample_table = []
    samples_excluded = []
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(
                            k, v["id"], ",".join(
                                list(
                                    set(last_library_preps[
                                        v['sample']].values()))), v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample, source)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq, ordered_million_reads,
                                            param)
            sample_table.append([vals[k] for k in table_keys])
    if all_samples_sequenced:
        param["finished"] = 'All samples for this project have been sequenced.'
    sample_table.sort()
    sample_table = list(sample_table
                        for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(
        0,
        ['ScilifeID', 'SubmittedID', 'BarcodeSeq', 'MSequenced', 'MOrdered'])

    return output_data, sample_table, param
Example #15
class TestQCUpload(PmFullTest):
    def setUp(self):
        """FIXME: All other tests depend on data being uploaded, so
        these are not real unit tests. The setup to TestQCUpload has to
        be run prior to other tests, else unexpected failures will
        occur."""
        self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P002_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server. Slightly circular testing
        here - I setup the module with qc update so by definition the
        test must 'work'"""
        self.app = self.make_app(argv=['qc', 'upload-qc', flowcells[1], '--mtime', '100'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")
        
    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"]= None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(argv=['qc', 'update', '--sample_prj', projects[2], '--project_id', 'P003', '--debug', '--force'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {'P001_101_index3': 'P001_101_index3', 'P001_102_index6': 'P001_102'}
        self.app = self.make_app(argv=['qc', 'update', '--sample_prj', projects[0], '--names', "{}".format(sample_map), '--debug', '--force'],
                                 extensions=['scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'])
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
Example #16
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       is_paired=True,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    instrument = _parse_instrument_config(
        os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]]
             for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = ("onboard clustering"
                                        if runp.get("ClusteringChoice", "") == "OnBoardClustering"
                                        or s_param["sequencing_platform"] == "MiSeq"
                                        else "cBot")
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(
            runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(
                runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(
                runp.get("ApplicationName"), runp.get("ApplicationVersion"),
                s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn(
                "Could not determine run setup for flowcell {}. Will assume paired-end."
                .format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score,
         pct_q30_bases) = fc_con.get_barcode_lane_statistics(
             project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Setting average quality failed for sample {}, id {}".format(
                    s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn(
                "Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount',
            p_con.get_ordered_amount(project_name,
                                     samples=p_con.get_entry(
                                         project_name, 'samples')))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(
                project_sample_item, source)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

            # Always normalize the submitted id, since the texttable module does not support unicode
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize(
                    'NFKD',
                    s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names "
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP' to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {s["name"]: s["barcode_name"]
             for s in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    rest_notes = make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes, "{}_{}_{}_sample_summary.pdf".format(project_name,
                                                    s.get("date", None),
                                                    s.get("flowcell", None)))
    return output_data
Example #17
  def multiplex_qc(self):
      
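      # QC cutoffs used below: expected lane yield in reads, allowed PhiX
      # error-rate window, minimum %>=Q30 per sample and the maximum
      # tolerated count for an undemultiplexed index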
      MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
      EXPECTED_LANE_YIELD = 143000000
      MAX_PHIX_ERROR_RATE = 2.0
      MIN_PHIX_ERROR_RATE = 0.0
      MIN_GTQ30 = 80.0
      read_pairs = True
      
      out_data = []
      
      if not self._check_pargs(['flowcell']):
          return
      url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
      if not url:
          self.app.log.warn("Please provide a valid url: got {}".format(url))
          return
      
      # Construct the short form of the fcid
      sp = os.path.basename(self.pargs.flowcell).split("_")
      fcid = "_".join([sp[0],sp[-1]])
      
      # Get a connection to the flowcell database and fetch the corresponding document
      self.log.debug("Connecting to flowcell database".format(fcid))
      fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
      self.log.debug("Fetching run metrics entry for flowcell {}".format(fcid))
      fc_doc = fc_con.get_entry(fcid)
      if not fc_doc:
          self.log.warn("Could not fetch run metrics entry for flowcell {}".format(fcid))
          return
 
      # Adjust the read pairs variable according to the run setup
      read_pairs = fc_con.is_paired_end(fcid) 
      
      # Get the yield per sample from the Demultiplex_Stats
      self.log.debug("Getting yield for flowcell {}".format(fcid))
      sample_yield = self._get_yield_per_sample(fc_doc, read_pairs)
      
      # Get the yield per lane from the Demultiplex_Stats
      self.log.debug("Getting lane yield for flowcell {}".format(fcid))
      lane_yield = self._get_yield_per_lane(fc_doc, read_pairs)
      lanes = lane_yield.keys()
      
      # Get the number of samples in the pools from the Demultiplex_Stats
      self.log.debug("Getting lane pool sizes for flowcell {}".format(fcid))
      pool_size = self._get_pool_size(fc_doc)
      
      # Get the sample information from the csv samplesheet
      self.log.debug("Getting csv samplesheet data for flowcell {}".format(fcid))
      ssheet_samples = self._get_samplesheet_sample_data(fc_doc)
      if len(ssheet_samples) == 0: 
          self.log.warn("No samplesheet data available for flowcell {}".format(fcid))
      
      # Verify that all samples in samplesheet have reported metrics
      for id in ssheet_samples.keys():
          for key in ssheet_samples[id].keys():
              lane, index = key.split("_")
              project = ssheet_samples[id][key][0]
              if id not in sample_yield or \
              key not in sample_yield[id]: 
                  self.log.warn("Sample {} from project {} is in samplesheet but no yield was reported in " \
                                "Demultiplex_Stats.htm for lane {} and index {}".format(id,
                                                                                        project,
                                                                                        lane,
                                                                                        index))
                  continue
              sample_yield[id][key].append('verified')
      
      # Check that all samples in Demultiplex_Stats have entries in Samplesheet
      for id in sample_yield.keys():
          for key in sample_yield[id].keys():
              lane, index = key.split("_")
              if "verified" not in sample_yield[id][key] and \
              index != "Undetermined":
                  self.log.warn("Sample {} from project {}, with index {} on lane {} is in Demultiplex_Stats " \
                                "but no corresponding entry is present in SampleSheet".format(id,
                                                                                              sample_yield[id][key][1],
                                                                                              index,
                                                                                              lane))
                      
      # Check the PhiX error rate for each lane
      self.log.debug("Getting PhiX error rates for flowcell {}".format(fcid))
      for lane in lanes:
          status = "N/A"
          err_rate = fc_con.get_phix_error_rate(fcid, lane)
          if err_rate < 0:
              self.log.warn("Could not get PhiX error rate for lane {} on flowcell {}".format(lane,fcid))
          elif err_rate <= MIN_PHIX_ERROR_RATE or err_rate > MAX_PHIX_ERROR_RATE:
              status = "FAIL"
          else:
              status = "PASS"
          out_data.append([status,
                           "PhiX error rate",
                           lane,
                           err_rate,
                           "{} < PhiX e (%) <= {}".format(MIN_PHIX_ERROR_RATE,
                                                          MAX_PHIX_ERROR_RATE)])
      
      # Check the %>=Q30 value for each sample
      sample_quality = self._get_quality_per_sample(fc_doc)
      for id in sample_quality.keys():
          for key in sample_quality[id].keys():
              lane, index = key.split("_")
              status = "FAIL"
              if float(sample_quality[id][key][0]) >= MIN_GTQ30:
                  status = "PASS"
              out_data.append([status,"Sample quality",lane,sample_quality[id][key][2],id,sample_quality[id][key][0],"[%>=Q30 >= {}%]".format(MIN_GTQ30)])
              
      # Check that each lane received the minimum amount of reads
      for lane, reads in lane_yield.items():
          status = "FAIL"
          if reads >= EXPECTED_LANE_YIELD:
              status = "PASS"
          out_data.append([status,"Lane yield",lane,reads,"[Yield >= {}]".format(EXPECTED_LANE_YIELD)])
              
      # Check that all samples in the pool have received a minimum number of reads
      for id in sample_yield.keys():
          for key in sample_yield[id].keys():
              lane, index = key.split("_")
              if index == "Undetermined":
                  continue
              
              status = "FAIL"
              mplx_min = int(0.5*EXPECTED_LANE_YIELD/pool_size[lane])
              if sample_yield[id][key][0] >= mplx_min:
                  status = "PASS"
              out_data.append([status,"Sample yield",lane,sample_yield[id][key][1],id,sample_yield[id][key][0],"[Yield >= {}]".format(mplx_min)])
      
      # Check that the number of undetermined reads in each lane is below 10% of the total yield for the lane
      for lane, reads in lane_yield.items():
          status = "FAIL"
          key = "_".join([lane,"Undetermined"])
          undetermined = sum([counts.get(key,[0])[0] for counts in sample_yield.values()])
          cutoff = 0.1*reads
          if undetermined < cutoff:
              status = "PASS"
          out_data.append([status,"Index read",lane,undetermined,"[Undetermined < {}]".format(cutoff)])
      
      # Check that no overrepresented index sequence exists in undemultiplexed output
      self.log.debug("Fetching undemultiplexed barcode data for flowcell {}".format(fcid))
      undemux_data = self._get_undetermined_index_counts(fc_doc)
      if len(undemux_data) == 0:
          self.log.warn("No undemultiplexed barcode data available for flowcell {}".format(fcid))
      
      for lane, counts in undemux_data.items():
          mplx_min = int(min(MAX_UNDEMULTIPLEXED_INDEX_COUNT,
                             0.5*EXPECTED_LANE_YIELD/max(1,pool_size[lane])))
          status = "N/A"
          if len(counts) > 0:
              for i in range(len(counts)):
                  status = "FAIL"
                  if int(counts[i][0]) < mplx_min:
                      status = "PASS"
                  out_data.append([status,"Index",lane,counts[i][1],counts[i][2],counts[i][0],"[Undetermined index < {}]".format(mplx_min)])
          else:
              out_data.append([status,"Index",lane,"","",mplx_min,"-"])
                  
      self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
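
For reference, a minimal sketch of the data shapes the QC loops above appear to assume; the exact layout of each entry is inferred from the lookups and is an assumption, not confirmed by the source:

# Hypothetical shapes and values, for illustration only:
# sample_yield: sample id -> {"<lane>_<index>": [read_count, project, ...]}
# lane_yield:   lane -> total read count for the lane
# pool_size:    lane -> number of samples pooled on the lane
sample_yield = {"P001_101": {"1_ACAGTG": [3500000, "J.Doe_00_01"]}}
lane_yield = {"1": 150000000}
pool_size = {"1": 8}
lane, index = "1_ACAGTG".split("_")  # keys decompose into lane and index
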
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None, bc_count=None,
                       project_alias=[], projectdb="projects", samplesdb="samples", flowcelldb="flowcells",
                       phix=None, is_paired=True, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff" : 2.0,
        "qv_cutoff" : 30,
        }

    instrument = _parse_instrument_config(os.path.expanduser(kw.get("instrument_config","")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters
    parameters = {
        "project_name" : None,
        "start_date" : None,
        "FC_id" : None,
        "scilifelab_name" : None,
        "rounded_read_count" : None,
        "phix_error_rate" : None,
        "avg_quality_score" : None,
        "pct_q30_bases" : None,
        "success" : None,
        "run_mode":None,
        "is_paired":True
        }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {"project_name":"sample_prj", "FC_id":"flowcell",
                        "scilifelab_name":"barcode_name", "start_date":"date",
                        "rounded_read_count":"bc_count", "lane": "lane"}

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")".format(flowcell) )
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key:s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters",{})
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice","") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode","High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"),s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"),runp.get("ApplicationVersion"),s_param["sequencing_software"])
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount',
                                                p_con.get_ordered_amount(project_name,
                                                                         samples=p_con.get_entry(project_name,'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]) )
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)

            # Always normalize submitted id, since the texttable module does not support unicode
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')
        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k:"N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] ==  "" or s_param[k] == -1.0})
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs':{s["name"]:s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
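
The sequencing_success helper called above is imported from the scilifelab report utilities and is not shown in this example. A minimal sketch of the check it presumably performs against the cutoffs defined above (an assumption, not the project's actual code):

def sequencing_success_sketch(s_param, cutoffs):
    """Hypothetical stand-in for sequencing_success(): flag a sample run as
    successful when both the PhiX error rate and the average quality score
    pass the cutoffs. A phix_error_rate of -1 means N/A and fails the check."""
    phix_ok = (s_param.get("phix_error_rate") is not None
               and 0 <= s_param["phix_error_rate"] <= cutoffs["phix_err_cutoff"])
    qv_ok = (s_param.get("avg_quality_score") is not None
             and s_param["avg_quality_score"] >= cutoffs["qv_cutoff"])
    return phix_ok and qv_ok
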
Example #19
0
  def multiplex_qc(self):
      
      MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
      EXPECTED_LANE_YIELD = 143000000
      MAX_PHIX_ERROR_RATE = 2.0
      MIN_PHIX_ERROR_RATE = 0.0
      MIN_GTQ30 = 80.0
      read_pairs = True
      
      out_data = []
      
      if not self._check_pargs(['flowcell']):
          return
      url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
      if not url:
          self.app.log.warn("Please provide a valid url: got {}".format(url))
          return
      
      # Construct the short form of the fcid
      sp = os.path.basename(self.pargs.flowcell).split("_")
      fcid = "_".join([sp[0],sp[-1]])
      
      # Get a connection to the flowcell database and fetch the corresponding document
      self.log.debug("Connecting to flowcell database".format(fcid))
      fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
      self.log.debug("Fetching run metrics entry for flowcell {}".format(fcid))
      fc_doc = fc_con.get_entry(fcid)
      if not fc_doc:
          self.log.warn("Could not fetch run metrics entry for flowcell {}".format(fcid))
          return
 
      # Get the yield per sample from the Demultiplex_Stats
      self.log.debug("Getting yield for flowcell {}".format(fcid))
      sample_yield = self._get_yield_per_sample(fc_doc, read_pairs)
      
      # Get the yield per lane from the Demultiplex_Stats
      self.log.debug("Getting lane yield for flowcell {}".format(fcid))
      lane_yield = self._get_yield_per_lane(fc_doc, read_pairs)
      lanes = lane_yield.keys()
      
      # Get the number of samples in the pools from the Demultiplex_Stats
      self.log.debug("Getting lane pool sizes for flowcell {}".format(fcid))
      pool_size = self._get_pool_size(fc_doc)
      
      # Get the sample information from the csv samplesheet
      self.log.debug("Getting csv samplesheet data for flowcell {}".format(fcid))
      ssheet_samples = self._get_samplesheet_sample_data(fc_doc)
      if len(ssheet_samples) == 0: 
          self.log.warn("No samplesheet data available for flowcell {}".format(fcid))
      
      # Verify that all samples in samplesheet have reported metrics
      for id in ssheet_samples.keys():
          for key in ssheet_samples[id].keys():
              lane, index = key.split("_")
              project = ssheet_samples[id][key][0]
              if id not in sample_yield or \
              key not in sample_yield[id]: 
                  self.log.warn("Sample {} from project {} is in samplesheet but no yield was reported in " \
                                "Demultiplex_Stats.htm for lane {} and index {}".format(id,
                                                                                        project,
                                                                                        lane,
                                                                                        index))
                  continue
              sample_yield[id][key].append('verified')
      
      # Check that all samples in Demultiplex_Stats have entries in Samplesheet
      for id in sample_yield.keys():
          for key in sample_yield[id].keys():
              lane, index = key.split("_")
              if "verified" not in sample_yield[id][key] and \
              index != "Undetermined":
                  self.log.warn("Sample {} from project {}, with index {} on lane {} is in Demultiplex_Stats " \
                                "but no corresponding entry is present in SampleSheet".format(id,
                                                                                              sample_yield[id][key][1],
                                                                                              index,
                                                                                              lane))
                      
      # Check the PhiX error rate for each lane
      self.log.debug("Getting PhiX error rates for flowcell {}".format(fcid))
      for lane in lanes:
          status = "N/A"
          err_rate = fc_con.get_phix_error_rate(fcid,lane)
          if err_rate < 0:
              self.log.warn("Could not get PhiX error rate for lane {} on flowcell {}".format(lane,fcid))
          elif err_rate <= MIN_PHIX_ERROR_RATE or err_rate > MAX_PHIX_ERROR_RATE:
              status = "FAIL"
          else:
              status = "PASS"
          out_data.append([status,
                           "PhiX error rate",
                           lane,
                           err_rate,
                           "{} < PhiX e (%) <= {}".format(MIN_PHIX_ERROR_RATE,
                                                          MAX_PHIX_ERROR_RATE)])
      
      # Check the %>=Q30 value for each sample
      sample_quality = self._get_quality_per_sample(fc_doc)
      for id in sample_quality.keys():
          for key in sample_quality[id].keys():
              lane, index = key.split("_")
              status = "FAIL"
              if float(sample_quality[id][key][0]) >= MIN_GTQ30:
                  status = "PASS"
              out_data.append([status,"Sample quality",lane,sample_quality[id][key][2],id,sample_quality[id][key][0],"[%>=Q30 >= {}%]".format(MIN_GTQ30)])
              
      # Check that each lane received the minimum amount of reads
      for lane, reads in lane_yield.items():
          status = "FAIL"
          if reads >= EXPECTED_LANE_YIELD:
              status = "PASS"
          out_data.append([status,"Lane yield",lane,reads,"[Yield >= {}]".format(EXPECTED_LANE_YIELD)])
              
      # Check that all samples in the pool have received a minimum number of reads
      for id in sample_yield.keys():
          for key in sample_yield[id].keys():
              lane, index = key.split("_")
              if index == "Undetermined":
                  continue
              
              status = "FAIL"
              mplx_min = int(0.5*EXPECTED_LANE_YIELD/pool_size[lane])
              if sample_yield[id][key][0] >= mplx_min:
                  status = "PASS"
              out_data.append([status,"Sample yield",lane,sample_yield[id][key][1],id,sample_yield[id][key][0],"[Yield >= {}]".format(mplx_min)])
      
      # Check that the number of undetermined reads in each lane is below 10% of the total yield for the lane
      for lane, reads in lane_yield.items():
          status = "FAIL"
          key = "_".join([lane,"Undetermined"])
          undetermined = sum([counts.get(key,[0])[0] for counts in sample_yield.values()])
          cutoff = 0.1*reads
          if undetermined < cutoff:
              status = "PASS"
          out_data.append([status,"Index read",lane,undetermined,"[Undetermined < {}]".format(cutoff)])
      
      # Check that no overrepresented index sequence exists in undemultiplexed output
      self.log.debug("Fetching undemultiplexed barcode data for flowcell {}".format(fcid))
      undemux_data = self._get_undetermined_index_counts(fc_doc)
      if len(undemux_data) == 0:
          self.log.warn("No undemultiplexed barcode data available for flowcell {}".format(fcid))
      
      for lane, counts in undemux_data.items():
          mplx_min = int(min(MAX_UNDEMULTIPLEXED_INDEX_COUNT,
                             0.5*EXPECTED_LANE_YIELD/max(1,pool_size[lane])))
          status = "N/A"
          if len(counts) > 0:
              for i in range(len(counts)):
                  status = "FAIL"
                  if int(counts[i][0]) < mplx_min:
                      status = "PASS"
                  out_data.append([status,"Index",lane,counts[i][1],counts[i][2],counts[i][0],"[Undetermined index < {}]".format(mplx_min)])
          else:
              out_data.append([status,"Index",lane,"","",mplx_min,"-"])
                  
      self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
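
A quick worked example of the threshold arithmetic used above (the pool size is assumed for illustration):

# Hypothetical numbers: an 8-sample pool on a lane with the expected yield.
MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
EXPECTED_LANE_YIELD = 143000000
pool = 8
mplx_min = int(min(MAX_UNDEMULTIPLEXED_INDEX_COUNT,
                   0.5 * EXPECTED_LANE_YIELD / max(1, pool)))
print(mplx_min)  # 1000000 -- the cap applies for any pool smaller than ~72 samples
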
Example #20
0
    def raw_data(self):
        if not self._check_pargs(["project"]):
            return

        # if necessary, reformat flowcell identifier
        if self.pargs.flowcell:
            self.pargs.flowcell = self.pargs.flowcell.split("_")[-1]

        # get the uid and gid to use for destination files
        uid = os.getuid()
        gid = os.getgid()
        if self.pargs.group is not None and len(self.pargs.group) > 0:
            gid = grp.getgrnam(self.pargs.group).gr_gid

        self.log.debug("Connecting to project database")
        p_con = ProjectSummaryConnection(**vars(self.pargs))
        assert p_con, "Could not get connection to project database"
        self.log.debug("Connecting to flowcell database")
        f_con = FlowcellRunMetricsConnection(**vars(self.pargs))
        assert f_con, "Could not get connection to flowcell database"
        self.log.debug("Connecting to x_flowcell database")
        x_con = X_FlowcellRunMetricsConnection(**vars(self.pargs))
        assert x_con, "Could not get connection to x_flowcell database"

        # Fetch the Uppnex project to deliver to
        if not self.pargs.uppmax_project:
            self.pargs.uppmax_project = p_con.get_entry(
                self.pargs.project, "uppnex_id")
            if not self.pargs.uppmax_project:
                self.log.error(
                    "Uppmax project was not specified and could not be fetched from project database"
                )
                return

        # Setup paths and verify parameters
        self._meta.production_root = self.pargs.root if self.pargs.root else self.app.config.get(
            "production", "root")
        self._meta.root_path = self._meta.production_root
        proj_base_dir = os.path.join(self._meta.root_path, self.pargs.project)
        assert os.path.exists(
            self._meta.production_root
        ), "No such directory {}; check your production config".format(
            self._meta.production_root)
        assert os.path.exists(
            proj_base_dir), "No project {} in production path {}".format(
                self.pargs.project, self._meta.root_path)

        try:
            self._meta.uppnex_project_root = self.app.config.get(
                "deliver", "uppnex_project_root")
        except Exception as e:
            self.log.warn(
                "{}, will use '/proj' as uppnext_project_root".format(e))
            self._meta.uppnex_project_root = '/proj'

        try:
            self._meta.uppnex_delivery_dir = self.app.config.get(
                "deliver", "uppnex_project_delivery_path")
        except Exception as e:
            self.log.warn(
                "{}, will use 'INBOX' as uppnext_project_delivery_path".format(
                    e))
            self._meta.uppnex_delivery_dir = 'INBOX'

        destination_root = os.path.join(self._meta.uppnex_project_root,
                                        self.pargs.uppmax_project,
                                        self._meta.uppnex_delivery_dir)
        assert os.path.exists(
            destination_root
        ), "Delivery destination folder {} does not exist".format(
            destination_root)
        destination_root = os.path.join(destination_root, self.pargs.project)

        # Find uncompressed fastq
        uncompressed = self._find_uncompressed_fastq_files(
            proj_base_dir=proj_base_dir,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)
        if len(uncompressed) > 0:
            self.log.error(
                "There are uncompressed fastq file for project, kindly check all files are compressed properly before delivery"
            )
            return

        # Extract the list of samples and runs associated with the project and sort them
        samples = self.samples_to_copy(
            pid=p_con.get_entry(self.pargs.project, "project_id"),
            pod=p_con.get_entry(self.pargs.project, "open_date"),
            fc_dict={
                'HiSeq2500': f_con.proj_list,
                'HiSeqX': x_con.proj_list
            },
            proj_base_dir=proj_base_dir,
            destination_root=destination_root,
            sample=self.pargs.sample,
            flowcell=self.pargs.flowcell)

        # If interactive, let the user choose which samples to deliver
        if self.pargs.interactive:
            to_process = {}
            for sample in samples:
                if query_yes_no("Deliver sample {} ?".format(sample),
                                default="no"):
                    to_process[sample] = samples[sample]
            samples = to_process

        if self.pargs.sample:
            sample = samples.get(self.pargs.sample)
            if not sample:
                self.log.error(
                    "There is no such sample {} for project {}".format(
                        self.pargs.sample, self.pargs.project))
                return
            samples = {self.pargs.sample: sample}

        self.log.info(
            "Will deliver data for {} samples from project {} to {}".format(
                len(samples), self.pargs.project, destination_root))
        if not query_yes_no("Continue?"):
            return

        # Make sure that transfer will be with rsync
        if not self.pargs.rsync:
            self.log.warn("Files must be transferred using rsync")
            if not query_yes_no(
                    "Do you wish to continue delivering using rsync?",
                    default="yes"):
                return
            self.pargs.rsync = True

        # Process each sample
        for sample, flowcells in samples.iteritems():
            for fc, files in flowcells.iteritems():
                self.log.info("Processing sample {} and flowcell {}".format(
                    sample, fc))

                # transfer files
                self.log.debug("Transferring {} fastq files".format(
                    len(files['src'])))
                self._transfer_files(sources=files['src'],
                                     targets=files['dst'])

                passed = True
                if self.pargs.link or self.pargs.dry_run:
                    passed = False
                else:
                    # calculate md5sums on the source side and write them at the destination
                    md5 = []
                    for s, d in zip(files['src'], files['dst']):
                        m = md5sum(s)
                        mfile = "{}.md5".format(d)
                        md5.append([m, mfile, s])
                        self.log.debug("md5sum for source file {}: {}".format(
                            s, m))

                    # write the md5sum to a file at the destination and verify the transfer
                    for m, mfile, srcpath in md5:
                        dstfile = os.path.splitext(mfile)[0]
                        self.log.debug(
                            "Writing md5sum to file {}".format(mfile))
                        self.app.cmd.write(
                            mfile, "{}  {}".format(m,
                                                   os.path.basename(dstfile)),
                            True)
                        self.log.debug(
                            "Verifying md5sum for file {}".format(dstfile))
                        dm = md5sum(dstfile)
                        self.log.debug(
                            "md5sum for destination file {}: {}".format(
                                dstfile, dm))
                        if m != dm:
                            self.log.warn(
                                "md5sum verification FAILED for {}. Source: {}, Target: {}"
                                .format(dstfile, m, dm))
                            self.log.warn(
                                "Improperly transferred file {} is removed from destination, please retry transfer of this file"
                                .format(dstfile))
                            self.app.cmd.safe_unlink(dstfile)
                            self.app.cmd.safe_unlink(mfile)
                            passed = False
                            continue

                        # Modify the permissions to ug+rw
                        for f in [dstfile, mfile]:
                            self.app.cmd.chmod(
                                f, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                                | stat.S_IWGRP)

                # touch the flag to trigger uppmax inbox permission fix
                self.app.cmd.safe_touchfile(
                    os.path.join("/sw", "uppmax", "var", "inboxfix",
                                 "schedule", self.pargs.uppmax_project))

                # log the transfer to statusdb if verification passed
                if passed:
                    data = {
                        'raw_data_delivery': {
                            'timestamp': utc_time(),
                            'files': {
                                os.path.splitext(
                                    (os.path.basename(srcpath)))[0]:
                                {
                                    'md5':
                                    m,
                                    'path':
                                    os.path.splitext(mfile)[0],
                                    'size_in_bytes':
                                    self._getsize(os.path.splitext(mfile)[0]),
                                    'source_location':
                                    srcpath
                                }
                                for m, mfile, srcpath in md5
                            }
                        }
                    }
                    jsonstr = json.dumps(data)
                    jsonfile = os.path.join(
                        proj_base_dir, sample, fc,
                        "{}_{}_raw_data_delivery.json".format(sample, fc))
                    self.log.debug(
                        "Writing delivery to json file {}".format(jsonfile))
                    self.app.cmd.write(jsonfile, data=jsonstr, overwrite=True)
                    self.log.debug(
                        "Saving delivery in StatusDB document {}".format(id))
                    if self.proj_flowcells[fc]['type'] == 'HiSeqX':
                        fc_con = x_con
                    else:
                        fc_con = f_con
                    fc_obj = fc_con.get_entry(fc)
                    self.log.info(
                        "Logging delivery to StatusDB document {}".format(
                            fc_obj.get('_id')))
                    fc_raw_data = fc_obj.get('raw_data_delivery', {})
                    fc_raw_data.update(data['raw_data_delivery'])
                    fc_obj['raw_data_delivery'] = fc_raw_data
                    self._save(fc_con, fc_obj)
                    self.log.debug(jsonstr)
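
The md5sum helper used above comes from the scilifelab utilities and is not shown here. A minimal chunked implementation would look roughly like this (a sketch under that assumption, not the project's actual code):

import hashlib

def md5sum_sketch(path, chunk_size=1048576):
    """Hypothetical stand-in for md5sum(): hash a file in fixed-size chunks
    so that large fastq files never have to fit in memory."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
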
Example #21
0
    def test_2_make_project_note(self):
        """Make a project note subset by flowcell and project"""
        s_con = SampleRunMetricsConnection(username=self.user,
                                           password=self.pw,
                                           url=self.url)
        fc_con = FlowcellRunMetricsConnection(username=self.user,
                                              password=self.pw,
                                              url=self.url)
        p_con = ProjectSummaryConnection(username=self.user,
                                         password=self.pw,
                                         url=self.url)
        paragraphs = project_note_paragraphs()
        headers = project_note_headers()
        param = parameters
        project = p_con.get_entry(self.examples["project"])
        if not project:
            print "No project named {}".format(self.examples["project"])
            return
        ordered_amount = p_con.get_ordered_amount(self.examples["project"])

        ## Start collecting the data
        sample_table = []
        sample_list = project['samples']
        param.update({
            key: project.get(ps_to_parameter[key], None)
            for key in ps_to_parameter.keys()
        })
        samples = p_con.map_name_to_srm(self.examples["project"],
                                        check_consistency=True,
                                        use_bc_map=True)
        all_passed = True
        for k, v in samples.items():
            if k == "Unexpected":
                continue
            project_sample = sample_list[k]
            vals = {
                x: project_sample.get(prjs_to_table[x], None)
                for x in prjs_to_table.keys()
            }
            vals['MOrdered'] = ordered_amount
            vals['BarcodeSeq'] = s_con.get_entry(v.keys()[0], "sequence")

            ## Set status
            vals['Status'] = set_status(
                vals) if vals['Status'] is None else vals['Status']
            vals.update({k: "N/A" for k in vals.keys() if vals[k] is None})
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])
        if all_passed: param["finished"] = 'Project finished.'
        sample_table.sort()
        sample_table = list(
            sample_table
            for sample_table, _ in itertools.groupby(sample_table))
        sample_table.insert(0, [
            'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
            'Status'
        ])
        paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
        make_note("{}.pdf".format(self.examples["project"]), headers,
                  paragraphs, **param)
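
The set_status helper used above is not shown in this example, and its real decision rule is not visible here. A loudly hypothetical sketch, assuming status is derived by comparing sequenced against ordered million reads:

def set_status_sketch(vals):
    """Hypothetical stand-in for set_status(): mark a sample as passed ("P")
    when the sequenced million reads reach the ordered amount, "NP" otherwise.
    The actual rule used by the project may differ."""
    try:
        return "P" if float(vals["MSequenced"]) >= float(vals["MOrdered"]) else "NP"
    except (KeyError, TypeError, ValueError):
        return "N/A"
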
Example #22
0
    def storage_cleanup(self):
        storage_conf = self.app.config.get_section_dict('storage')
        db_info = self.app.config.get_section_dict('db')
        f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                              password=db_info.get('password'),
                                              url=db_info.get('url'))
        servers = [server for server in storage_conf.keys()]
        server = platform.node().split('.')[0].lower()
        if server in servers:
            self.app.log.info(
                "Performing cleanup on production server \"{}\"...".format(
                    server))
            dirs = [d.lstrip() for d in storage_conf.get(server).split(',')]

            #Collect old runs (> 30 days in nosync folder) to remove
            old_runs = []
            for d in dirs:
                nosync_dir = os.path.join(d, 'nosync')
                for fc in glob.iglob(os.path.join(nosync_dir, '1*')):
                    if os.path.isdir(fc):
                        fc_name = os.path.basename(fc)
                        #Check that there is no flag file indicating that the run should not be removed
                        if not os.path.exists(os.path.join(
                                fc, 'no_remove.txt')):
                            stats = os.stat(os.path.join(
                                fc, 'RTAComplete.txt'))
                            mod_time = datetime.now() - datetime.fromtimestamp(
                                stats.st_mtime)
                            if mod_time.days >= 30:
                                old_runs.append(fc)
                        else:
                            self.app.log.warn(
                                "no_remove.txt file found in {}, skipping run".
                                format(fc_name))

            #NAS servers
            if 'nas' in server:
                #Collect newly finished runs
                fc_list = []
                for d in dirs:
                    for fc in glob.glob(os.path.join(d, '1*')):
                        if os.path.exists(os.path.join(fc, 'RTAComplete.txt')):
                            fc_list.append(fc)

                #Move to nosync
                retries = 5
                for fc in fc_list:
                    fc_name = os.path.basename(fc)
                    while retries:
                        if 'Finished' in last_lines(
                                storage_conf.get('lsyncd_log'), 1)[0]:
                            break
                        retries -= 1
                        time.sleep(3)
                    if retries:
                        self.app.log.info("lsyncd process seems to be up to speed, and run {} " \
                                "is finished, moving it to nosync".format(fc_name))
                        shutil.move(
                            fc, os.path.join(os.path.dirname(fc), 'nosync'))
                        #Touch the RTAComplete.txt file so that the modification date is the date
                        #when the run was moved to nosync
                        try:
                            open(
                                os.path.join(os.path.dirname(fc), 'nosync',
                                             os.path.basename(fc),
                                             'RTAComplete.txt'), 'w').close()
                        except IOError:
                            self.app.log.warn("No RTAComplete.txt file was found for run {}." \
                                    " Please check".format(os.path.basename(fc_name)))
                        fc_db_id = f_conn.id_view.get(fc_name)
                        if fc_db_id:
                            f_conn.set_storage_status(fc_db_id, 'NAS_nosync')
                        else:
                            self.app.log.warn(
                                "Flowcell {} not found in the database, not changing status."
                                .format(fc_name))
                    else:
                        self.app.log.warn("lsyncd process doesn't seem to be finished. " \
                                "Skipping run {}".format(os.path.basename(fc)))

                #Remove old runs
                for fc in old_runs:
                    fc_name = os.path.basename(fc)
                    #Check that the run has been archived in swestore before removing permanently
                    if fc_name in f_conn.get_storage_status(
                            'swestore_archived').keys():
                        self.app.log.info("Run {} has been in nosync for more than 30 days " \
                            "and is archived in swestore. Permanently removing it from the NAS".format(fc_name))
                        shutil.rmtree(fc)
                    else:
                        self.app.log.warn("Run {} has been in nosync for more than 30 " \
                            "days, but has not yet been archived in swestore. " \
                            "Not removing, please check it".format(fc_name))

            #Processing servers (b5)
            else:
                #Collect finished runs
                fc_list = []
                for d in dirs:
                    for fc in glob.glob(os.path.join(d, '1*')):
                        if os.path.exists(
                                os.path.join(
                                    fc,
                                    'second_read_processing_completed.txt')):
                            fc_list.append(fc)

                #Move to nosync
                for fc in fc_list:
                    fc_name = os.path.basename(fc)
                    self.app.log.info(
                        "Moving run {} to nosync".format(fc_name))
                    shutil.move(fc, os.path.join(os.path.dirname(fc),
                                                 'nosync'))

                #Remove old runs
                for fc in old_runs:
                    fc_name = os.path.basename(fc)
                    self.app.log.info("Run {} has been in nosync for more than 30 " \
                        "days, permanently removing it from {}".format(fc_name, server))
                    shutil.rmtree(fc)
        else:
            self.app.log.warn("You're running the cleanup functionality in {}. But this " \
                    "server doen't seem to be on your pm.conf file. Are you on the correct server?".format(server))
Example #23
0
def project_status_note(project_name=None,
                        username=None,
                        password=None,
                        url=None,
                        use_ps_map=True,
                        use_bc_map=False,
                        check_consistency=False,
                        ordered_million_reads=None,
                        uppnex_id=None,
                        customer_reference=None,
                        exclude_sample_ids={},
                        project_alias=None,
                        sample_aliases={},
                        projectdb="projects",
                        samplesdb="samples",
                        flowcelldb="flowcells",
                        include_all_samples=False,
                        **kw):
    """Make a project status note. Used keywords:

    :param project_name: project name
    :param username: db username
    :param password: db password
    :param url: db url
    :param use_ps_map: use project summary mapping
    :param use_bc_map: use project to barcode name mapping
    :param check_consistency: check consistency between mappings
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param exclude_sample_ids: exclude some sample ids from project note
    :param project_alias: project alias name
    :param sample_aliases: sample alias names
    :param projectdb: project db name
    :param samplesdb: samples db name
    :param flowcelldb: flowcells db name
    :param include_all_samples: include all samples in report
    """
    # parameters
    parameters = {
        "project_name": project_name,
        "finished": "Not finished, or cannot yet assess if finished.",
    }
    # mapping project_summary to parameter keys
    ps_to_parameter = {
        "scilife_name": "scilife_name",
        "customer_name": "customer_name",
        "project_name": "project_name"
    }
    # mapping project sample to table
    table_keys = [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ]

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set report paragraphs
    paragraphs = project_note_paragraphs()
    headers = project_note_headers()
    # Set local param variable
    param = parameters

    # Get project summary from project database
    sample_aliases = _literal_eval_option(sample_aliases, default={})
    prj_summary = p_con.get_entry(project_name)
    if not prj_summary:
        LOG.warn("No such project '{}'".format(project_name))
        return
    LOG.debug("Working on project '{}'.".format(project_name))

    # Get sample run list and loop samples to make mapping sample -> {sampleruns}
    sample_run_list = _set_sample_run_list(project_name,
                                           flowcell=None,
                                           project_alias=project_alias,
                                           s_con=s_con)
    samples = {}
    for s in sample_run_list:
        prj_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if prj_sample:
            sample_name = prj_sample['project_sample'].get(
                "scilife_name", None)
            s_d = {s["name"]: {'sample': sample_name, 'id': s["_id"]}}
            samples.update(s_d)
        else:
            if s["barcode_name"] in sample_aliases:
                s_d = {
                    sample_aliases[s["barcode_name"]]: {
                        'sample': sample_aliases[s["barcode_name"]],
                        'id': s["_id"]
                    }
                }
                samples.update(s_d)
            else:
                s_d = {
                    s["name"]: {
                        'sample': s["name"],
                        'id': s["_id"],
                        'barcode_name': s["barcode_name"]
                    }
                }
                LOG.warn(
                    "No mapping found for sample run:\n  '{}'".format(s_d))

    # Convert to mapping from desired sample name to list of aliases
    # Less important for the moment; one solution is to update the
    # Google docs summary table to use the P names
    sample_dict = prj_summary['samples']
    param.update({
        key: prj_summary.get(ps_to_parameter[key], None)
        for key in ps_to_parameter.keys()
    })
    param["ordered_amount"] = param.get("ordered_amount",
                                        p_con.get_ordered_amount(project_name))
    param['customer_reference'] = param.get(
        'customer_reference', prj_summary.get('customer_reference'))
    param['uppnex_project_id'] = param.get('uppnex_project_id',
                                           prj_summary.get('uppnex_id'))

    # Override database values if options passed at command line
    if uppnex_id:
        param["uppnex_project_id"] = uppnex_id
    if customer_reference:
        param["customer_reference"] = customer_reference

    # Process options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    exclude_sample_ids = _literal_eval_option(exclude_sample_ids, default={})

    ## Start collecting the data
    sample_table = []
    samples_excluded = []
    all_passed = True
    last_library_preps = p_con.get_latest_library_prep(project_name)
    last_library_preps_srm = [
        x for l in last_library_preps.values() for x in l
    ]
    LOG.debug(
        "Looping through sample map that maps project sample names to sample run metrics ids"
    )
    for k, v in samples.items():
        LOG.debug("project sample '{}' maps to '{}'".format(k, v))
        if not include_all_samples:
            if v['sample'] not in last_library_preps.keys():
                LOG.info(
                    "No library prep information for sample {}; keeping in report"
                    .format(v['sample']))
            else:
                if k not in last_library_preps_srm:
                    LOG.info(
                        "Sample run {} ('{}') is not latest library prep ({}) for project sample {}: excluding from report"
                        .format(k, v["id"],
                                last_library_preps[v['sample']].values()[0],
                                v['sample']))
                    continue
        else:
            pass

        if re.search("Unexpected", k):
            continue
        barcode_seq = s_con.get_entry(k, "sequence")
        # Exclude sample id?
        if _exclude_sample_id(exclude_sample_ids, v['sample'], barcode_seq):
            samples_excluded.append(v['sample'])
            continue
        # Get the project sample name from the sample run and set table values
        project_sample = sample_dict[v['sample']]
        vals = _set_sample_table_values(v['sample'], project_sample,
                                        barcode_seq, ordered_million_reads,
                                        param)
        if vals['Status'] == "N/A" or vals['Status'] == "NP":
            all_passed = False
        sample_table.append([vals[k] for k in table_keys])

    # Loop through samples in sample_dict for which there is no sample run information
    samples_in_table_or_excluded = list(set([x[0] for x in sample_table
                                             ])) + samples_excluded
    samples_not_in_table = list(
        set(sample_dict.keys()) - set(samples_in_table_or_excluded))
    for sample in samples_not_in_table:
        if re.search("Unexpected", sample):
            continue
        project_sample = sample_dict[sample]
        # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
        project_sample_d = _set_project_sample_dict(project_sample)
        if project_sample_d:
            for k, v in project_sample_d.iteritems():
                barcode_seq = s_con.get_entry(k, "sequence")
                vals = _set_sample_table_values(sample, project_sample,
                                                barcode_seq,
                                                ordered_million_reads, param)
                if vals['Status'] == "N/A" or vals['Status'] == "NP":
                    all_passed = False
                sample_table.append([vals[k] for k in table_keys])
        else:
            barcode_seq = None
            vals = _set_sample_table_values(sample, project_sample,
                                            barcode_seq, ordered_million_reads,
                                            param)
            if vals['Status'] == "N/A" or vals['Status'] == "NP":
                all_passed = False
            sample_table.append([vals[k] for k in table_keys])
    if all_passed: param["finished"] = 'Project finished.'
    sample_table.sort()
    sample_table = list(sample_table
                        for sample_table, _ in itertools.groupby(sample_table))
    sample_table.insert(0, [
        'ScilifeID', 'CustomerID', 'BarcodeSeq', 'MSequenced', 'MOrdered',
        'Status'
    ])
    paragraphs["Samples"]["tpl"] = make_sample_table(sample_table)
    make_note("{}_project_summary.pdf".format(project_name), headers,
              paragraphs, **param)
    make_rest_note("{}_project_summary.rst".format(project_name),
                   sample_table=sample_table,
                   report="project_report",
                   **param)
    param.update(
        {k: "N/A"
         for k in param.keys() if param[k] is None or param[k] == ""})
    output_data["debug"].write(
        json.dumps({
            'param': param,
            'table': sample_table
        }))
    return output_data
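
_literal_eval_option, used several times above to parse command-line options, is not shown in this example. A minimal sketch of what it presumably does (an assumption, not the project's actual code):

import ast

def _literal_eval_option_sketch(option, default=None):
    """Hypothetical stand-in for _literal_eval_option(): accept either an
    already-parsed value or a string such as "{'P001_101': 0.1}" passed on
    the command line, falling back to a default when nothing was given."""
    if option is None:
        return default
    if isinstance(option, str):  # the real helper may also handle unicode
        return ast.literal_eval(option)
    return option
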
Example #24
0
class TestQCUpload(PmFullTest):
    def setUp(self):
        self.app = self.make_app(
            argv=["qc", "upload-qc", flowcells[0], "--mtime", "10000"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test", username="******", password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test", username="******", password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test", username="******", password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"], "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"], "P001_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server"""
        self.app = self.make_app(
            argv=["qc", "upload-qc", flowcells[1], "--mtime", "100"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(
            argv=["qc", "update", "--sample_prj", projects[2], "--project_id", "P003", "--debug", "--force"],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {"P001_101_index3": "P001_101_index3", "P001_102_index6": "P001_102"}
        self.app = self.make_app(
            argv=[
                "qc",
                "update",
                "--sample_prj",
                projects[0],
                "--names",
                "{}".format(sample_map),
                "--debug",
                "--force",
            ],
            extensions=["scilifelab.pm.ext.ext_qc", "scilifelab.pm.ext.ext_couchdb"],
        )
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")
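
Note how the test passes the name map on the command line as "{}".format(sample_map). This works because str.format on a dict of strings yields a valid Python literal, which the qc update command is assumed to parse back; the _literal_eval_option helper in the sample_status_note example below suggests ast.literal_eval is used for this. A hedged round-trip sketch:

import ast

# Round-trip of the --names option value; ast.literal_eval is an assumption
# about the parsing side, based on the _literal_eval_option helper seen below
sample_map = {'P001_101_index3': 'P001_101_index3', 'P001_102_index6': 'P001_102'}
serialised = "{}".format(sample_map)
assert ast.literal_eval(serialised) == sample_map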
Example #25
def sample_status_note(project_name=None,
                       flowcell=None,
                       username=None,
                       password=None,
                       url=None,
                       ordered_million_reads=None,
                       uppnex_id=None,
                       customer_reference=None,
                       bc_count=None,
                       project_alias=[],
                       projectdb="projects",
                       samplesdb="samples",
                       flowcelldb="flowcells",
                       phix=None,
                       **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    """
    # Cutoffs
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }

    # parameters
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "success": None,
        "run_mode": None,
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count"
    }

    LOG.debug("got parameters {}".format(parameters))
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn(
            "Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9]+XX\")"
            .format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run
    s_con = SampleRunMetricsConnection(dbname=samplesdb,
                                       username=username,
                                       password=password,
                                       url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb,
                                          username=username,
                                          password=password,
                                          url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb,
                                     username=username,
                                     password=password,
                                     url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell,
                                           project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn(
            "No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?"
            .format(project_name, flowcell))
        return output_data

    # Set options
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    for s in sample_run_list:
        s_param = {}
        LOG.debug(
            "working on sample '{}', sample run metrics name '{}', id '{}'".
            format(s.get("barcode_name", None), s.get("name", None),
                   s.get("_id", None)))
        s_param.update(parameters)
        s_param.update(
            {key: s[srm_to_parameter[key]]
             for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        try:
            s_param.update(instrument[fc_con.get_instrument(str(fc))])
        except Exception:
            LOG.warn(
                "Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report."
                .format(fc))
            s_param.update(instrument['default'])
        # Get run mode
        s_param["run_mode"] = fc_con.get_run_mode(str(fc))
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(
            str(fc), s["lane"])
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        s_param['avg_quality_score'] = calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn(
                "Calculation of average quality failed for sample {}, id {}".
                format(s.get("name"), s.get("_id")))

        # Compare phix error and qv to cutoffs
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write(
            "{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(
                s["barcode_name"], s["lane"], s_param["phix_error_rate"],
                err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaining sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get(
            'ordered_amount', p_con.get_ordered_amount(project_name))
        s_param['customer_reference'] = s_param.get(
            'customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id',
                                                   project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(
                s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                _get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(
                s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(
            project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug(
                "project sample run metrics mapping found: '{}' : '{}'".format(
                    s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item)
            if not project_sample_d:
                LOG.warn(
                    "No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}"
                    .format(s["name"], s["barcode_name"], s["_id"],
                            project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn(
                    "no such sample run metrics '{}' in project sample run metrics dictionary"
                    .format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug(
                        "project sample run metrics mapping found: '{}' : '{}'"
                        .format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn(
                        "inconsistent mapping for '{}': '{}' != '{}' (project summary id)"
                        .format(s["name"], s["_id"],
                                project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get(
                "customer_name", None)

        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn(
                "No project sample name found for sample run name '{}'".format(
                    s["barcode_name"]))
            LOG.info(
                "Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names"
            )
            LOG.info(
                "or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP' to update project sample names."
            )
            LOG.info("Please refer to the pm documentation for examples.")
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({
            k: "N/A"
            for k in s_param.keys()
            if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0
        })
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                               s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"],
                                            s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files
    output_data["debug"].write(
        json.dumps({
            's_param': s_param_out,
            'sample_runs':
            {s["name"]: s["barcode_name"]
             for s in sample_run_list}
        }))
    notes = [
        make_note(headers=headers, paragraphs=paragraphs, **sp)
        for sp in s_param_out
    ]
    # Note that `s` is the last sample from the loop above; its date and
    # flowcell fields name the combined summary files
    rest_notes = make_sample_rest_notes(
        "{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None),
                                             s.get("flowcell", None)),
        s_param_out)
    concatenate_notes(
        notes, "{}_{}_{}_sample_summary.pdf".format(project_name,
                                                    s.get("date", None),
                                                    s.get("flowcell", None)))
    return output_data
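
The function bails out early when _assert_flowcell_format rejects the flowcell argument. A hedged sketch of what that check might look like, derived only from the "[A-Z0-9]+XX" format hint in the warning message (the real helper may differ):

import re

def _assert_flowcell_format(flowcell):
    # Sketch: accept ids that end in the "[A-Z0-9]+XX" pattern quoted in the
    # warning above; None and non-matching strings are rejected
    if flowcell is None:
        return False
    return re.match(r"[A-Z0-9]+XX$", str(flowcell)) is not None

assert _assert_flowcell_format("AC003CCCXX")
assert not _assert_flowcell_format("not-a-flowcell")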
Example #26
    def storage_cleanup(self):
        storage_conf = self.app.config.get_section_dict('storage')
        db_info = self.app.config.get_section_dict('db')
        f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                              password=db_info.get('password'),
                                              url=db_info.get('url'))
        servers = list(storage_conf.keys())
        server = platform.node().split('.')[0].lower()
        if server in servers:
            self.app.log.info("Performing cleanup on production server \"{}\"...".format(server))
            dirs = [d.lstrip() for d in storage_conf.get(server).split(',')]

            #Collect old runs (> 30 days in nosync folder) to remove
            old_runs = []
            for d in dirs:
                nosync_dir = os.path.join(d, 'nosync')
                for fc in glob.iglob(os.path.join(nosync_dir, '1*')):
                    if os.path.isdir(fc):
                        fc_name = os.path.basename(fc)
                        #Check that there is no marker file telling us not to remove the run
                        if not os.path.exists(os.path.join(fc, 'no_remove.txt')):
                            stats = os.stat(os.path.join(fc, 'RTAComplete.txt'))
                            mod_time = datetime.now() - datetime.fromtimestamp(stats.st_mtime)
                            if mod_time.days >= 30:
                                old_runs.append(fc)
                        else:
                            self.app.log.warn("no_remove.txt file found in {}, skipping run".format(fc_name))

            #NAS servers
            if 'nas' in server:
                #Collect newly finished runs
                fc_list = []
                for d in dirs:
                    for fc in glob.glob(os.path.join(d, '1*')):
                        if os.path.exists(os.path.join(fc, 'RTAComplete.txt')):
                            fc_list.append(fc)

                #Move to nosync
                for fc in fc_list:
                    fc_name = os.path.basename(fc)
                    #Reset the retry counter for each run so one slow sync
                    #does not starve the runs that follow
                    retries = 5
                    while retries:
                        if 'Finished' in last_lines(storage_conf.get('lsyncd_log'), 1)[0]:
                            break
                        retries -= 1
                        time.sleep(3)
                    if retries:
                        self.app.log.info("lsyncd process seems to be up to speed, and run {} " \
                                "is finished, moving it to nosync".format(fc_name))
                        shutil.move(fc, os.path.join(os.path.dirname(fc), 'nosync'))
                        #Touch the RTAComplete.txt file so that the modification date is the
                        #date when it was moved to nosync (note: 'w' mode truncates the file)
                        try:
                            open(os.path.join(os.path.dirname(fc), 'nosync', os.path.basename(fc), 'RTAComplete.txt'), 'w').close()
                        except IOError:
                            self.app.log.warn("No RTAComplete.txt file was found for run {}." \
                                    " Please check".format(os.path.basename(fc_name)))
                        fc_db_id = f_conn.id_view.get(fc_name)
                        if fc_db_id:
                            f_conn.set_storage_status(fc_db_id, 'NAS_nosync')
                        else:
                            self.app.log.warn("Flowcell {} not found in the database, not changing status.".format(fc_name))
                    else:
                        self.app.log.warn("lsyncd process doesn't seem to be finished. " \
                                "Skipping run {}".format(os.path.basename(fc)))

                #Remove old runs
                for fc in old_runs:
                    fc_name = os.path.basename(fc)
                    #Check that the run has been archived in swestore before removing permanently
                    if fc_name in f_conn.get_storage_status('swestore_archived').keys():
                        self.app.log.info("Run {} has been in nosync for more than 30 days " \
                            "and is archived in swestore. Permanently removing it from the NAS".format(fc_name))
                        shutil.rmtree(fc)
                    else:
                        self.app.log.warn("Run {} has been in nosync for more than 30 " \
                            "days, but has not yet been archived in swestore. " \
                            "Not removing, please check it".format(fc_name))

            #Processing servers (b5)
            else:
                #Collect finished runs
                fc_list = []
                for d in dirs:
                    for fc in glob.glob(os.path.join(d, '1*')):
                        if os.path.exists(os.path.join(fc, 'second_read_processing_completed.txt')):
                            fc_list.append(fc)

                #Move to nosync
                for fc in fc_list:
                    fc_name = os.path.basename(fc)
                    self.app.log.info("Moving run {} to nosync".format(fc_name))
                    shutil.move(fc, os.path.join(os.path.dirname(fc), 'nosync'))

                #Remove old runs
                for fc in old_runs:
                    fc_name = os.path.basename(fc)
                    self.app.log.info("Run {} has been in nosync for more than 30 " \
                        "days, permanently removing it from {}".format(fc_name, server))
                    shutil.rmtree(fc)
        else:
            self.app.log.warn("You're running the cleanup functionality in {}. But this " \
                    "server doen't seem to be on your pm.conf file. Are you on the correct server?".format(server))
Example #28
    def swestore(self):
        """This function is the entry point for tasks having to do with packaging and sending runs to swestore
        """
        db_info = self.app.config.get_section_dict('db')
        f_conn = FlowcellRunMetricsConnection(username=db_info.get('user'),
                                              password=db_info.get('password'),
                                              url=db_info.get('url'))
        swestore_paths = set(self.config.get('archive','swestore_staging').split(','))
        run = self.pargs.tarball if self.pargs.tarball else self.pargs.flowcell
        swestore_dir = get_path_swestore_staging(run, swestore_paths)
        # Create a tarball out of the run folder
        if self.pargs.package_run:

            # We require a flowcell argument
            if not self._check_pargs(["flowcell"]):
                return

            self.pargs.tarball = package_run(self, swestore_dir, **vars(self.pargs))
            if not self.pargs.tarball:
                self.log.error("No tarball was created, exiting")
                return
            if self.pargs.clean:
                rm_run(self,self.config.get('archive','root'), flowcell=self.pargs.flowcell)

            if self.pargs.clean_from_staging:
                #Check that the run has been archived on the NAS before removing it, otherwise it will keep synching
                if self.pargs.flowcell in f_conn.get_storage_status('NAS_nosync').keys():
                    rm_run(self, swestore_dir, flowcell=self.pargs.flowcell)
                else:
                    self.log.warn("Run storage status is not NAS_nosync, not removing run from swestore_stage!")

        if not self.pargs.tarball:
            self.log.error("Required argument --tarball was not specified")
            return

        if not os.path.exists(os.path.join(swestore_dir, self.pargs.tarball)):
            self.log.error("Tarball {} does not exist".format(self.pargs.tarball))
            return

        # Upload a tarball to a remote host
        if self.pargs.remote_upload:
            result = upload_tarball(self,
                                    **dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
            if not result:
                return
            if self.pargs.clean:
                rm_tarball(self,tarball=self.pargs.tarball)

        # Send the tarball to Swestore using irods
        if self.pargs.send_to_swestore:
            result = send_to_swestore(self,**dict(self.config.get_section_dict('archive').items() + vars(self.pargs).items()))
            if not result:
                # If archiving failed, we need to give a non-zero exit code in order for a remote instance to detect the failure
                sys.exit(1)
            if self.pargs.clean:
                rm_tarball(self,tarball=self.pargs.tarball)
            #Set the run as archived in StatusDB
            fc_id = self.pargs.flowcell if self.pargs.flowcell else self.pargs.tarball.split('.')[0]
            fc_db_id = f_conn.id_view.get(fc_id)
            if fc_db_id:
                f_conn.set_storage_status(fc_db_id, 'swestore_archived')
            else:
                self.log.warn("Flowcell {} not found in the database, not changing status.".format(fc_id))
            # Log to statusdb
            if self.pargs.log_to_db:
                # implement this
                raise NotImplementedError("logging to db functionality not implemented")
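
The dict(...items() + ...items()) merges above are Python 2 idioms: on Python 3, dict.items() returns a view and cannot be concatenated with +. A portable equivalent that keeps the same precedence (parsed command-line arguments overriding the archive config), shown with hypothetical values:

# Hypothetical stand-ins for the archive config section and parsed CLI args
archive_conf = {'swestore_staging': '/srv/staging', 'clean': False}
cli_args = {'clean': True, 'tarball': '120924_AC003CCCXX.tar.bz2'}

merged = dict(archive_conf, **cli_args)  # cli_args wins on key collisions
assert merged['clean'] is True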
Example #29
class TestQCUpload(PmFullTest):
    def setUp(self):
        """FIXME: All other tests depend on data being uploaded, so
        these are not real unit tests. The setup to TestQCUpload has to
        be run prior to other tests, else unexpected failures will
        occur."""
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[0], '--mtime', '10000'],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[1], '--mtime', '10000'],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        self.s_con = SampleRunMetricsConnection(dbname="samples-test",
                                                username="******",
                                                password="******")
        self.p_con = ProjectSummaryConnection(dbname="projects-test",
                                              username="******",
                                              password="******")
        self.fc_con = FlowcellRunMetricsConnection(dbname="flowcells-test",
                                                   username="******",
                                                   password="******")

    def test_samplesheet(self):
        """Test samplesheet upload"""
        fc = self.fc_con.get_entry("120924_AC003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][0]["Index"], "TGACCA")
        self.assertEqual(fc["samplesheet_csv"][0]["Description"],
                         "J__Doe_00_01")
        self.assertEqual(fc["samplesheet_csv"][0]["FCID"], "C003CCCXX")
        self.assertEqual(fc["samplesheet_csv"][1]["SampleRef"], "hg19")
        self.assertEqual(fc["samplesheet_csv"][2]["SampleID"],
                         "P002_101_index3")

    def test_qc_upload(self):
        """Test running qc upload to server. Slightly circular testing
        here - I setup the module with qc update so by definition the
        test must 'work'"""
        self.app = self.make_app(
            argv=['qc', 'upload-qc', flowcells[1], '--mtime', '100'],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertIsNone(s["project_sample_name"])
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update(self):
        """Test running qc update of a project id"""
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        s["project_id"] = None
        self.assertIsNone(s["project_id"])
        self.s_con.save(s)
        self.app = self.make_app(
            argv=[
                'qc', 'update', '--sample_prj', projects[2], '--project_id',
                'P003', '--debug', '--force'
            ],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        s = self.s_con.get_entry("4_120924_AC003CCCXX_CGTTAA")
        self.assertEqual(s["project_id"], "P003")

    def test_qc_update_sample_names(self):
        """Test running qc update of project sample names"""
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        s1["project_sample_name"] = None
        s2["project_sample_name"] = None
        self.assertIsNone(s1["project_sample_name"])
        self.assertIsNone(s2["project_sample_name"])
        self.s_con.save(s1)
        self.s_con.save(s2)
        sample_map = {
            'P001_101_index3': 'P001_101_index3',
            'P001_102_index6': 'P001_102'
        }
        self.app = self.make_app(
            argv=[
                'qc', 'update', '--sample_prj', projects[0], '--names',
                "{}".format(sample_map), '--debug', '--force'
            ],
            extensions=[
                'scilifelab.pm.ext.ext_qc', 'scilifelab.pm.ext.ext_couchdb'
            ])
        self._run_app()
        s1 = self.s_con.get_entry("1_120924_AC003CCCXX_TGACCA")
        s2 = self.s_con.get_entry("2_120924_AC003CCCXX_ACAGTG")
        self.assertEqual(s1["project_sample_name"], "P001_101_index3")
        self.assertEqual(s2["project_sample_name"], "P001_102")