Пример #1
0
 def _collect_casava_qc(self):
     """Collect QC documents for a casava-structured flowcell.

     The samplesheet is read from ``<fc_id>.csv`` inside the flowcell
     directory under ``self._meta.root_path``; if that file is missing,
     ``SampleSheet.csv`` in the same directory is used instead.  When
     ``modified_within_days`` accepts the flowcell directory for
     ``self.pargs.mtime``, a ``FlowcellRunMetricsDocument`` is filled in
     from ``FlowcellRunMetricsParser`` and queued.  The samplesheet rows
     are passed on to ``self._parse_samplesheet`` in either case.

     Returns:
         The value returned by ``self._parse_samplesheet``.

     Raises:
         IOError: if the samplesheet cannot be opened (the error is
             logged and then re-raised).
     """
     flowcell = self.pargs.flowcell
     flowcell_root = os.path.join(self._meta.root_path, flowcell)
     samplesheet = os.path.join(flowcell_root, "{}.csv".format(fc_id(flowcell)))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
         samplesheet = os.path.join(flowcell_root, "SampleSheet.csv")

     try:
         with open(samplesheet) as handle:
             rows = list(csv.reader(handle))
     except IOError as err:
         self.app.log.warn(str(err))
         raise err

     flowcell_dir = os.path.join(os.path.abspath(self._meta.root_path), flowcell)
     fc_date, fc_name = fc_parts(flowcell)

     collected = []
     # Stays None when the flowcell-level document is not rebuilt below.
     demux_stats = None
     if modified_within_days(flowcell_dir, self.pargs.mtime):
         ids = {"fc_date": fc_date, "fc_name": fc_name}
         parser = FlowcellRunMetricsParser(flowcell_dir)
         fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
         fcobj["RunInfo"] = parser.parseRunInfo(**ids)
         fcobj["RunParameters"] = parser.parseRunParameters(**ids)
         fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **ids)
         fcobj["bc_metrics"] = parser.parse_bc_metrics(**ids)
         fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**ids)
         # The demultiplex stats are stored under the "illumina" entry.
         fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**ids)})
         fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=samplesheet, **ids)
         demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
         collected.append(fcobj)

     return self._parse_samplesheet(
         rows, collected, fc_date, fc_name, flowcell_dir, demultiplex_stats=demux_stats
     )
Пример #2
0
 def _collect_casava_qc(self):
     """Collect QC documents for a casava-structured flowcell.

     Reads the samplesheet (``<fc_id>.csv`` in the flowcell directory
     under ``self._meta.root_path``, falling back to ``SampleSheet.csv``)
     and, if ``modified_within_days`` accepts the flowcell directory for
     ``self.pargs.mtime``, builds a ``FlowcellRunMetricsDocument`` from
     ``FlowcellRunMetricsParser``.  The samplesheet rows are then passed
     to ``self._parse_samplesheet``.

     Returns:
         The value returned by ``self._parse_samplesheet``.

     Raises:
         IOError: if the samplesheet cannot be opened (the error is
             logged and then re-raised).
     """
     flowcell_root = os.path.join(self._meta.root_path, self.pargs.flowcell)
     runinfo_csv = os.path.join(flowcell_root, "{}.csv".format(fc_id(self.pargs.flowcell)))
     if not os.path.exists(runinfo_csv):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
         runinfo_csv = os.path.join(flowcell_root, "SampleSheet.csv")
     try:
         with open(runinfo_csv) as fh:
             runinfo = list(csv.reader(fh))
     except IOError as e:
         self.app.log.warn(str(e))
         # Bare raise keeps the original traceback intact.
         raise
     fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
     (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
     qc_objects = []
     # Default used when the flowcell document is not rebuilt; without it
     # the call below referenced ``fcobj`` before assignment whenever the
     # modification-time check failed.
     demux_stats = None
     ## Check modification time
     if modified_within_days(fcdir, self.pargs.mtime):
         fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
         parser = FlowcellRunMetricsParser(fcdir)
         fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
         fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
         fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
         fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
         fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
         fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
         fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**fc_kw)})
         fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
         demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
         qc_objects.append(fcobj)
     qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name, fcdir, demultiplex_stats=demux_stats)
     return qc_objects
Пример #3
0
    def upload_qc(self):
        """Collect QC documents for a flowcell and save them.

        Returns early (after logging) when the ``flowcell`` argument is
        missing, when no url is available from ``self.pargs.url`` or the
        ``db``/``url`` config entry, when the flowcell path fails
        ``validate_fc_directory_format``, or when no QC objects were
        collected.  Flowcells whose date part is below 120815 are read
        with ``_collect_pre_casava_qc``; all others with
        ``_collect_casava_qc``.

        Flowcell documents are saved through a
        ``FlowcellRunMetricsConnection``; sample documents through a
        ``SampleRunMetricsConnection`` after ``project_sample_name`` has
        been filled in from the project database when a match is found.
        """
        if not self._check_pargs(['flowcell']):
            return
        url = self.pargs.url or self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting"
                .format(self.pargs.flowcell))
            return

        # Computed once and reused in every log message below.
        flowcell_id = fc_id(self.pargs.flowcell)
        fc_date, _ = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info(
                "Assuming pre-casava based file structure for {}".format(flowcell_id))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info(
                "Assuming casava based file structure for {}".format(flowcell_id))
            qc_objects = self._collect_casava_qc()

        if not qc_objects:
            self.log.info("No out-of-date qc objects for {}".format(flowcell_id))
            return
        self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

        connection_kw = vars(self.app.pargs)
        s_con = SampleRunMetricsConnection(
            dbname=self.app.config.get("db", "samples"), **connection_kw)
        fc_con = FlowcellRunMetricsConnection(
            dbname=self.app.config.get("db", "flowcells"), **connection_kw)
        p_con = ProjectSummaryConnection(
            dbname=self.app.config.get("db", "projects"), **connection_kw)
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            # NOTE(review): ``save(obj)`` is evaluated before ``dry`` is
            # entered, so ``dry`` receives its result, not a callable --
            # confirm this matches the contract of ``dry``.
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None),
                    self.pargs.extensive_matching)
                if project_sample:
                    obj["project_sample_name"] = project_sample['sample_name']
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Пример #4
0
 def _collect_pre_casava_qc(self):
     """Collect QC documents for a pre-casava flowcell.

     Run information is read from ``<fc_id>.csv`` in the flowcell
     directory under ``self._meta.root_path``, then from
     ``SampleSheet.csv`` in the same directory, and finally from
     ``run_info.yaml`` under the absolute flowcell path when neither csv
     file exists.

     Returns:
         An empty list when ``modified_within_days`` rejects the flowcell
         directory for ``self.pargs.mtime``; otherwise the value returned
         by ``self._parse_samplesheet``.

     Raises:
         IOError: if the chosen run-info file cannot be opened (the error
             is logged and then re-raised).
     """
     flowcell = self.pargs.flowcell
     flowcell_root = os.path.join(self._meta.root_path, flowcell)
     samplesheet = os.path.join(flowcell_root, "{}.csv".format(fc_id(flowcell)))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(
             samplesheet))
         samplesheet = os.path.join(flowcell_root, "SampleSheet.csv")
     yaml_path = os.path.join(os.path.abspath(flowcell), "run_info.yaml")

     # Fall back to the yaml description only when no csv file is present.
     as_yaml = not os.path.exists(samplesheet)
     try:
         if as_yaml:
             with open(yaml_path) as handle:
                 # NOTE(review): yaml.load without an explicit Loader is
                 # unsafe on untrusted input and newer PyYAML releases
                 # reject the call; consider yaml.safe_load once it is
                 # confirmed that run_info.yaml holds plain data only.
                 run_rows = yaml.load(handle)
         else:
             with open(samplesheet) as handle:
                 run_rows = list(csv.reader(handle))
     except IOError as err:
         self.app.log.warn(str(err))
         raise err

     flowcell_dir = os.path.abspath(flowcell)
     fc_date, fc_name = fc_parts(flowcell)

     collected = []
     # Nothing to report when the modification-time check fails.
     if not modified_within_days(flowcell_dir, self.pargs.mtime):
         return collected

     ids = {"fc_date": fc_date, "fc_name": fc_name}
     parser = FlowcellRunMetricsParser(flowcell_dir)
     fcobj = FlowcellRunMetricsDocument(**ids)
     fcobj["RunInfo"] = parser.parseRunInfo(**ids)
     fcobj["RunParameters"] = parser.parseRunParameters(**ids)
     fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **ids)
     fcobj["bc_metrics"] = parser.parse_bc_metrics(**ids)
     fcobj["filter_metrics"] = parser.parse_filter_metrics(**ids)
     fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(
         runinfo_csv=samplesheet, **ids)
     fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**ids)
     collected.append(fcobj)

     return self._parse_samplesheet(
         run_rows, collected, fc_date, fc_name, flowcell_dir, as_yaml=as_yaml)
Пример #5
0
    def upload_qc(self):
        """Collect QC documents for a flowcell and save them.

        Returns early (after logging) when the ``flowcell`` argument is
        missing, when no url is available from ``self.pargs.url`` or the
        ``db``/``url`` config entry, when the flowcell path fails
        ``validate_fc_directory_format``, or when no QC objects were
        collected.  Flowcells whose date part is below 120815 are read
        with ``_collect_pre_casava_qc``; all others with
        ``_collect_casava_qc``.

        Flowcell documents are saved through a
        ``FlowcellRunMetricsConnection``; sample documents through a
        ``SampleRunMetricsConnection`` after ``project_sample_name`` has
        been filled in from the project database when a match is found.
        """
        if not self._check_pargs(["flowcell"]):
            return
        url = self.pargs.url or self.app.config.get("db", "url")
        if not url:
            self.app.log.warn("Please provide a valid url: got {}".format(url))
            return
        if not validate_fc_directory_format(self.pargs.flowcell):
            self.app.log.warn(
                "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
            )
            return

        # Computed once and reused in every log message below.
        flowcell_id = fc_id(self.pargs.flowcell)
        fc_date, _ = fc_parts(self.pargs.flowcell)
        if int(fc_date) < 120815:
            self.log.info("Assuming pre-casava based file structure for {}".format(flowcell_id))
            qc_objects = self._collect_pre_casava_qc()
        else:
            self.log.info("Assuming casava based file structure for {}".format(flowcell_id))
            qc_objects = self._collect_casava_qc()

        if not qc_objects:
            self.log.info("No out-of-date qc objects for {}".format(flowcell_id))
            return
        self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))

        connection_kw = vars(self.app.pargs)
        s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **connection_kw)
        fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **connection_kw)
        p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **connection_kw)
        for obj in qc_objects:
            if self.app.pargs.debug:
                self.log.debug("{}: {}".format(str(obj), obj["_id"]))
            # NOTE(review): ``save(obj)`` is evaluated before ``dry`` is
            # entered, so ``dry`` receives its result, not a callable --
            # confirm this matches the contract of ``dry``.
            if isinstance(obj, FlowcellRunMetricsDocument):
                dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
            if isinstance(obj, SampleRunMetricsDocument):
                project_sample = p_con.get_project_sample(
                    obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching
                )
                if project_sample:
                    obj["project_sample_name"] = project_sample["sample_name"]
                dry("Saving object {}".format(repr(obj)), s_con.save(obj))
Пример #6
0
 def _collect_pre_casava_qc(self):
     """Gather QC documents for a flowcell with pre-casava layout.

     The run description comes from the first source that exists:
     ``<fc_id>.csv`` or ``SampleSheet.csv`` in the flowcell directory
     under ``self._meta.root_path``, else ``run_info.yaml`` under the
     absolute flowcell path.

     Returns:
         An empty list when ``modified_within_days`` rejects the flowcell
         directory for ``self.pargs.mtime``; otherwise the value returned
         by ``self._parse_samplesheet``.

     Raises:
         IOError: if the chosen run-info file cannot be opened (the error
             is logged and then re-raised).
     """
     fc_arg = self.pargs.flowcell
     base_dir = os.path.join(self._meta.root_path, fc_arg)
     csv_path = os.path.join(base_dir, "{}.csv".format(fc_id(fc_arg)))
     if not os.path.exists(csv_path):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(csv_path))
         csv_path = os.path.join(base_dir, "SampleSheet.csv")
     yaml_path = os.path.join(os.path.abspath(fc_arg), "run_info.yaml")

     # The yaml file is only consulted when no csv samplesheet exists.
     as_yaml = not os.path.exists(csv_path)
     try:
         if as_yaml:
             with open(yaml_path) as stream:
                 # NOTE(review): yaml.load without an explicit Loader is
                 # unsafe on untrusted input and newer PyYAML releases
                 # reject the call; consider yaml.safe_load once it is
                 # confirmed that run_info.yaml holds plain data only.
                 entries = yaml.load(stream)
         else:
             with open(csv_path) as stream:
                 entries = list(csv.reader(stream))
     except IOError as exc:
         self.app.log.warn(str(exc))
         raise exc

     fc_path = os.path.abspath(fc_arg)
     fc_date, fc_name = fc_parts(fc_arg)

     documents = []
     # Skip flowcells that fail the modification-time check.
     if not modified_within_days(fc_path, self.pargs.mtime):
         return documents

     keys = {"fc_date": fc_date, "fc_name": fc_name}
     metrics = FlowcellRunMetricsParser(fc_path)
     fcobj = FlowcellRunMetricsDocument(**keys)
     fcobj["RunInfo"] = metrics.parseRunInfo(**keys)
     fcobj["RunParameters"] = metrics.parseRunParameters(**keys)
     fcobj["illumina"] = metrics.parse_illumina_metrics(fullRTA=False, **keys)
     fcobj["bc_metrics"] = metrics.parse_bc_metrics(**keys)
     fcobj["filter_metrics"] = metrics.parse_filter_metrics(**keys)
     fcobj["samplesheet_csv"] = metrics.parse_samplesheet_csv(runinfo_csv=csv_path, **keys)
     fcobj["run_info_yaml"] = metrics.parse_run_info_yaml(**keys)
     documents.append(fcobj)

     return self._parse_samplesheet(entries, documents, fc_date, fc_name, fc_path, as_yaml=as_yaml)
Пример #7
0
def _make_casava_archive_files(fc, ssname, prefix, startiter=1, nseqout=1000):
    """Write a casava-style flowcell directory tree under ``ARCHIVE``.

    Creates ``ARCHIVE/<fc>`` containing the samplesheet ``<ssname>.csv``,
    ``RunInfo.xml`` and ``runParameters.xml`` rendered from the
    ``RUNINFO`` / ``RUNPARAMETERS`` templates, the
    ``Unaligned/Basecall_Stats_<ssname>`` directories, and one
    ``Unaligned/Project_<p>/Sample_<s>`` directory per samplesheet row
    holding a per-sample ``SampleSheet.csv``.  Finally hands the
    collected R1/R2 fastq output paths to ``_write_sample_fastq``.

    Args:
        fc: flowcell directory name, relative to ``ARCHIVE``.
        ssname: key into ``SAMPLESHEETS``; also names the samplesheet
            file and the Basecall_Stats directory.
        prefix: path prefix of the input files ``<prefix>_1.fastq`` and
            ``<prefix>_2.fastq``.
        startiter: forwarded to ``_write_sample_fastq``.
        nseqout: forwarded to ``_write_sample_fastq``.

    Returns:
        None.  Returns early, without writing sequences, as soon as an
        R1 output file is found to exist already.

    Raises:
        ValueError: if a data row is met before the ``FCID`` header row.
    """
    fc_dir = os.path.join(ARCHIVE, fc)
    if not os.path.exists(fc_dir):
        safe_makedir(fc_dir)
    with open(os.path.join(fc_dir, "{}.csv".format(ssname)), "w") as fh:
        fh.write(SAMPLESHEETS[ssname])

    # Both XML templates are rendered with the same set of values.
    render_kw = {
        'flowcell': os.path.basename(fc),
        'fc_id': fc_id(fc),
        'date': fc_parts(fc)[0],
        'instrument': fc.split("_")[1],
    }
    for xml_name, template in (("RunInfo.xml", RUNINFO),
                               ("runParameters.xml", RUNPARAMETERS)):
        with open(os.path.join(fc_dir, xml_name), "w") as fh:
            fh.write(template.render(**render_kw))

    basecall_stats_dir = os.path.join(fc_dir, "Unaligned", "Basecall_Stats_{}".format(ssname))
    stats_dirs = [basecall_stats_dir]
    stats_dirs.extend(os.path.join(basecall_stats_dir, x) for x in ["css", "Plots"])
    for d in stats_dirs:
        if not os.path.exists(d):
            safe_makedir(d)

    outf1 = []
    outf2 = []
    header = None
    for row in SAMPLESHEETS[ssname].split("\n"):
        # "".split(",") yields [""], so blank lines (e.g. a trailing
        # newline) must be detected on the row itself, not on len(vals).
        if not row.strip():
            continue
        vals = row.split(",")
        if vals[0] == "FCID":
            header = row
            continue
        if header is None:
            raise ValueError(
                "Samplesheet '{}' has a data row before the FCID header row".format(ssname))
        outdir = os.path.join(fc_dir, "Unaligned", "Project_{}".format(vals[5]), "Sample_{}".format(vals[2]))
        if not os.path.exists(outdir):
            safe_makedir(outdir)
        sample_sheet = os.path.join(outdir, "SampleSheet.csv")
        with open(sample_sheet, "w") as fh:
            LOG.info("Writing to {}".format(sample_sheet))
            fh.write("{}\n".format(header))
            fh.write("{}\n".format(row))
        r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        if os.path.exists(r1):
            # Existing output means the files were generated before; stop
            # here so they are not overwritten.
            LOG.info("{} already exists: if you want to rerun file generation remove {}".format(r1, r1))
            return
        outf1.append(r1)
        outf2.append(r2)

    ## Write sequences
    with open("{}_1.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf1, startiter=startiter, nseqout=nseqout)
    with open("{}_2.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf2, startiter=startiter, nseqout=nseqout)