示例#1
0
 def parse_samplesheet_csv(self):
     """Load the flowcell samplesheet CSV and store it as a JSON string.

     The file name is ``self["RunInfo"]["Flowcell"]`` with its first
     character dropped plus ``.csv``, looked up in ``self.path``. On success
     the rows are stored JSON-encoded under ``self["run_info_csv"]``; on any
     failure a warning is logged and the key is left untouched.
     """
     flowcell_id = self["RunInfo"]["Flowcell"][1:]
     log.info("parse_samplesheet_csv: going to read {}.csv in directory {}".format(flowcell_id, self.path))
     infile = os.path.join(os.path.abspath(self.path), "{}.csv".format(flowcell_id))
     try:
         # The context manager closes the file even when the CSV reader or
         # the JSON encoder raises.
         with open(infile) as fp:
             runinfo = json.dumps(list(csv.reader(fp)))
         self["run_info_csv"] = runinfo
     except Exception:
         # Best-effort: a missing or unreadable samplesheet is not fatal, but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         log.warn("No such file {}".format(infile))
示例#2
0
文件: __init__.py 项目: vals/bcbb
 def parse_samplesheet_csv(self):
     """Load the flowcell samplesheet CSV and store it as a JSON string.

     The file name is ``self["RunInfo"]["Flowcell"]`` with its first
     character dropped plus ``.csv``, looked up in ``self.path``. On success
     the rows are stored JSON-encoded under ``self["run_info_csv"]``; on any
     failure a warning is logged and the key is left untouched.
     """
     flowcell_id = self["RunInfo"]["Flowcell"][1:]
     log.info("parse_samplesheet_csv: going to read {}.csv in directory {}".format(flowcell_id, self.path))
     infile = os.path.join(os.path.abspath(self.path), "{}.csv".format(flowcell_id))
     try:
         # The context manager closes the file even when the CSV reader or
         # the JSON encoder raises.
         with open(infile) as fp:
             runinfo = json.dumps(list(csv.reader(fp)))
         self["run_info_csv"] = runinfo
     except Exception:
         # Best-effort: a missing or unreadable samplesheet is not fatal, but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         log.warn("No such file {}".format(infile))
示例#3
0
 def _parseRunInfo(self, fn="RunInfo.xml"):
     """Parse the run info file ``fn`` found in ``self.path``.

     The file is handed to ``RunInfoParser().parse`` and the result is stored
     under ``self["RunInfo"]``. On any failure a warning is logged and the
     key is left untouched.
     """
     infile = os.path.join(os.path.abspath(self.path), fn)
     # Report the file actually requested rather than the default name.
     log.info("_parseRunInfo: going to read {} in directory {}".format(fn, self.path))
     try:
         # The context manager closes the file even when the parser raises.
         with open(infile) as fp:
             data = RunInfoParser().parse(fp)
         self["RunInfo"] = data
     except Exception:
         # Best-effort: keep going without run info, but do not swallow
         # KeyboardInterrupt/SystemExit.
         log.warn("No such file %s" % infile)
示例#4
0
文件: __init__.py 项目: vals/bcbb
 def read_picard_metrics(self):
     """Collect Picard metrics for this sample into ``self["picard_metrics"]``.

     Files are selected with ``self.filter_files`` using a pattern built from
     ``self["lane"]`` and ``self["barcode_id"]`` and passed to
     ``ExtendedPicardMetricsParser.extract_metrics``. On any failure a
     warning is logged and the key is left untouched.
     """
     log.info("read_picard_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     picard_parser = ExtendedPicardMetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-.*.(align|hs|insert|dup)_metrics".format(self["lane"], self["barcode_id"])
     try:
         files = self.filter_files(pattern)
         self["picard_metrics"] = picard_parser.extract_metrics(files)
     except Exception:
         # Best-effort: missing metrics are not fatal, but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         log.warn("no picard metrics for sample {}".format(self["barcode_name"]))
示例#5
0
文件: __init__.py 项目: vals/bcbb
 def _parseRunInfo(self, fn="RunInfo.xml"):
     """Parse the run info file ``fn`` found in ``self.path``.

     The file is handed to ``RunInfoParser().parse`` and the result is stored
     under ``self["RunInfo"]``. On any failure a warning is logged and the
     key is left untouched.
     """
     infile = os.path.join(os.path.abspath(self.path), fn)
     # Report the file actually requested rather than the default name.
     log.info("_parseRunInfo: going to read {} in directory {}".format(fn, self.path))
     try:
         # The context manager closes the file even when the parser raises.
         with open(infile) as fp:
             data = RunInfoParser().parse(fp)
         self["RunInfo"] = data
     except Exception:
         # Best-effort: keep going without run info, but do not swallow
         # KeyboardInterrupt/SystemExit.
         log.warn("No such file %s" % infile)
示例#6
0
 def read_picard_metrics(self):
     """Collect Picard metrics for this sample into ``self["picard_metrics"]``.

     Files are selected with ``self.filter_files`` using a pattern built from
     ``self["lane"]`` and ``self["barcode_id"]`` and passed to
     ``ExtendedPicardMetricsParser.extract_metrics``. On any failure a
     warning is logged and the key is left untouched.
     """
     log.info("read_picard_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     picard_parser = ExtendedPicardMetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-.*.(align|hs|insert|dup)_metrics".format(self["lane"], self["barcode_id"])
     try:
         files = self.filter_files(pattern)
         self["picard_metrics"] = picard_parser.extract_metrics(files)
     except Exception:
         # Best-effort: missing metrics are not fatal, but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         log.warn("no picard metrics for sample {}".format(self["barcode_name"]))
示例#7
0
文件: __init__.py 项目: vals/bcbb
 def parse_run_info_yaml(self, run_info_yaml="run_info.yaml"):
     """Load ``run_info_yaml`` from ``self.path`` into ``self["run_info_yaml"]``.

     On any failure a warning is logged and the key is left untouched.
     """
     log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
     infile = os.path.join(os.path.abspath(self.path), run_info_yaml)
     try:
         # The context manager closes the file even when YAML parsing raises.
         with open(infile) as fp:
             # NOTE(review): yaml.load can construct arbitrary Python objects
             # and newer PyYAML releases require an explicit Loader; consider
             # yaml.safe_load if this file is not fully trusted.
             runinfo = yaml.load(fp)
         self["run_info_yaml"] = runinfo
     except Exception:
         # Best-effort: a missing or unreadable file is not fatal, but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         log.warn("No such file {}".format(infile))
示例#8
0
 def parse_run_info_yaml(self, run_info_yaml="run_info.yaml"):
     """Load ``run_info_yaml`` from ``self.path`` into ``self["run_info_yaml"]``.

     On any failure a warning is logged and the key is left untouched.
     """
     log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
     infile = os.path.join(os.path.abspath(self.path), run_info_yaml)
     try:
         # The context manager closes the file even when YAML parsing raises.
         with open(infile) as fp:
             # NOTE(review): yaml.load can construct arbitrary Python objects
             # and newer PyYAML releases require an explicit Loader; consider
             # yaml.safe_load if this file is not fully trusted.
             runinfo = yaml.load(fp)
         self["run_info_yaml"] = runinfo
     except Exception:
         # Best-effort: a missing or unreadable file is not fatal, but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         log.warn("No such file {}".format(infile))
示例#9
0
文件: lane.py 项目: aminmg/bcbb
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare the samples of one lane for downstream processing.

    Optionally removes phiX reads lane-wise, splits the lane fastq files by
    barcode and returns one tuple
    ``(fastq1, fastq2, item, lane_name, lane_description, dirs, config)``
    for every item of ``lane_items`` whose barcode has output files.
    """
    first_item = lane_items[0]
    lane_name = "%s_%s_%s" % (first_item['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(
        dirs["fastq"], dirs["work"], first_item, fc_name, config=config)

    # Lane-wise phiX filtering is driven by the first item's custom config.
    custom_config = _update_config_w_custom(config, first_item)
    lane_algorithm = custom_config["algorithm"]
    if lane_algorithm.get("filter_phix", False):
        if not lane_algorithm.get("demultiplexed", False):
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix", "description": lane_name}
            processed = remove_contaminants(
                full_fastq1, full_fastq2, info, lane_name,
                info["description"], dirs, custom_config)
            full_fastq1, full_fastq2, _, lane_name = processed[0][0:4]
        else:
            # Already demultiplexed input is screened per sample instead.
            logger.warn("Will not filter phix lane-wise on already demultiplexed files. "
                        "You will have to specify genomes_filter_out option for each sample")

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(
        full_fastq1, full_fastq2, lane_items, lane_name, dirs, config)

    prepared = []
    for item in lane_items:
        # config is rebound on every iteration, so each call receives the
        # config produced for the previous item.
        config = _update_config_w_custom(config, item)
        barcode_id = item["barcode_id"]
        # All barcodes may be listed even when some produced no sequences.
        if barcode_id not in bc_files:
            continue

        fastq1, fastq2 = bc_files[barcode_id]
        algorithm = config["algorithm"]

        description = item["description"]
        if item.get("name", "") and algorithm.get("include_short_name", True):
            description = "%s : %s" % (item["name"], description)

        sample_lane_name = lane_name
        if barcode_id is not None:
            sample_lane_name = sample_lane_name + "_%s" % (barcode_id)

        if algorithm.get("trim_reads", False):
            to_trim = [fq for fq in [fastq1, fastq2] if fq is not None]
            trim_info = brun_trim_fastq(to_trim, dirs, config)
            fastq1 = trim_info[0]
            if fastq2 is not None:
                fastq2 = trim_info[1]

        prepared.append((fastq1, fastq2, item, sample_lane_name, description,
                         dirs, config))

    return prepared
示例#10
0
 def parse_fastq_screen(self):
     """Read fastq_screen metrics into ``self["metrics"]["fastq_scr"]``.

     The first file returned by ``self.filter_files`` for a pattern built
     from ``self["lane"]`` and ``self["barcode_id"]`` is parsed with
     ``MetricsParser.parse_fastq_screen_metrics``. On any failure (including
     no file being found) a warning is logged and nothing is stored.
     """
     log.info("parse_fastq_screen for sample {}, lane {} in run {}".format(self["barcode_name"], self["lane"], self["flowcell"]))
     parser = MetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}_[12]_fastq_screen.txt".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_fastq_screen_metrics(fp)
         self["metrics"]["fastq_scr"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("no fastq screen metrics for sample {}".format(self["barcode_name"]))
示例#11
0
 def parse_bc_metrics(self):
     """Read the barcode metrics of this lane into ``self["bc_metrics"]``.

     The first file returned by ``self.filter_files`` for a pattern built
     from ``self["lane"]`` is parsed with ``MetricsParser.parse_bc_metrics``.
     On any failure (including no file being found) a warning is logged and
     nothing is stored.
     """
     log.info("parse_bc_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "{}*barcode/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.bc_metrics".format(self["lane"], self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_bc_metrics(fp)
         self["bc_metrics"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
示例#12
0
文件: __init__.py 项目: vals/bcbb
 def parse_fastq_screen(self):
     """Read fastq_screen metrics into ``self["metrics"]["fastq_scr"]``.

     The first file returned by ``self.filter_files`` for a pattern built
     from ``self["lane"]`` and ``self["barcode_id"]`` is parsed with
     ``MetricsParser.parse_fastq_screen_metrics``. On any failure (including
     no file being found) a warning is logged and nothing is stored.
     """
     log.info("parse_fastq_screen for sample {}, lane {} in run {}".format(self["barcode_name"], self["lane"], self["flowcell"]))
     parser = MetricsParser()
     pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}_[12]_fastq_screen.txt".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_fastq_screen_metrics(fp)
         self["metrics"]["fastq_scr"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("no fastq screen metrics for sample {}".format(self["barcode_name"]))
示例#13
0
文件: __init__.py 项目: vals/bcbb
 def parse_bc_metrics(self):
     """Read the barcode metrics of this lane into ``self["bc_metrics"]``.

     The first file returned by ``self.filter_files`` for a pattern built
     from ``self["lane"]`` is parsed with ``MetricsParser.parse_bc_metrics``.
     On any failure (including no file being found) a warning is logged and
     nothing is stored.
     """
     log.info("parse_bc_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "{}*barcode/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.bc_metrics".format(self["lane"], self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_bc_metrics(fp)
         self["bc_metrics"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
示例#14
0
 def parse_filter_metrics(self, re_str="*filter[_.]metrics"):
     """Read the nophix filter metrics of this lane into ``self["filter_metrics"]``.

     ``self["filter_metrics"]`` is first reset to a dict whose ``reads``,
     ``reads_aligned`` and ``reads_fail_align`` values are ``None``. It is
     replaced by the parsed data only when the first file returned by
     ``self.filter_files`` can be opened and parsed; otherwise a warning is
     logged. ``re_str`` is accepted but not used by this method.
     """
     log.info("parse_filter_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "nophix/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(self["lane"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_filter_metrics(fp)
         self["filter_metrics"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
示例#15
0
文件: __init__.py 项目: vals/bcbb
 def parse_filter_metrics(self, re_str="*filter[_.]metrics"):
     """Read the nophix filter metrics of this lane into ``self["filter_metrics"]``.

     ``self["filter_metrics"]`` is first reset to a dict whose ``reads``,
     ``reads_aligned`` and ``reads_fail_align`` values are ``None``. It is
     replaced by the parsed data only when the first file returned by
     ``self.filter_files`` can be opened and parsed; otherwise a warning is
     logged. ``re_str`` is accepted but not used by this method.
     """
     log.info("parse_filter_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
     pattern = "nophix/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(self["lane"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_filter_metrics(fp)
         self["filter_metrics"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
示例#16
0
 def parse_bc_metrics(self):
     """Parse bc metrics at sample level.

     The first file returned by ``self.filter_files`` for a pattern built
     from ``self["lane"]`` is parsed with ``MetricsParser.parse_bc_metrics``
     and the entry keyed by ``str(self["barcode_id"])`` is stored under
     ``self["bc_count"]``. On any failure (no file, parse error, barcode
     missing from the parsed data) a warning is logged and nothing is stored.
     """
     log.info("parse_bc_metrics for sample {}, project {} in flowcell {}".format(self["barcode_name"], self["sample_prj"], self["flowcell"]))
     # Raw string: same pattern text, without the invalid "\." escape.
     pattern = r"{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_bc_metrics(fp)
         self["bc_count"] = data[str(self["barcode_id"])]
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
示例#17
0
文件: __init__.py 项目: vals/bcbb
 def parse_bc_metrics(self):
     """Parse bc metrics at sample level.

     The first file returned by ``self.filter_files`` for a pattern built
     from ``self["lane"]`` is parsed with ``MetricsParser.parse_bc_metrics``
     and the entry keyed by ``str(self["barcode_id"])`` is stored under
     ``self["bc_count"]``. On any failure (no file, parse error, barcode
     missing from the parsed data) a warning is logged and nothing is stored.
     """
     log.info("parse_bc_metrics for sample {}, project {} in flowcell {}".format(self["barcode_name"], self["sample_prj"], self["flowcell"]))
     # Raw string: same pattern text, without the invalid "\." escape.
     pattern = r"{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(self["lane"])
     files = self.filter_files(pattern)
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_bc_metrics(fp)
         self["bc_count"] = data[str(self["barcode_id"])]
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No bc_metrics info for lane {}".format(self["lane"]))
示例#18
0
文件: __init__.py 项目: vals/bcbb
 def read_fastqc_metrics(self):
     """Read the FastQC summary of this sample into ``self["fastqc"]``.

     Samples whose ``barcode_name`` is ``"unmatched"`` are skipped entirely.
     Otherwise ``self["fastqc"]`` is reset to ``{'stats': None}`` and, when a
     matching file is returned by ``self.filter_files``, replaced by the
     summary produced by ``ExtendedFastQCParser`` for that file's directory.
     On any failure a warning is logged and the reset value is kept.
     """
     log.info("read_fastqc_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     if self["barcode_name"] == "unmatched":
         return
     self["fastqc"] = {'stats':None}
     pattern = "fastqc/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-*".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         fastqc_dir = os.path.dirname(files[0])
         stats = ExtendedFastQCParser(fastqc_dir).get_fastqc_summary()
         self["fastqc"] = {'stats':stats}
     except Exception:
         # The warning now names FastQC; the earlier text was copied from the
         # fastq_screen reader. KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("no fastqc metrics for sample {}".format(self["barcode_name"]))
示例#19
0
 def read_fastqc_metrics(self):
     """Read the FastQC summary of this sample into ``self["fastqc"]``.

     Samples whose ``barcode_name`` is ``"unmatched"`` are skipped entirely.
     Otherwise ``self["fastqc"]`` is reset to ``{'stats': None}`` and, when a
     matching file is returned by ``self.filter_files``, replaced by the
     summary produced by ``ExtendedFastQCParser`` for that file's directory.
     On any failure a warning is logged and the reset value is kept.
     """
     log.info("read_fastqc_metrics for sample {}, project {}, lane {} in run {}".format(self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
     if self["barcode_name"] == "unmatched":
         return
     self["fastqc"] = {'stats':None}
     pattern = "fastqc/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-*".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     try:
         fastqc_dir = os.path.dirname(files[0])
         stats = ExtendedFastQCParser(fastqc_dir).get_fastqc_summary()
         self["fastqc"] = {'stats':stats}
     except Exception:
         # The warning now names FastQC; the earlier text was copied from the
         # fastq_screen reader. KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("no fastqc metrics for sample {}".format(self["barcode_name"]))
示例#20
0
 def parse_filter_metrics(self):
     """CASAVA: Parse filter metrics at sample level.

     ``self["filter_metrics"]`` is first reset to a dict whose ``reads``,
     ``reads_aligned`` and ``reads_fail_align`` values are ``None``. It is
     replaced by the parsed data only when the first file returned by
     ``self.filter_files`` (pattern built from ``self["lane"]`` and
     ``self["barcode_id"]``) can be opened and parsed; otherwise a warning
     is logged.
     """
     log.info("parse_filter_metrics for lane {}, project {} in flowcell {}".format(self["lane"], self["sample_prj"], self["flowcell"]))
     pattern = "{}_[0-9]+_[0-9A-Za-z]+_{}(_nophix)?.filter_metrics".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_filter_metrics(fp)
         self["filter_metrics"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
示例#21
0
文件: __init__.py 项目: vals/bcbb
 def parse_filter_metrics(self):
     """CASAVA: Parse filter metrics at sample level.

     ``self["filter_metrics"]`` is first reset to a dict whose ``reads``,
     ``reads_aligned`` and ``reads_fail_align`` values are ``None``. It is
     replaced by the parsed data only when the first file returned by
     ``self.filter_files`` (pattern built from ``self["lane"]`` and
     ``self["barcode_id"]``) can be opened and parsed; otherwise a warning
     is logged.
     """
     log.info("parse_filter_metrics for lane {}, project {} in flowcell {}".format(self["lane"], self["sample_prj"], self["flowcell"]))
     pattern = "{}_[0-9]+_[0-9A-Za-z]+_{}(_nophix)?.filter_metrics".format(self["lane"], self["barcode_id"])
     files = self.filter_files(pattern)
     self["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
     try:
         parser = MetricsParser()
         # The context manager closes the file even when the parser raises.
         with open(files[0]) as fp:
             data = parser.parse_filter_metrics(fp)
         self["filter_metrics"] = data
     except Exception:
         # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
         log.warn("No filter nophix metrics for lane {}".format(self["lane"]))
示例#22
0
def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append, keys=None):
    """Generic method to write a set of rows to a worksheet on google docs.

    A worksheet titled ``wsheet_title`` is added to ``ssheet`` (``append`` is
    forwarded to ``g_spreadsheet.add_worksheet``). When ``append`` is true
    and ``keys`` is non-empty, each incoming row is matched against the rows
    already in the worksheet on the columns named in ``keys`` and the first
    matching pre-existing row is deleted before writing.

    Returns ``False`` when the worksheet could not be added, otherwise the
    result of ``g_spreadsheet.write_rows``.
    """
    # Avoid a shared mutable default; None means "no duplicate detection".
    if keys is None:
        keys = []

    # Convert the worksheet title to unicode
    wsheet_title = _to_unicode(wsheet_title)

    # Add a new worksheet, possibly appending or replacing a pre-existing
    # worksheet according to the append-flag.
    wsheet = g_spreadsheet.add_worksheet(client,
                                         ssheet,
                                         wsheet_title,
                                         len(rows) + 1,
                                         len(header),
                                         append)
    if wsheet is None:
        logger2.error("ERROR: Could not add a worksheet {!r} to "
                      "spreadsheet {!r}".format(wsheet_title, ssheet.title.text))
        return False

    # If keys are specified (will correspond to indexes in the header),
    # delete pre-existing rows with matching keys
    if append and len(keys) > 0:
        wsheet_data = g_spreadsheet.get_cell_content(client, ssheet, wsheet, '2')
        wsheet_header = g_spreadsheet.get_header(client, ssheet, wsheet)
        try:
            wsheet_indexes = [wsheet_header.index(key) for key in keys]
            header_indexes = [header.index(key) for key in keys]
        except ValueError:
            logger2.warn("WARNING: Could not identify correct header for duplicate detection")
        else:
            for row in rows:
                try:
                    key = "#".join([row[i] for i in header_indexes])
                    for i, wrow in enumerate(wsheet_data):
                        wkey = "#".join([wrow[j] for j in wsheet_indexes])
                        if wkey == key:
                            g_spreadsheet.delete_row(client, ssheet, wsheet, i+1)
                            # Keep the local copy in step with the sheet and
                            # stop: only the first match is removed per row.
                            wsheet_data.pop(i)
                            break
                except Exception:
                    # Best-effort per row; KeyboardInterrupt/SystemExit are
                    # no longer swallowed.
                    logger2.warn("WARNING: Could not identify/replace duplicate rows")

    # Write the data to the worksheet
    success = g_spreadsheet.write_rows(client, ssheet, wsheet, header, rows)
    if success:
        logger2.info("Wrote data to the {!r}:{!r} "
                     "worksheet".format(ssheet.title.text, wsheet_title))
    else:
        logger2.error("ERROR: Could not write data to the {!r}:{!r} "
                      "worksheet".format(ssheet.title.text, wsheet_title))
    return success
示例#23
0
文件: __init__.py 项目: vals/bcbb
 def parse_bc_metrics(self):
     """Parse bc metrics for every lane in ``self._lanes``.

     For each lane, ``self["lanes"][str(lane)]["bc_metrics"]`` is first reset
     to a dict whose ``reads``, ``reads_aligned`` and ``reads_fail_align``
     values are ``None``. It is replaced by the parsed data only when the
     first file returned by ``self.filter_files`` can be opened and parsed;
     otherwise a warning is logged and the next lane is processed.
     """
     log.info("parse_bc_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
     for lane in self._lanes:
         lane_key = str(lane)
         # Raw string: same pattern text, without the invalid "\." escape.
         pattern = r"{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(lane)
         self["lanes"][lane_key]["bc_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
         files = self.filter_files(pattern)
         try:
             parser = MetricsParser()
             # The context manager closes the file even when the parser raises.
             with open(files[0]) as fp:
                 data = parser.parse_bc_metrics(fp)
             self["lanes"][lane_key]["bc_metrics"] = data
         except Exception:
             # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
             log.warn("No bc_metrics info for lane {}".format(lane))
示例#24
0
 def parse_bc_metrics(self):
     """Parse bc metrics for every lane in ``self._lanes``.

     For each lane, ``self["lanes"][str(lane)]["bc_metrics"]`` is first reset
     to a dict whose ``reads``, ``reads_aligned`` and ``reads_fail_align``
     values are ``None``. It is replaced by the parsed data only when the
     first file returned by ``self.filter_files`` can be opened and parsed;
     otherwise a warning is logged and the next lane is processed.
     """
     log.info("parse_bc_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
     for lane in self._lanes:
         lane_key = str(lane)
         # Raw string: same pattern text, without the invalid "\." escape.
         pattern = r"{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(lane)
         self["lanes"][lane_key]["bc_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
         files = self.filter_files(pattern)
         try:
             parser = MetricsParser()
             # The context manager closes the file even when the parser raises.
             with open(files[0]) as fp:
                 data = parser.parse_bc_metrics(fp)
             self["lanes"][lane_key]["bc_metrics"] = data
         except Exception:
             # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
             log.warn("No bc_metrics info for lane {}".format(lane))
示例#25
0
文件: __init__.py 项目: vals/bcbb
 def parse_filter_metrics(self):
     """pre-CASAVA: Parse filter metrics at flowcell level.

     For each lane in ``self._lanes``,
     ``self["lanes"][str(lane)]["filter_metrics"]`` is first reset to a dict
     whose ``reads``, ``reads_aligned`` and ``reads_fail_align`` values are
     ``None``. It is replaced by the parsed data only when the first file
     returned by ``self.filter_files`` can be opened and parsed; otherwise a
     warning is logged and the next lane is processed.
     """
     log.info("parse_filter_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
     for lane in self._lanes:
         lane_key = str(lane)
         pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(lane)
         self["lanes"][lane_key]["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
         files = self.filter_files(pattern)
         try:
             parser = MetricsParser()
             # The context manager closes the file even when the parser raises.
             with open(files[0]) as fp:
                 data = parser.parse_filter_metrics(fp)
             self["lanes"][lane_key]["filter_metrics"] = data
         except Exception:
             # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
             log.warn("No filter nophix metrics for lane {}".format(lane))
示例#26
0
 def parse_filter_metrics(self):
     """pre-CASAVA: Parse filter metrics at flowcell level.

     For each lane in ``self._lanes``,
     ``self["lanes"][str(lane)]["filter_metrics"]`` is first reset to a dict
     whose ``reads``, ``reads_aligned`` and ``reads_fail_align`` values are
     ``None``. It is replaced by the parsed data only when the first file
     returned by ``self.filter_files`` can be opened and parsed; otherwise a
     warning is logged and the next lane is processed.
     """
     log.info("parse_filter_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
     for lane in self._lanes:
         lane_key = str(lane)
         pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(lane)
         self["lanes"][lane_key]["filter_metrics"] = {"reads":None, "reads_aligned":None, "reads_fail_align":None}
         files = self.filter_files(pattern)
         try:
             parser = MetricsParser()
             # The context manager closes the file even when the parser raises.
             with open(files[0]) as fp:
                 data = parser.parse_filter_metrics(fp)
             self["lanes"][lane_key]["filter_metrics"] = data
         except Exception:
             # Best-effort, but KeyboardInterrupt/SystemExit are not swallowed.
             log.warn("No filter nophix metrics for lane {}".format(lane))
示例#27
0
def get_spreadsheet(ssheet_title, encoded_credentials):
    """Connect to Google docs and get a spreadsheet.

    Returns a ``(client, spreadsheet)`` tuple, or ``(None, None)`` when no
    spreadsheet with the given title is found.
    """

    # Convert the spreadsheet title to unicode
    ssheet_title = _to_unicode(ssheet_title)

    # Create a client class which will make HTTP requests with Google Docs server.
    client = g_spreadsheet.get_client()
    bcbio.google.connection.authenticate(client, encoded_credentials)

    # Locate the spreadsheet
    ssheet = g_spreadsheet.get_spreadsheet(client, ssheet_title)

    # Check that we got a result back
    if not ssheet:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which embedded the source indentation in the message.
        logger2.warn("No document with specified title '%s' found in "
                     "GoogleDocs repository" % ssheet_title)
        return (None, None)

    return (client, ssheet)
示例#28
0
 def parse_run_info_yaml(self, run_info_yaml):
     """Create lane and sample QC objects from a run_info YAML file.

     For every entry of the loaded YAML, a ``LaneQCMetrics`` object plus an
     "unmatched" ``SampleQCMetrics`` object are registered the first time a
     lane is seen, and one ``SampleQCMetrics`` object is registered in
     ``self.sample`` for each item under the entry's ``multiplex`` key. The
     loaded data is finally stored under ``self["metrics"]["run_info_yaml"]``.

     Errors opening or parsing the file propagate to the caller.
     """
     log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
     # The context manager closes the file even when YAML parsing raises.
     with open(run_info_yaml) as fp:
         # NOTE(review): yaml.load can construct arbitrary Python objects and
         # newer PyYAML releases require an explicit Loader; consider
         # yaml.safe_load if this file is not fully trusted.
         runinfo = yaml.load(fp)
     for info in runinfo:
         lane_id = info["lane"]
         if lane_id not in self["lane"]:
             lane = LaneQCMetrics(self.get_full_flowcell(), self.get_date(), lane_id)
             self["lane"][lane_id] = lane
             ## Add sample for unmatched data
             sample = SampleQCMetrics(self.get_full_flowcell(), self.get_date(), lane_id, "unmatched", "unmatched", "NA", "NA", "NA", "NA")
             bc_index = "%s_%s" % (lane_id, "unmatched")
             self.sample[bc_index] = sample
         ## Lane could be empty
         try:
             for mp in info["multiplex"]:
                 sample = SampleQCMetrics(self.get_full_flowcell(), self.get_date(), lane_id, mp["name"], mp["barcode_id"], mp.get("sample_prj", None), mp["sequence"], mp["barcode_type"], mp.get("genomes_filter_out", None))
                 bc_index = "%s_%s" % (lane_id, mp["barcode_id"])
                 self.sample[bc_index] = sample
         except Exception:
             # Best-effort per lane; KeyboardInterrupt/SystemExit are no
             # longer swallowed.
             log.warn("No multiplexing information for lane %s" % lane_id)
     self["metrics"]["run_info_yaml"] = runinfo
示例#29
0
文件: __init__.py 项目: vals/bcbb
 def parse_run_info_yaml(self, run_info_yaml):
     """Create lane and sample QC objects from a run_info YAML file.

     For every entry of the loaded YAML, a ``LaneQCMetrics`` object plus an
     "unmatched" ``SampleQCMetrics`` object are registered the first time a
     lane is seen, and one ``SampleQCMetrics`` object is registered in
     ``self.sample`` for each item under the entry's ``multiplex`` key. The
     loaded data is finally stored under ``self["metrics"]["run_info_yaml"]``.

     Errors opening or parsing the file propagate to the caller.
     """
     log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
     # The context manager closes the file even when YAML parsing raises.
     with open(run_info_yaml) as fp:
         # NOTE(review): yaml.load can construct arbitrary Python objects and
         # newer PyYAML releases require an explicit Loader; consider
         # yaml.safe_load if this file is not fully trusted.
         runinfo = yaml.load(fp)
     for info in runinfo:
         lane_id = info["lane"]
         if lane_id not in self["lane"]:
             lane = LaneQCMetrics(self.get_full_flowcell(), self.get_date(), lane_id)
             self["lane"][lane_id] = lane
             ## Add sample for unmatched data
             sample = SampleQCMetrics(self.get_full_flowcell(), self.get_date(), lane_id, "unmatched", "unmatched", "NA", "NA", "NA", "NA")
             bc_index = "%s_%s" % (lane_id, "unmatched")
             self.sample[bc_index] = sample
         ## Lane could be empty
         try:
             for mp in info["multiplex"]:
                 sample = SampleQCMetrics(self.get_full_flowcell(), self.get_date(), lane_id, mp["name"], mp["barcode_id"], mp.get("sample_prj", None), mp["sequence"], mp["barcode_type"], mp.get("genomes_filter_out", None))
                 bc_index = "%s_%s" % (lane_id, mp["barcode_id"])
                 self.sample[bc_index] = sample
         except Exception:
             # Best-effort per lane; KeyboardInterrupt/SystemExit are no
             # longer swallowed.
             log.warn("No multiplexing information for lane %s" % lane_id)
     self["metrics"]["run_info_yaml"] = runinfo
示例#30
0
def split_sample_name(sample_name):
    """Split a sample name into its name part and its index part.

    The expected layout is ``PNNN_NNN[F][B][_indexN]``:
        - project_name [PNNN]
        - sample_number [NNN]
        - reception_qc [F]
        - prep_version [B]
        - index_id [indexN]

    A name that does not follow this layout only triggers a warning; it is
    still split. Returns a 2-tuple ``(name, index)``: the underscore-joined
    parts before the first part containing "index", and the underscore-joined
    remainder (an empty string when no part contains "index").
    """
    splits = sample_name.split("_")

    # Validation only decides whether to warn; it never alters the result,
    # so the parts are inspected without being modified.
    try:
        if len(splits) < 2 or not splits[0].startswith('P'):
            raise ValueError()
        # Project number: everything after the leading 'P' must be an integer.
        int(splits[0][1:])
        # Sample number: an integer once trailing F/B flags are removed.
        int(splits[1].rstrip("FB"))
    except ValueError:
        logger2.warn(
            "Sample name '%s' does not follow the expected format PXXX_XXX[FB]"
            % sample_name)

    # Everything from the first part mentioning 'index' onwards is the index.
    first_index = len(splits)
    for pos, part in enumerate(splits):
        if 'index' in part:
            first_index = pos
            break
    return "_".join(splits[:first_index]), "_".join(splits[first_index:])
示例#31
0
文件: flowcell.py 项目: aminmg/bcbb
def split_sample_name(sample_name):
    """Split a sample name into its name part and its index part.

    The expected layout is ``PNNN_NNN[F][B][_indexN]``:
        - project_name [PNNN]
        - sample_number [NNN]
        - reception_qc [F]
        - prep_version [B]
        - index_id [indexN]

    A name that does not follow this layout only triggers a warning; it is
    still split. Returns a 2-tuple ``(name, index)``: the underscore-joined
    parts before the first part containing "index", and the underscore-joined
    remainder (an empty string when no part contains "index").
    """
    splits = sample_name.split("_")

    # Validation only decides whether to warn; it never alters the result,
    # so the parts are inspected without being modified.
    try:
        if len(splits) < 2 or not splits[0].startswith('P'):
            raise ValueError()
        # Project number: everything after the leading 'P' must be an integer.
        int(splits[0][1:])
        # Sample number: an integer once trailing F/B flags are removed.
        int(splits[1].rstrip("FB"))
    except ValueError:
        logger2.warn("Sample name '%s' does not follow the expected format PXXX_XXX[FB]" % sample_name)

    # Everything from the first part mentioning 'index' onwards is the index.
    first_index = len(splits)
    for pos, part in enumerate(splits):
        if 'index' in part:
            first_index = pos
            break
    return "_".join(splits[:first_index]), "_".join(splits[first_index:])
示例#32
0
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.

    Returns a dict mapping barcode id to a ``(fastq1, fastq2)`` tuple. Only
    barcodes whose first fastq file exists on disk are included. When
    ``multiplex`` holds a single entry whose ``barcode_id`` is ``None`` the
    input files are returned unchanged under the key ``None``.

    When ``config["algorithm"]["demultiplexed"]`` is set, the per-barcode
    files come from ``get_fastq_files`` instead of the external barcode
    program, and a metrics file is written via ``_write_demultiplex_metrics``.
    """
    unmatched_str = "unmatched"
    demultiplexed = config["algorithm"].get("demultiplexed", False)
    # Nothing to split: a single, un-barcoded entry.
    if len(multiplex) == 1 and multiplex[0]["barcode_id"] is None:
        return {None: (fastq1, fastq2)}

    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    # NOTE(review): nomatch_file is assigned but not used in this function.
    nomatch_file = "%s_%s_1_fastq.txt" % (base_name, unmatched_str)
    metrics_file = "%s.bc_metrics" % base_name
    # Collect (barcode_id, file1, file2) for every multiplex entry.
    out_files = []
    for info in multiplex:
        if demultiplexed:
            out_tuple = [info["barcode_id"]]
            # If the data is already demultiplexed, the sequence files must have been specified in the config
            out_tuple.extend(get_fastq_files(dirs["fastq"], dirs["work"],
                                               info, "", config=config))
            #out_tuple.extend([fastq1,fastq2])
            out_files.append(tuple(out_tuple))
            continue

        # Expected output names of the barcode splitting program inside bc_dir.
        fq_fname = lambda x: os.path.join(bc_dir, "%s_%s_%s_fastq.txt" %
                             (base_name, info["barcode_id"], x))
        bc_file1 = fq_fname("1")
        bc_file2 = fq_fname("2") if fastq2 else None
        out_files.append((info["barcode_id"], bc_file1, bc_file2))

    # Run the external barcode program only when its output directory does
    # not exist yet and the input is not already demultiplexed.
    if not utils.file_exists(bc_dir) and not demultiplexed:
        with file_transaction(bc_dir) as tx_bc_dir:
            with utils.chdir(tx_bc_dir):
                tag_file, need_trim = _make_tag_file(multiplex, unmatched_str, config)
                # Command line: program, tag file, output name template, inputs.
                cl = [config["program"]["barcode"], tag_file,
                      "%s_--b--_--r--_fastq.txt" % base_name, fastq1]
                if fastq2:
                    cl.append(fastq2)

                cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
                cl.append("--metrics=%s" % metrics_file)
                if int(config["algorithm"]["bc_read"]) > 1:
                    cl.append("--read=%s" % config["algorithm"]["bc_read"])

                if int(config["algorithm"]["bc_position"]) == 5:
                    cl.append("--five")

                if config["algorithm"].get("bc_allow_indels", True) is False:
                    cl.append("--noindel")

                if "bc_offset" in config["algorithm"]:
                    cl.append("--bc_offset=%s" % config["algorithm"]["bc_offset"])

                subprocess.check_call(cl)

    else:
        # No splitting needed; the tag file is generated in a temporary
        # directory only to obtain need_trim.
        with utils.curdir_tmpdir() as tmp_dir:
            with utils.chdir(tmp_dir):
                _, need_trim = _make_tag_file(multiplex, unmatched_str, config)

    # Keep only barcodes whose first file exists, trimming where requested.
    out = {}
    for b, f1, f2 in out_files:
        if os.path.exists(f1):
            if b in need_trim:
                f1, f2 = _basic_trim(f1, f2, need_trim[b], config)

            out[b] = (f1, f2)

    if not demultiplexed:
        return out

    # Demultiplexed input: build barcode metrics from the CASAVA statistics
    # file when it can be found, otherwise write a placeholder entry.
    casava_stats = _find_demultiplex_stats_htm(base_name, config)
    if not casava_stats:
        logger2.warn("Demultiplex_Stats.htm not found! " \
                     "Barcode stats will be meaningless.")
        bc_metrics = {int(multiplex[0]["lane"]): \
                        {None: {
                             "read_count": 0,
                             "name": None,
                             "barcode_id": None}}
                             }
    else:
        bc_metrics = _parse_demultiplex_stats_htm(casava_stats)

    _write_demultiplex_metrics(multiplex, bc_metrics, metrics_file)
    
    return out
示例#33
0
文件: qcreport.py 项目: vals/bcbb
def report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config):
    """
    Create statusdb report on a couchdb server.

    A FlowcellQCMetrics object holds information about a flowcell. QC
    results are stored at the flowcell level and sample level
    depending on analysis. Lane level QC data are stored in the
    FlowcellQCMetrics object.

    Returns True when the report was written. Returns False when the
    ``statusdb`` section or its ``url`` is missing from ``config``, when the
    FlowcellQCMetrics object could not be created, or when connecting to or
    saving to the server raised an exception.
    """
    success = True
    try:
        statusdb_config = config.get("statusdb", None)
        if statusdb_config is None:
            log.info("Could not find statusdb section in configuration. No statusdb reporting will be done")
            return False
        statusdb_url = statusdb_config.get("url", None)
        if statusdb_url is None:
            log.warn("No url field found in statusdb configuration section.")
            return False

        # Add email notification
        email = statusdb_config.get("statusdb_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email, 'smtp_host': smtp_host, 'smtp_port': smtp_port}, True)

        with log_handler.applicationbound():
            with logbook.Processor(lambda record: record.extra.__setitem__('run', "%s_%s" % (fc_date, fc_name))):
                log.info("Started creating QC Metrics report on statusdb for %s_%s on %s" % (fc_date, fc_name, datetime.now().isoformat()))

                # Create object and parse all available metrics; no checking
                # is currently done for missing files
                try:
                    qc_obj = FlowcellQCMetrics(fc_date, fc_name, run_info_yaml, dirs.get("work", None), dirs.get("flowcell", None))
                except Exception:
                    qc_obj = None

                if qc_obj is None:
                    log.warn("Couldn't populate FlowcellQCMetrics object. No QC data written to statusdb for %s_%s" % (fc_date, fc_name))
                    success = False
                else:
                    try:
                        log.info("Connecting to server at %s" % statusdb_url)
                        try:
                            couch = couchdb.Server(url="http://%s" % statusdb_url)
                        except Exception:
                            log.warn("Connecting to server at %s failed" % statusdb_url)
                            # Re-raise so no "succeeded" message is logged and
                            # the unbound server object is never used.
                            raise
                        log.info("Connecting to server at %s succeeded" % statusdb_url)
                        db = couch['qc']
                        # Save data at a sample level
                        for s in qc_obj.sample.keys():
                            obj = qc_obj.sample[s]
                            log.info("Saving sample %s" % obj.name())
                            _save_obj(db, obj, statusdb_url)
                        # Save flowcell object
                        _save_obj(db, qc_obj, statusdb_url)
                    except Exception as e:
                        # Record why the report failed instead of dropping
                        # the exception silently.
                        log.warn("Could not save QC data to statusdb at %s: %s" % (statusdb_url, e))
                        success = False
            if success:
                log.info("QC Metrics report successfully written to statusdb for %s_%s on %s" \
                         % (fc_date, fc_name, datetime.now().isoformat()))
            else:
                log.warn("Encountered exception when writing to statusdb for %s_%s on %s" \
                         % (fc_date, fc_name, datetime.now().isoformat()))

    except Exception as e:
        success = False
        log.warn("Encountered exception when writing QC metrics to statusdb: %s" % e)

    return success
示例#34
0
def report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config):
    """
    Create statusdb report on a couchdb server.

    A FlowcellQCMetrics object holds information about a flowcell. QC
    results are stored at the flowcell level and sample level
    depending on analysis. Lane level QC data are stored in the
    FlowcellQCMetrics object.

    Arguments:
    fc_name -- flowcell name, used in log messages and passed to
               FlowcellQCMetrics
    fc_date -- flowcell date, used in log messages and passed to
               FlowcellQCMetrics
    run_info_yaml -- passed through unchanged to FlowcellQCMetrics
    dirs -- mapping; its "work" and "flowcell" entries (None when absent)
            are passed to FlowcellQCMetrics
    config -- mapping read for the "statusdb" section ("url" and the
              optional "statusdb_email_notification") and for the
              top-level "smtp_host" and "smtp_port" entries

    Returns True when every sample document and the flowcell document
    were handed to _save_obj without an error, False otherwise (missing
    configuration, failure to build the metrics object, or any error while
    talking to the server). Errors are logged and reported through the
    return value rather than raised.
    """
    success = True
    try:
        statusdb_config = config.get("statusdb", None)
        if statusdb_config is None:
            log.info(
                "Could not find statusdb section in configuration. No statusdb reporting will be done"
            )
            return False

        statusdb_url = statusdb_config.get("url", None)
        if statusdb_url is None:
            log.warn("No url field found in statusdb configuration section.")
            return False

        # Add email notification
        email = statusdb_config.get("statusdb_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler(
            {
                'email': email,
                'smtp_host': smtp_host,
                'smtp_port': smtp_port
            }, True)

        with log_handler.applicationbound():
            # Tag every record emitted below with the run identifier
            with logbook.Processor(lambda record: record.extra.__setitem__(
                    'run', "%s_%s" % (fc_date, fc_name))):
                log.info(
                    "Started creating QC Metrics report on statusdb for %s_%s on %s"
                    % (fc_date, fc_name, datetime.now().isoformat()))

                # Create object and parse all available metrics; no checking
                # is currently done for missing files
                try:
                    qc_obj = FlowcellQCMetrics(fc_date, fc_name, run_info_yaml,
                                               dirs.get("work", None),
                                               dirs.get("flowcell", None))
                except Exception as e:
                    # Keep the best-effort behaviour, but record why the
                    # object could not be built
                    log.warn("Creating FlowcellQCMetrics object for %s_%s failed: %s"
                             % (fc_date, fc_name, e))
                    qc_obj = None

                if qc_obj is not None:
                    try:
                        # Save data at a sample level
                        log.info("Connecting to server at %s" % statusdb_url)
                        try:
                            couch = couchdb.Server(url="http://%s" %
                                                   statusdb_url)
                        except Exception:
                            log.warn("Connecting to server at %s failed" %
                                     statusdb_url)
                            # Without a server object nothing below can work;
                            # re-raise so the handler further down marks the
                            # run as failed instead of logging "succeeded"
                            # and tripping over the unbound name
                            raise
                        log.info("Connecting to server at %s succeeded" %
                                 statusdb_url)
                        db = couch['qc']
                        # Save samples
                        for s in qc_obj.sample.keys():
                            obj = qc_obj.sample[s]
                            log.info("Saving sample %s" % obj.name())
                            _save_obj(db, obj, statusdb_url)
                        # Save flowcell object
                        _save_obj(db, qc_obj, statusdb_url)
                    except Exception as e:
                        log.warn("Saving QC data to statusdb at %s failed: %s"
                                 % (statusdb_url, e))
                        success = False
                else:
                    log.warn(
                        "Couldn't populate FlowcellQCMetrics object. No QC data written to statusdb for %s_%s"
                        % (fc_date, fc_name))
                    success = False
            if success:
                log.info("QC Metrics report successfully written to statusdb for %s_%s on %s" \
                         % (fc_date, fc_name, datetime.now().isoformat()))
            else:
                log.warn("Encountered exception when writing to statusdb for %s_%s on %s" \
                         % (fc_date, fc_name, datetime.now().isoformat()))

    except Exception as e:
        success = False
        log.warn(
            "Encountered exception when writing QC metrics to statusdb: %s" %
            e)

    return success
示例#35
0
文件: lane.py 项目: senthil10/bcbb
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Turn the items of one lane into per-barcode fastq work units.

    The lane's fastq files are located, optionally screened for phiX
    lane-wise, split by barcode and optionally trimmed.

    Returns a list holding one tuple
    (fastq1, fastq2, item, lane_name, lane_description, dirs, config)
    for every item whose barcode id is present in the barcode split result.
    """
    first_item = lane_items[0]
    lane_name = "%s_%s_%s" % (first_item['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(
        dirs["fastq"], dirs["work"], first_item, fc_name, config=config)

    # Lane-wise phiX screening is driven by the first item's custom settings
    custom_config = _update_config_w_custom(config, first_item)
    lane_algorithm = custom_config["algorithm"]
    wants_phix_filter = lane_algorithm.get("filter_phix", False)
    if wants_phix_filter and lane_algorithm.get("demultiplexed", False):
        # Already demultiplexed material is screened per sample, not per lane
        logger.warn("Will not filter phix lane-wise on already demultiplexed files. "
                    "You will have to specify genomes_filter_out option for each sample")
    elif wants_phix_filter:
        logger.info("Filtering phiX from %s" % lane_name)
        phix_info = {
            "genomes_filter_out": "spiked_phix",
            "description": lane_name
        }
        screened = remove_contaminants(full_fastq1, full_fastq2, phix_info,
                                       lane_name, phix_info["description"],
                                       dirs, custom_config)
        # Continue with the screened files and the lane name reported back
        full_fastq1, full_fastq2, _, lane_name = screened[0][0:4]

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    work_units = []
    for item in lane_items:
        # config is rebound here, so the customised config of one item is the
        # input for the next one.
        # NOTE(review): confirm this carry-over between items is intended.
        config = _update_config_w_custom(config, item)

        # All barcodes can be listed without every one having sequences;
        # items without split files are skipped.
        barcode = item["barcode_id"]
        if barcode not in bc_files:
            continue

        fastq1, fastq2 = bc_files[barcode]

        description = item["description"]
        if item.get("name", "") and config["algorithm"].get(
                "include_short_name", True):
            description = "%s : %s" % (item["name"], description)

        if barcode is None:
            sample_lane_name = lane_name
        else:
            sample_lane_name = lane_name + "_%s" % barcode

        if config["algorithm"].get("trim_reads", False):
            reads = [fq for fq in (fastq1, fastq2) if fq is not None]
            trimmed = brun_trim_fastq(reads, dirs, config)
            fastq1 = trimmed[0]
            if fastq2 is not None:
                fastq2 = trimmed[1]

        work_units.append((fastq1, fastq2, item, sample_lane_name,
                           description, dirs, config))

    return work_units
示例#36
0
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a lane's fastq file(s) into one set of files per barcode.

    multiplex is the list of per-sample dictionaries of the lane. When it
    holds a single entry whose "barcode_id" is None, the input files are
    returned unchanged under the key None.

    Otherwise the external barcode program is run inside
    <work>/<base_name>_barcode unless that directory already exists or the
    "demultiplexed" algorithm option is set; for demultiplexed data the
    per-sample files are looked up through get_fastq_files and a metrics
    file is written from the demultiplexing statistics.

    Returns a dictionary mapping barcode id to a (fastq1, fastq2) tuple for
    every barcode whose first read file exists on disk.
    """
    unmatched_str = "unmatched"
    demultiplexed = config["algorithm"].get("demultiplexed", False)
    if len(multiplex) == 1 and multiplex[0]["barcode_id"] is None:
        return {None: (fastq1, fastq2)}

    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    metrics_file = "%s.bc_metrics" % base_name

    # Work out which files each barcode is expected to end up in
    expected_files = []
    for info in multiplex:
        barcode = info["barcode_id"]
        if demultiplexed:
            # Already demultiplexed data: the sequence files must have been
            # specified in the config
            sample_files = get_fastq_files(dirs["fastq"],
                                           dirs["work"],
                                           info,
                                           "",
                                           config=config)
            expected_files.append((barcode,) + tuple(sample_files))
        else:
            read1 = os.path.join(
                bc_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode, "1"))
            read2 = None
            if fastq2:
                read2 = os.path.join(
                    bc_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode, "2"))
            expected_files.append((barcode, read1, read2))

    if utils.file_exists(bc_dir) or demultiplexed:
        # Nothing to split; the tag file is only generated, in a scratch
        # directory, to obtain the trimming information
        with utils.curdir_tmpdir() as tmp_dir:
            with utils.chdir(tmp_dir):
                _, need_trim = _make_tag_file(multiplex, unmatched_str, config)
    else:
        with file_transaction(bc_dir) as tx_bc_dir:
            with utils.chdir(tx_bc_dir):
                tag_file, need_trim = _make_tag_file(multiplex, unmatched_str,
                                                     config)
                algorithm = config["algorithm"]
                cl = [
                    config["program"]["barcode"], tag_file,
                    "%s_--b--_--r--_fastq.txt" % base_name, fastq1
                ]
                if fastq2:
                    cl.append(fastq2)

                cl.extend([
                    "--mismatch=%s" % algorithm["bc_mismatch"],
                    "--metrics=%s" % metrics_file,
                ])

                bc_read = algorithm["bc_read"]
                if int(bc_read) > 1:
                    cl.append("--read=%s" % bc_read)

                if int(algorithm["bc_position"]) == 5:
                    cl.append("--five")

                if algorithm.get("bc_allow_indels", True) is False:
                    cl.append("--noindel")

                if "bc_offset" in algorithm:
                    cl.append("--bc_offset=%s" % algorithm["bc_offset"])

                subprocess.check_call(cl)

    # Keep only barcodes that actually produced a file, trimming where needed
    out = {}
    for barcode, read1, read2 in expected_files:
        if not os.path.exists(read1):
            continue
        if barcode in need_trim:
            read1, read2 = _basic_trim(read1, read2, need_trim[barcode],
                                       config)
        out[barcode] = (read1, read2)

    if demultiplexed:
        casava_stats = _find_demultiplex_stats_htm(base_name, config)
        if casava_stats:
            bc_metrics = _parse_demultiplex_stats_htm(casava_stats)
        else:
            logger2.warn("Demultiplex_Stats.htm not found! "
                         "Barcode stats will be meaningless.")
            # Placeholder entry so that a metrics file can still be written
            placeholder = {
                "read_count": 0,
                "name": None,
                "barcode_id": None
            }
            bc_metrics = {int(multiplex[0]["lane"]): {None: placeholder}}

        _write_demultiplex_metrics(multiplex, bc_metrics, metrics_file)

    return out
示例#37
0
def create_report_on_gdocs(fc_date, fc_name, run_info_yaml, dirs, config):
    """Create reports on gdocs containing both demultiplexed read counts and QC data.

    Arguments:
    fc_date -- flowcell date, used in log messages and for the reports
    fc_name -- flowcell name, used in log messages and for the reports
    run_info_yaml -- path of the run_info YAML file to load
    dirs -- mapping; its "work" and "flowcell" entries (None when absent)
            locate the flowcell data and the QC metrics
    config -- mapping read for the "gdocs_upload" section and for the
              top-level "smtp_host" and "smtp_port" entries; also passed to
              get_credentials

    Returns True when the reports were written without an error. Returns
    False when the credentials or the "gdocs_upload" section are missing,
    or when any step raised an exception; errors are logged, not raised.
    """
    success = True
    try:
        # Inject the fc_date and fc_name in the email subject
        def record_processor(record):
            return record.extra.__setitem__('run', "%s_%s" % (fc_date, fc_name))

        # Parse the run_info.yaml file
        log.debug("Loading this run_info: {}".format(run_info_yaml))
        with open(run_info_yaml, "r") as fh:
            # NOTE(review): yaml.load can construct arbitrary objects; if this
            # file can come from an untrusted source, consider yaml.safe_load
            run_info = yaml.load(fh)

        # Get the gdocs account credentials
        encoded_credentials = get_credentials(config)
        if not encoded_credentials:
            log.warn("Could not find Google Docs account credentials in configuration. "
                     "No sequencing report was written")
            return False

        # Get the required parameters from the post_process.yaml configuration file
        gdocs = config.get("gdocs_upload", None)
        if gdocs is None:
            log.warn("Could not find gdocs_upload section in configuration. "
                     "No sequencing report was written")
            return False

        # Add email notification
        email = gdocs.get("gdocs_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email,
                                          'smtp_host': smtp_host,
                                          'smtp_port': smtp_port}, True)

    except Exception as e:
        log.warn("Encountered exception when writing sequencing report to Google Docs: %s" % e)
        # The code below needs run_info, gdocs and log_handler, any of which
        # may be unbound at this point, so stop here
        return False

    with log_handler.applicationbound(), logbook.Processor(record_processor):
        try:
            log.info("Started creating sequencing report on Google docs for %s_%s on %s" \
                % (fc_date, fc_name, datetime.datetime.now().isoformat()))

            # Get a flowcell object
            fc = Flowcell(fc_name, fc_date, run_info, dirs.get("work", None))

            # Get the GDocs demultiplex result file title
            gdocs_dmplx_spreadsheet = gdocs.get("gdocs_dmplx_file", None)
            # Get the GDocs QC file title
            gdocs_qc_spreadsheet = gdocs.get("gdocs_qc_file", None)

            # FIXME: Make the bc stuff use the Flowcell module
            if gdocs_dmplx_spreadsheet is not None:
                # Upload the data
                bc_metrics.write_run_report_to_gdocs(fc, fc_date, \
                    fc_name, gdocs_dmplx_spreadsheet, encoded_credentials, append=True)
            else:
                log.warn("Could not find Google Docs demultiplex results file "
                         "title in configuration. No demultiplex counts were "
                         "written to Google Docs for %s_%s" % (fc_date, fc_name))

            # Parse the QC metrics
            try:
                qc = RTAQCMetrics(dirs.get("flowcell", None))
            except Exception as e:
                # Best effort: carry on without QC data, but say why
                log.warn("Could not parse the QC metrics, no QC data will be "
                         "written to Google Docs for %s_%s: %s"
                         % (fc_date, fc_name, e))
                qc = None

            if gdocs_qc_spreadsheet is None:
                log.warn("Could not find Google Docs QC file title in configuration. "
                         "No QC data were written to Google Docs "
                         "for %s_%s" % (fc_date, fc_name))
            elif qc is not None:
                qc_metrics.write_run_report_to_gdocs(fc, qc, gdocs_qc_spreadsheet, encoded_credentials)

            # Get the projects parent folder
            projects_folder = gdocs.get("gdocs_projects_folder", None)

            # Write the bc project summary report
            if projects_folder is not None:
                create_project_report_on_gdocs(fc, qc, \
                    encoded_credentials, projects_folder)

        except Exception as e:
            success = False
            log.warn("Encountered exception when writing sequencing report " \
                     "to Google Docs: {}".format(e))

        if success:
            log.info("Sequencing report successfully created on Google " \
                     "docs for {}_{} on {}".format(fc_date, fc_name, datetime.datetime.now().isoformat()))
        else:
            log.warn("Encountered exception when writing sequencing " \
                     "report for %s_%s to Google docs on %s" \
                     % (fc_date, fc_name, datetime.datetime.now().isoformat()))

    return success
示例#38
0
def create_report_on_gdocs(fc_date, fc_name, run_info_yaml, dirs, config):
    """Create reports on gdocs containing both demultiplexed read counts and QC data.

    Arguments:
    fc_date -- flowcell date, used in log messages and for the reports
    fc_name -- flowcell name, used in log messages and for the reports
    run_info_yaml -- path of the run_info YAML file to load
    dirs -- mapping; its "work" and "flowcell" entries (None when absent)
            locate the flowcell data and the QC metrics
    config -- mapping read for the "gdocs_upload" section and for the
              top-level "smtp_host" and "smtp_port" entries; also passed to
              get_credentials

    Returns True when the reports were written without an error. Returns
    False when the credentials or the "gdocs_upload" section are missing,
    or when any step raised an exception; errors are logged, not raised.
    """
    success = True
    try:
        # Inject the fc_date and fc_name in the email subject
        def record_processor(record):
            return record.extra.__setitem__('run',
                                            "%s_%s" % (fc_date, fc_name))

        # Parse the run_info.yaml file
        log.debug("Loading this run_info: {}".format(run_info_yaml))
        with open(run_info_yaml, "r") as fh:
            # NOTE(review): yaml.load can construct arbitrary objects; if this
            # file can come from an untrusted source, consider yaml.safe_load
            run_info = yaml.load(fh)

        # Get the gdocs account credentials
        encoded_credentials = get_credentials(config)
        if not encoded_credentials:
            log.warn(
                "Could not find Google Docs account credentials in configuration. "
                "No sequencing report was written")
            return False

        # Get the required parameters from the post_process.yaml configuration file
        gdocs = config.get("gdocs_upload", None)
        if gdocs is None:
            log.warn(
                "Could not find gdocs_upload section in configuration. "
                "No sequencing report was written")
            return False

        # Add email notification
        email = gdocs.get("gdocs_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email,
                                          'smtp_host': smtp_host,
                                          'smtp_port': smtp_port}, True)

    except Exception as e:
        log.warn(
            "Encountered exception when writing sequencing report to Google Docs: %s"
            % e)
        # The code below needs run_info, gdocs and log_handler, any of which
        # may be unbound at this point, so stop here
        return False

    with log_handler.applicationbound(), logbook.Processor(record_processor):
        try:
            log.info("Started creating sequencing report on Google docs for %s_%s on %s" \
                % (fc_date, fc_name, datetime.datetime.now().isoformat()))

            # Get a flowcell object
            fc = Flowcell(fc_name, fc_date, run_info, dirs.get("work", None))

            # Get the GDocs demultiplex result file title
            gdocs_dmplx_spreadsheet = gdocs.get("gdocs_dmplx_file", None)
            # Get the GDocs QC file title
            gdocs_qc_spreadsheet = gdocs.get("gdocs_qc_file", None)

            # FIXME: Make the bc stuff use the Flowcell module
            if gdocs_dmplx_spreadsheet is not None:
                # Upload the data
                bc_metrics.write_run_report_to_gdocs(fc, fc_date, \
                    fc_name, gdocs_dmplx_spreadsheet, encoded_credentials, append=True)
            else:
                log.warn("Could not find Google Docs demultiplex results file "
                         "title in configuration. No demultiplex counts were "
                         "written to Google Docs for %s_%s" % (fc_date, fc_name))

            # Parse the QC metrics
            try:
                qc = RTAQCMetrics(dirs.get("flowcell", None))
            except Exception as e:
                # Best effort: carry on without QC data, but say why
                log.warn("Could not parse the QC metrics, no QC data will be "
                         "written to Google Docs for %s_%s: %s"
                         % (fc_date, fc_name, e))
                qc = None

            if gdocs_qc_spreadsheet is None:
                log.warn("Could not find Google Docs QC file title in configuration. "
                         "No QC data were written to Google Docs "
                         "for %s_%s" % (fc_date, fc_name))
            elif qc is not None:
                qc_metrics.write_run_report_to_gdocs(fc, qc,
                                                     gdocs_qc_spreadsheet,
                                                     encoded_credentials)

            # Get the projects parent folder
            projects_folder = gdocs.get("gdocs_projects_folder", None)

            # Write the bc project summary report
            if projects_folder is not None:
                create_project_report_on_gdocs(fc, qc, \
                    encoded_credentials, projects_folder)

        except Exception as e:
            success = False
            log.warn("Encountered exception when writing sequencing report " \
                     "to Google Docs: {}".format(e))

        if success:
            log.info("Sequencing report successfully created on Google " \
                     "docs for {}_{} on {}".format(fc_date, fc_name, datetime.datetime.now().isoformat()))
        else:
            log.warn("Encountered exception when writing sequencing " \
                     "report for %s_%s to Google docs on %s" \
                     % (fc_date, fc_name, datetime.datetime.now().isoformat()))

    return success