Example #1
def main(run_name, gdocs_spreadsheet, encoded_credentials_file, run_info_yaml, analysis_dir, archive_dir, gdocs_worksheet, gdocs_projects_folder, append, split_on_project):

    log.info("Processing run: %s" % run_name)
    
    # If not supplied, assume that the configuration file is named run_info.yaml and resides in the archive dir
    if not run_info_yaml:
        run_info_yaml = os.path.join(archive_dir,"run_info.yaml")
        log.info("No configuration file supplied, assuming it is '%s'" % run_info_yaml)
        
    if not os.path.exists(run_info_yaml):
        log.warn("Could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return
    with open(run_info_yaml) as in_handle:
        run_info = {'details': yaml.load(in_handle)}

    # Get the Google Docs credentials
    gdocs_credentials = ""
    if not os.path.exists(encoded_credentials_file):
        log.warn("The Google Docs credentials file could not be found. No demultiplex data was written")
        return
    with open(encoded_credentials_file) as fh:
        gdocs_credentials = fh.read().strip()
    

    fc_name, fc_date = get_flowcell_info(run_name)
    
    # Get the barcode statistics
    bc_metrics = get_bc_stats(fc_date, fc_name, analysis_dir, run_info)

    # Write the report
    write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, gdocs_spreadsheet,
                              gdocs_credentials, gdocs_worksheet, append,
                              split_on_project)

    # Write the bc project summary report
    if gdocs_projects_folder:
        write_project_report_to_gdocs(fc_date, fc_name, bc_metrics,
                                      gdocs_credentials, gdocs_projects_folder)
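Every example on this page calls get_flowcell_info, which splits an Illumina run-folder name into a flowcell id and a run date. Its exact parsing rules vary between bcbb versions; the sketch below is a hypothetical stand-in, not the real implementation, shown only to make the expected return shape concrete, and it assumes run folders named like 110106_SN0_0001_AB017FACXX.

import os
import re

def get_flowcell_info_sketch(run_name):
    # Hypothetical stand-in for bcbb's get_flowcell_info: pull a YYMMDD
    # date and a flowcell id out of a run-folder name such as
    # "110106_SN0_0001_AB017FACXX". The real parser differs per version.
    base = os.path.basename(os.path.normpath(run_name))
    match = re.match(r"(\d{6})_([A-Za-z0-9]+)_(\d+)_(\S+)$", base)
    if match is None:
        raise ValueError("Did not find flowcell info in %s" % base)
    fc_date, _, _, fc_name = match.groups()
    return fc_name, fc_date

print(get_flowcell_info_sketch("110106_SN0_0001_AB017FACXX"))
# -> ('AB017FACXX', '110106')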
Example #2
    def _make_bc_metrics(self, runname, analysisdir):
        """Parses the run_info and generates lane folders and barcode metrics corresponding to the lanes and barcodes used"""
        fc_name, fc_date = get_flowcell_info(runname)
        barcode_dir_suffix = "_%s_%s_barcode" % (fc_date, fc_name)

        for lane in self.run_info:
            lane_name = str(lane['lane'])
            bc_dir = os.path.join(analysisdir, "%s%s" % (lane_name, barcode_dir_suffix))

            # Create the directory if it doesn't exist
            if not os.path.exists(bc_dir):
                os.makedirs(bc_dir)

            # Create, or if it exists, append to the bc_metrics file
            bc_file = os.path.join(bc_dir, "%s_%s_%s_bc.metrics" % (lane_name, fc_date, fc_name))
            with open(bc_file, "a") as fh:
                bcw = UnicodeWriter(fh, dialect='excel-tab')

                # Loop over the barcodes and generate random read counts
                bcs = lane.get("multiplex", [])
                for bc in bcs:
                    bc_id = str(bc['barcode_id'])
                    bc_count = random.randint(1, 10000000)
                    bcw.writerow([bc_id, bc_count])
                # Lastly write some unmatched counts, or in case no multiplex data was given, a 'trim' entry
                if len(bcs):
                    bcw.writerow(['unmatched', random.randint(1, 10000000)])
                else:
                    bcw.writerow(['trim', random.randint(1, 100000000)])
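Examples #2 and #5 (a near-identical later revision; note the "_bc.metrics" vs ".bc_metrics" file name) depend on a UnicodeWriter helper that is not shown. A minimal sketch, assuming it follows the classic csv-module recipe of wrapping csv.writer; the original Python 2 recipe used unicode() and explicit encoding, which the real bcbb helper may handle more carefully.

import csv

class UnicodeWriter(object):
    # Minimal sketch of the UnicodeWriter helper assumed above: a thin
    # wrapper around csv.writer that stringifies each cell before writing.
    def __init__(self, f, dialect=csv.excel, **kwds):
        self.writer = csv.writer(f, dialect=dialect, **kwds)

    def writerow(self, row):
        self.writer.writerow([str(cell) for cell in row])

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)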
Example #3
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_details = []
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        for item in galaxy_info["details"]:
            item["upload"] = {"method": "galaxy", "run_id": galaxy_info["run_id"],
                              "fc_name": fc_name, "fc_date": fc_date}
            run_details.append(item)
    out = []
    for item in run_details:
        item["config"] = config_utils.update_w_custom(config, item)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        item = add_reference_resources(item)
        out.append(item)
    return out
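For orientation, organize expects a dirs mapping with at least a "flowcell" key and returns one enriched dict per run entry. A sketch of the shapes involved; every value here is invented.

# Illustrative shapes only; all values are made up.
dirs = {"flowcell": "/runs/110106_SN0_0001_AB017FACXX",
        "work": "/analysis/work"}

# Each dict returned by organize() carries at least the keys set in the
# loop above (the "upload" key is only added on the Galaxy branch):
item = {"description": "Sample_1",
        "config": {},               # global config merged with per-sample settings
        "dirs": dirs,               # the dirs mapping passed in
        "name": ["", "Sample_1"],   # filled in from description when missing
        "upload": {"method": "galaxy", "run_id": "1",
                   "fc_name": "AB017FACXX", "fc_date": "110106"}}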
Example #4
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if not fastq_dir == fc_dir:  # and not os.path.exists(fastq_dir):

        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                  ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]

            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)

    return fastq_dir
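The conversion runs inside utils.chdir, which these examples use but do not define. A minimal sketch, assuming the usual context manager that temporarily switches the working directory; bcbb's helper may add logging or extra checks.

import contextlib
import os

@contextlib.contextmanager
def chdir(new_dir):
    # Work inside new_dir for the duration of the with-block, restoring
    # the previous working directory even if the block raises.
    cur_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(cur_dir)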
Example #5
    def _make_bc_metrics(self, runname, analysisdir):
        """Parses the run_info and generates lane folders and barcode metrics corresponding to the lanes and barcodes used"""
        fc_name, fc_date = get_flowcell_info(runname)
        barcode_dir_suffix = "_%s_%s_barcode" % (fc_date, fc_name)

        for lane in self.run_info:
            lane_name = str(lane['lane'])
            bc_dir = os.path.join(analysisdir, "%s%s" % (lane_name, barcode_dir_suffix))

            # Create the directory if it doesn't exist
            if not os.path.exists(bc_dir):
                os.makedirs(bc_dir)

            # Create, or if it exists, append to the bc_metrics file
            bc_file = os.path.join(bc_dir, "%s_%s_%s.bc_metrics" % (lane_name, fc_date, fc_name))
            with open(bc_file, "a") as fh:
                bcw = UnicodeWriter(fh, dialect='excel-tab')

                # Loop over the barcodes and generate random read counts
                bcs = lane.get("multiplex", [])
                for bc in bcs:
                    bc_id = str(bc['barcode_id'])
                    bc_count = random.randint(1, 10000000)
                    bcw.writerow([bc_id, bc_count])
                # Lastly write some unmatched counts, or in case no multiplex data was given, a 'trim' entry
                if len(bcs):
                    bcw.writerow(['unmatched', random.randint(1, 10000000)])
                else:
                    bcw.writerow(['trim', random.randint(1, 100000000)])
Example #6
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name = None
    try:
        fc_name, fc_date = get_flowcell_info(fc_dir)
    except ValueError:
        pass
    if isinstance(loaded, dict):
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()
    run_details = []
    for i, item in enumerate(loaded):
        if not item.has_key("lane"):
            item["lane"] = _generate_lane(item["files"], i)
        if not item.has_key("description"):
            item["description"] = str(item["lane"])
        run_details.append(item)
    run_info = dict(details=run_details, run_id="")
    return fc_name, fc_date, run_info
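The _run_info_from_yaml variants accept either a bare list of lane entries or a dict with a "details" key plus optional fc_name/fc_date overrides. A hypothetical minimal run_info.yaml that the version above would accept; field values are invented.

import yaml

run_info_text = """
fc_name: AB017FACXX
fc_date: 110106
details:
  - lane: 1
    description: Sample_1
  - lane: 2
    description: Sample_2
"""
loaded = yaml.safe_load(run_info_text)  # the examples themselves use yaml.load
assert loaded["details"][0]["lane"] == 1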
Example #7
def main(config_file, fc_dir, analysis_dir):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])

    fc_name, fc_date = get_flowcell_info(fc_dir)
    folder_name = "%s_%s" % (fc_date, fc_name)
    run_info = lims_run_details(galaxy_api, fc_name, folder_name)
    for (dl_folder, access_role, dbkey, lane, bc_id, name, desc) in run_info:
        print folder_name, lane, bc_id, name, desc, dl_folder
        library_id = get_galaxy_library(dl_folder, galaxy_api)
        folder, cur_galaxy_files = get_galaxy_folder(library_id, folder_name, name, desc, galaxy_api)
        print "Creating storage directory"
        base_select = "%s_%s" % (lane, folder_name)
        store_dir = move_to_storage(
            lane,
            bc_id,
            folder_name,
            select_upload_files(base_select, bc_id, fc_dir, analysis_dir),
            cur_galaxy_files,
            config,
        )
        if store_dir:
            print "Uploading directory of files to Galaxy"
            print galaxy_api.upload_directory(library_id, folder["id"], store_dir, dbkey, access_role)
    add_run_summary_metrics(analysis_dir, galaxy_api)
Example #8
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    fc_name, fc_date = get_flowcell_info(fc_dir)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])

    # run_info will override some galaxy details, if present
    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        run_info = galaxy_api.run_details(fc_name, fc_date)

    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, fc_name, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name) in run_details:
        library_id = get_galaxy_library(library_name, galaxy_api) if library_name else None
        upload_files = list(select_upload_files(local_name, bc_id, fc_dir, analysis_dir, config))

        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(library_id, base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files, cur_galaxy_files, config)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder["id"], store_dir, dbkey, access_role)
    if galaxy_api:
        add_run_summary_metrics(analysis_dir, galaxy_api)
Example #9
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Convert the supplied CASAVA demultiplex report into bcbb-style
    metric files, based on the configuration in the run_info_file.
    Metric files are written to the workdir
    """
    
    metric_files = []
    metrics = defaultdict(dict)
    for report in casava_report:
        for lane, data in dmx._parse_demultiplex_stats_htm(report).items():
            for sequence, metric in data.items():
                # Assert that we are not overwriting a previously parsed metric
                assert not (lane in metrics and sequence in metrics[lane]), \
                    "Conflicting demultiplex metrics found for lane {} and index {}. " \
                    "This means that there are multiple demultiplex results for the same sample. " \
                    "Please review and rectify before proceeding!".format(lane, sequence)
                metrics[lane][sequence] = metric
                
    with open(run_info_file) as fh:
        info = yaml.load(fh)

        fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
        for item in info:
            metrics_file = "{}_{}_{}.bc_metrics".format(item["lane"], fc_date, fc_name)
            multiplex = item.get("multiplex", [])
            for plex in multiplex:
                plex["lane"] = item["lane"]

            dmx._write_demultiplex_metrics(multiplex, metrics, os.path.join(dirs["work"], metrics_file))
            metric_files.append(metrics_file)
    return metric_files
Example #10
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if not fastq_dir == fc_dir:  # and not os.path.exists(fastq_dir):

        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                  ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]

            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)

    return fastq_dir
Example #11
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name = None
    try:
        fc_name, fc_date = get_flowcell_info(fc_dir)
    except ValueError:
        pass
    if isinstance(loaded, dict):
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()
    run_details = []
    for i, item in enumerate(loaded):
        if not item.has_key("lane"):
            item["lane"] = _generate_lane(item["files"], i)
        if not item.has_key("description"):
            item["description"] = str(item["lane"])
        run_details.append(item)
    lanes = [x["lane"] for x in run_details]
#  WARNING! Commented to figure out a way to fix multiple projects per lane
#    assert len(lanes) == len(set(lanes)), "Non unique lanes: %s" % lanes
    run_info = dict(details=run_details, run_id="")
    return fc_name, fc_date, run_info
Example #12
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Convert the supplied CASAVA demultiplex report into bcbb-style
    metric files, based on the configuration in the run_info_file.
    Metric files are written to the workdir
    """

    metric_files = []
    metrics = defaultdict(dict)
    for report in casava_report:
        for lane, data in dmx._parse_demultiplex_stats_htm(report).items():
            for sequence, metric in data.items():
                # Assert that we are not overwriting a previously parsed metric
                assert not (lane in metrics and sequence in metrics[lane]), \
                    "Conflicting demultiplex metrics found for lane {} and index {}. " \
                    "This means that there are multiple demultiplex results for the same sample. " \
                    "Please review and rectify before proceeding!".format(lane, sequence)
                metrics[lane][sequence] = metric

    with open(run_info_file) as fh:
        info = yaml.load(fh)

        fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
        for item in info:
            metrics_file = "{}_{}_{}.bc_metrics".format(
                item["lane"], fc_date, fc_name)
            multiplex = item.get("multiplex", [])
            for plex in multiplex:
                plex["lane"] = item["lane"]

            dmx._write_demultiplex_metrics(
                multiplex, metrics, os.path.join(dirs["work"], metrics_file))
            metric_files.append(metrics_file)
    return metric_files
Example #13
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")

    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    # process each flowcell lane
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = _run_parallel("process_lane", lanes, dirs, config)
    _run_parallel("process_alignment", lane_items, dirs, config)
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = \
                  organize_samples(dirs, fc_name, fc_date, run_items)
    samples = ((n, sample_fastq[n], sample_info[n], bam_files, dirs, config, config_file)
               for n, bam_files in sample_files)
    _run_parallel("process_sample", samples, dirs, config)

    write_metrics(run_info, fc_name, fc_date, dirs)
Example #14
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Checks if there's a suitable SampleSheet.csv present for the run.

    Returns the path to the samplesheet if one is found, None otherwise.
    """
    fc_name, _ = get_flowcell_info(fc_dir)
    sheet_dirs = config.get("samplesheet_directories", [])
    fcid_sheet = {}
    for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
        with utils.chdir(ss_dir):
            for ss in glob.glob("*.csv"):
                fc_ids = _get_flowcell_id(ss, require_single)
                for fcid in fc_ids:
                    if fcid:
                        fcid_sheet[fcid] = os.path.join(ss_dir, ss)

    # difflib handles human errors while entering data on the SampleSheet.
    # Only one best candidate is returned (if any). 0.85 cutoff allows for
    # maximum of 2 mismatches in fcid

    potential_fcids = difflib.get_close_matches(fc_name, fcid_sheet.keys(), 1, 0.85)
    if len(potential_fcids) > 0 and potential_fcids[0] in fcid_sheet:
        return fcid_sheet[potential_fcids[0]]
    else:
        return None
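difflib.get_close_matches(word, possibilities, n, cutoff) returns at most n candidates whose SequenceMatcher ratio is at least cutoff, best match first. A quick illustration with invented flowcell ids; a two-character transposition still clears the 0.85 cutoff.

import difflib

known_fcids = ["AB017FACXX", "B017FACXX", "C0123ACXX"]  # invented ids
print(difflib.get_close_matches("AB017FAXCX", known_fcids, 1, 0.85))
# -> ['AB017FACXX']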
Example #15
def main(config_file, fc_dir):
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    #print "Generating fastq files"
    #all_lanes = [i['lane'] for i in run_info["details"]]
    #short_fc_name = "%s_%s" % (fc_date, fc_name)
    #fastq_dir = generate_fastq(fc_dir, short_fc_name, all_lanes)
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper,
                    ((i, fastq_dir, fc_name, fc_date, config, config_file)
                        for i in run_info["details"]))
        except:
            pool.terminate()
            raise
    else:
        map(_process_wrapper,
            ((i, fastq_dir, fc_name, fc_date, config, config_file)
                for i in run_info["details"]))
    write_metrics(run_info, work_dir, fc_dir, fastq_dir)
Example #16
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml,
                                          config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_details = []
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        for item in galaxy_info["details"]:
            item["upload"] = {
                "method": "galaxy",
                "run_id": galaxy_info["run_id"],
                "fc_name": fc_name,
                "fc_date": fc_date
            }
            run_details.append(item)
    out = []
    for item in run_details:
        item["config"] = config_utils.update_w_custom(config, item)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        item = _add_reference_resources(item)
        out.append(item)
    return out
Example #17
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
            global_config = copy.deepcopy(loaded)
            del global_config["details"]
        loaded = loaded["details"]
    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()
    run_details = []
    for i, item in enumerate(loaded):
        if not item.has_key("lane"):
            item["lane"] = _generate_lane(item["files"], i)
        if not item.has_key("description"):
            item["description"] = str(item["lane"])
        item["description_filenames"] = global_config.get(
            "description_filenames", False)
        run_details.append(item)
    run_info = dict(details=run_details, run_id="")
    return fc_name, fc_date, run_info
Example #18
def get_flowcell(fc_dir, run_info_yaml, config={}):
    # Just get the name of the flowcell directory minus the path
    fc_name, fc_date = get_flowcell_info(os.path.basename(os.path.normpath(fc_dir)))
    with open(run_info_yaml, "r") as fh:
        run_info = yaml.load(fh)

    return Flowcell(fc_name, fc_date, run_info, fc_dir)
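A hypothetical call, assuming a conventionally named run folder with a run_info.yaml inside it; both paths are invented.

# Illustrative only; these paths do not exist outside the example.
fc = get_flowcell("/runs/110106_SN0_0001_AB017FACXX",
                  "/runs/110106_SN0_0001_AB017FACXX/run_info.yaml")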
Example #19
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane,
                       ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                        for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(align_dir,
            fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample, ((name, sample_fastq[name], sample_info[name],
                                         bam_files, work_dir, config, config_file)
                                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
Example #20
def get_flowcell(fc_dir, run_info_yaml, config={}):
    # Just get the name of the flowcell directory minus the path
    fc_name, fc_date = get_flowcell_info(
        os.path.basename(os.path.normpath(fc_dir)))
    with open(run_info_yaml, "r") as fh:
        run_info = yaml.load(fh)

    return Flowcell(fc_name, fc_date, run_info, fc_dir)
Example #21
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if len(item.get("files",
                            [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        # Handle specifying a local directory directly in upload
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=[
                                                      "variantcaller",
                                                      "realign", "recalibrate",
                                                      "phasing", "svcaller"
                                                  ])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
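This later variant additionally honors an "upload" section and a "globals" block whose entries _replace_global_vars substitutes into each sample's algorithm settings. A hypothetical run_info.yaml exercising those paths; key names follow the code above, values are invented.

import yaml

run_info_text = """
fc_name: AB017FACXX
fc_date: 110106
upload:
  dir: ../final
globals:
  my_regions: ../regions.bed
details:
  - files: [sample1_1.fastq, sample1_2.fastq]
    description: Sample_1
    algorithm:
      variant_regions: my_regions
"""
loaded = yaml.safe_load(run_info_text)
# Per the code above, _replace_global_vars would swap the "my_regions"
# reference for "../regions.bed" inside the algorithm section.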
Example #22
def _find_casava_report(fc_dir):
    """Locate the CASAVA demultiplex report under the root directory
    of an illumina flowcell output directory
    """
    
    fc_name, _ = fc.get_flowcell_info(fc_dir)
    
    casava_report_glob = os.path.join(fc_dir, "Unaligned*",
                                      "Basecall_Stats_*{}".format(fc_name[1:]),
                                      "Demultiplex_Stats.htm")
    return glob.glob(casava_report_glob)
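The Basecall_Stats_* component drops the first character of the flowcell name, which (per the comment in the _find_demultiplex_stats_htm examples below) may correspond to the flowcell position that CASAVA omits. An illustration of the resulting pattern, using an invented run directory.

import os

fc_dir = "/runs/110106_SN0_0001_AB017FACXX"  # invented path
fc_name = "AB017FACXX"
print(os.path.join(fc_dir, "Unaligned*",
                   "Basecall_Stats_*{}".format(fc_name[1:]),
                   "Demultiplex_Stats.htm"))
# -> .../Unaligned*/Basecall_Stats_*B017FACXX/Demultiplex_Stats.htm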
Example #23
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
  
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        # Handle specifying a local directory directly in upload
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=["variantcaller", "realign", "recalibrate",
                                                               "phasing", "svcaller"])
        
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
       
        
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Example #24
def _find_casava_report(fc_dir):
    """Locate the CASAVA demultiplex report under the root directory
    of an illumina flowcell output directory
    """

    fc_name, _ = fc.get_flowcell_info(fc_dir)

    casava_report_glob = os.path.join(fc_dir, "Unaligned*",
                                      "Basecall_Stats_*{}".format(fc_name[1:]),
                                      "Demultiplex_Stats.htm")
    return glob.glob(casava_report_glob)
Example #25
def _find_samplesheet(fc_dir):
    """Locate the samplesheet in the root directory
    of an illumina flowcell output directory
    """
    
    fc_name, _ = fc.get_flowcell_info(fc_dir)
    
    for name in (fc_name, fc_name[1:], "SampleSheet"):
        ssheet = os.path.join(fc_dir, "{}.csv".format(name))
        if os.path.exists(ssheet):
            return ssheet 
    return None
Example #26
def get_run_info(fc_dir, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        fc_name, fc_date, run_info = _run_info_from_yaml(fc_dir, run_info_yaml)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(fc_dir)
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    return fc_name, fc_date, _organize_runs_by_lane(run_info)
Example #27
def _find_samplesheet(fc_dir):
    """Locate the samplesheet in the root directory
    of an illumina flowcell output directory
    """

    fc_name, _ = fc.get_flowcell_info(fc_dir)

    for name in (fc_name, fc_name[1:], "SampleSheet"):
        ssheet = os.path.join(fc_dir, "{}.csv".format(name))
        if os.path.exists(ssheet):
            return ssheet
    return None
Example #28
    def test_create_bc_report(self):
        """Create a demultiplex report and upload it to gdocs
        """
        # Parse the config
        config_file = os.path.join(self.data_dir, "post_process.yaml")
        self.config = load_config(config_file)

        # Loop over the runs
        for name in self.runname:
            print "\nProcessing %s" % name
            fc_name, fc_date = get_flowcell_info(name)
            analysisdir = os.path.join(self.workdir, name)
            create_bc_report_on_gdocs(fc_date, fc_name, analysisdir, {'details': self.run_info}, self.config)
Example #29
    def test_create_bc_report(self):
        """Create a demultiplex report and upload it to gdocs
        """
        # Parse the config
        config_file = os.path.join(self.data_dir, "post_process.yaml")
        self.config = load_config(config_file)

        # Loop over the runs
        for name in self.runname:
            print "\nProcessing %s" % name
            fc_name, fc_date = get_flowcell_info(name)
            analysisdir = os.path.join(self.workdir, name)
            assert create_report_on_gdocs(fc_date, fc_name, self.run_info_file, {"work": analysisdir, "flowcell": analysisdir}, self.config), "Report creation failed"
Example #30
File: lane.py Project: vals/bcbb
def get_flowcell_id(run_info, fc_dir, check_bc=True, glob_ext="_fastq.txt"):
    glob_str = None
    for lane in run_info:
        # Build the glob from the lane id; barcoded runs keep their fastq
        # files in per-lane *_barcode subdirectories.
        if check_bc:
            glob_str = "%s_*_barcode/*%s" % (lane['lane'], glob_ext)
        else:
            glob_str = "%s_*%s" % (lane['lane'], glob_ext)
    files = glob.glob(os.path.join(fc_dir, glob_str))
    try:
        (name, date) = get_flowcell_info(os.path.basename(files[0]))
    except:
        raise StandardError("No flowcell information found in " + str(fc_dir))
    return name, date
Example #31
def _find_demultiplex_stats_htm(base_name, config):
    
    try:
        fc_name, _ = get_flowcell_info(base_name)
        basecall_stats_dir = os.path.join(config["analysis"]["base_dir"],
                                          "Basecall_Stats_%s" % fc_name)
        # If directory doesn't exist, try stripping first character from name
        # (which may correspond to flowcell position)
        if not os.path.exists(basecall_stats_dir):
            basecall_stats_dir = os.path.join(config["analysis"]["base_dir"],
                                              "Basecall_Stats_%s" % fc_name[1:])
        
        casava_stats = os.path.join(basecall_stats_dir, "Demultiplex_Stats.htm")
        assert os.path.exists(casava_stats)
        return casava_stats
    except:
        return None
Example #32
File: lane.py Project: hussius/bcbb
def get_flowcell_id(run_info, fc_dir, check_bc=True, glob_ext="_fastq.txt"):
    lane = None
    for info in run_info:
        lane = info.get("lane", "")
    if check_bc:
        glob_str = "%s_*_barcode/*%s" % (lane, glob_ext)
    else:
        glob_str = "%s_*%s" % (lane, glob_ext)
    files = glob.glob(os.path.join(fc_dir, glob_str))
    try:
        (name, date) = get_flowcell_info(os.path.basename(files[0]))
    except:
        raise StandardError("No flowcell information found in " + str(fc_dir))
    return name, date
Example #33
def _generate_fastq(fc_dir):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if not fastq_dir == fc_dir and not os.path.exists(fastq_dir):
        with utils.chdir(os.path.split(fastq_dir)[0]):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                    ",".join(lanes)]
            subprocess.check_call(cl)
    return fastq_dir
Example #34
def _generate_fastq(fc_dir, config):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    if not fastq_dir == fc_dir and not os.path.exists(fastq_dir):
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            lanes = sorted(
                list(set([f.split("_")[1] for f in glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lanes)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
Example #35
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if not item.has_key("lane"):
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if not item.has_key("description"):
            if len(item.get("files",
                            [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=["variantcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Example #36
    def _make_qc_metrics(self, runname, analysisdir):
        """Writes RTA quality data for each read"""

        fc_name, fc_date = get_flowcell_info(runname)
        run_info_file = os.path.join(analysisdir, "RunInfo.xml")
        run_info_xml = ("<RunInfo>"
                        "<Run Id=\"%s\" Number=\"%s\">"
                        "<Flowcell>%s</Flowcell>"
                        "<Instrument>SN0000</Instrument>"
                        "<Date>%s</Date>"
                        "<Reads>"
                        "<Read Number=\"1\" NumCycles=\"101\" IsIndexedRead=\"N\" />"
                        "<Read Number=\"2\" NumCycles=\"7\" IsIndexedRead=\"Y\" />"
                        "<Read Number=\"3\" NumCycles=\"101\" IsIndexedRead=\"N\" />"
                        "</Reads>"
                        "<FlowcellLayout LaneCount=\"8\" SurfaceCount=\"2\" "
                        "SwathCount=\"3\" TileCount=\"8\" />"
                        "<AlignToPhiX>"
                        "<Lane>1</Lane><Lane>2</Lane><Lane>3</Lane><Lane>4</Lane>"
                        "<Lane>5</Lane><Lane>6</Lane><Lane>7</Lane><Lane>8</Lane>"
                        "</AlignToPhiX>"
                        "</Run></RunInfo>" % (runname, 1, fc_name, fc_date))
        xmlobj = xml.etree.ElementTree.fromstring(run_info_xml)
        xml.etree.ElementTree.ElementTree(xmlobj).write(run_info_file, "utf-8", True)

        qc_dir = os.path.join(analysisdir, "Data", "reports", "Summary")
        # Create the directory if it doesn't exist
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        for read in (1, 2, 3):
            xmlfile = os.path.join(qc_dir, "read%s.xml" % read)
            xmlobj = xml.etree.ElementTree.fromstring(read_qc[read - 1])
            xml.etree.ElementTree.ElementTree(xmlobj).write(xmlfile, "utf-8", True)
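The helper serializes RunInfo.xml through xml.etree.ElementTree; the positional arguments to write are the target file, the encoding, and the xml_declaration flag. To make the embedded structure easier to inspect, a small sketch that reads such a file back; the path assumes the analysisdir used above.

import xml.etree.ElementTree as ET

# List (read number, cycle count, indexed?) for each <Read> element,
# mirroring the structure of the XML literal above.
tree = ET.parse("RunInfo.xml")  # i.e. os.path.join(analysisdir, "RunInfo.xml")
for read in tree.findall(".//Reads/Read"):
    print((read.get("Number"), read.get("NumCycles"), read.get("IsIndexedRead")))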
Example #37
def _generate_fastq(fc_dir, config):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    if not fastq_dir == fc_dir and not os.path.exists(fastq_dir):
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                    ",".join(lanes)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
Example #38
def _find_demultiplex_stats_htm(base_name, config):

    try:
        fc_name, _ = get_flowcell_info(base_name)
        basecall_stats_dir = os.path.join(config["analysis"]["base_dir"],
                                          "Basecall_Stats_%s" % fc_name)
        # If directory doesn't exist, try stripping first character from name (which may correspond to flowcell position)
        if not os.path.exists(basecall_stats_dir):
            basecall_stats_dir = os.path.join(
                config["analysis"]["base_dir"],
                "Basecall_Stats_%s" % fc_name[1:])

        casava_stats = os.path.join(basecall_stats_dir,
                                    "Demultiplex_Stats.htm")
        assert os.path.exists(casava_stats)
        return casava_stats
    except:
        return None
Example #39
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if not item.has_key("lane"):
            item["lane"] = str(i+1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if not item.has_key("description"):
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i+1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=["variantcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Example #40
def main(config_file, fc_dir):
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper,
                    ((i, fastq_dir, fc_name, fc_date, config, config_file)
                        for i in run_info["details"]))
        except:
            pool.terminate()
            raise
    else:
        map(_process_wrapper,
            ((i, fastq_dir, fc_name, fc_date, config, config_file)
                for i in run_info["details"]))
Example #41
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()
    run_details = []
    for i, item in enumerate(loaded):
        if not item.has_key("lane"):
            if item.has_key("description"):
                item["lane"] = item["description"]
            elif item.has_key("files"):
                item["lane"] = _generate_lane(item["files"], i)
            else:
                raise ValueError("Unable to generate lane info for input %s" % item)
        if not item.has_key("description"):
            item["description"] = str(item["lane"])
        item["description_filenames"] = global_config.get("description_filenames", False)
        upload = global_config.get("upload")
        if upload:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        item["upload"] = upload
        run_details.append(item)
    run_info = dict(details=run_details, run_id="")
    return fc_name, fc_date, run_info
Example #42
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if loaded.has_key("fc_name") and loaded.has_key("fc_date"):
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()
    run_details = []
    for i, item in enumerate(loaded):
        if not item.has_key("lane"):
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if not item.has_key("description"):
            item["description"] = str(item["lane"])
        item["description"] = _clean_characters(str(item["description"]))
        item["description_filenames"] = global_config.get(
            "description_filenames", False)
        upload = global_config.get("upload")
        if upload:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        item["upload"] = upload
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    run_info = dict(details=run_details, run_id="")
    return fc_name, fc_date, run_info
Example #43
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Convert the supplied CASAVA demultiplex report into bcbb-style
    metric files, based on the configuration in the run_info_file.
    Metric files are written to the workdir
    """
    
    metric_files = []
    metrics = dmx._parse_demultiplex_stats_htm(casava_report)    
    with open(run_info_file) as fh:
        info = yaml.load(fh)

        fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
        for item in info:
            metrics_file = "{}_{}_{}.bc_metrics".format(item["lane"], fc_date, fc_name)
            multiplex = item.get("multiplex", [])
            for plex in multiplex:
                plex["lane"] = item["lane"]

            dmx._write_demultiplex_metrics(multiplex, metrics, os.path.join(dirs["work"], metrics_file))
            metric_files.append(metrics_file)
    return metric_files
Example #44
def main(config_file, fc_dir, run_info_yaml=None):
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        # fc_name must already be set for this lookup
        run_info = galaxy_api.run_details(fc_name)
    run_items = _add_multiplex_to_control(run_info["details"])
    fastq_dir = get_fastq_dir(fc_dir)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    pool = (Pool(config["algorithm"]["num_cores"])
            if config["algorithm"]["num_cores"] > 1 else None)
    map_fn = pool.map if pool else map
    try:
        map_fn(_process_lane_wrapper,
                ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                    for i in run_items))
    except:
        if pool:
            pool.terminate()
        raise
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(align_dir,
            fastq_dir, work_dir, fc_name, fc_date, run_items)
    try:
        map_fn(_process_sample_wrapper,
          ((name, sample_fastq[name], sample_info[name], bam_files, work_dir,
              config, config_file) for name, bam_files in sample_files))
    except:
        if pool:
            pool.terminate()
        raise
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
Example #45
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
        galaxy_api = None
    else:
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)

    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, fc_name, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc,
         local_name) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(
            select_upload_files(local_name, bc_id, fc_dir, analysis_dir,
                                config))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(
                    library_id, base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name,
                                        upload_files, cur_galaxy_files, config)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey,
                                                  access_role)
    if galaxy_api:
        add_run_summary_metrics(analysis_dir, galaxy_api)
Example #46
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Checks if there's a suitable SampleSheet.csv present for the run.

    Returns the path to the samplesheet if one is found, None otherwise.
    """
    fc_name, _ = get_flowcell_info(fc_dir)
    sheet_dirs = config.get("samplesheet_directories", [])
    fcid_sheet = {}
    for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
        with utils.chdir(ss_dir):
            for ss in glob.glob("*.csv"):
                fc_ids = _get_flowcell_id(ss, require_single)
                for fcid in fc_ids:
                    if fcid:
                        fcid_sheet[fcid] = os.path.join(ss_dir, ss)

    # The pipeline leaves the flowcell position in the name, so account for that
    for fcid in [fc_name, fc_name[1:]]:
        if fcid in fcid_sheet:
            return fcid_sheet[fcid]
    
    return None
Example #47
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Checks if there's a suitable SampleSheet.csv present for the run
    """
    fc_name, _ = get_flowcell_info(fc_dir)
    sheet_dirs = config.get("samplesheet_directories", [])
    fcid_sheet = {}
    for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
        with utils.chdir(ss_dir):
            for ss in glob.glob("*.csv"):
                fc_ids = _get_flowcell_id(ss, require_single)
                for fcid in fc_ids:
                    if fcid:
                        fcid_sheet[fcid] = os.path.join(ss_dir, ss)
    # difflib handles human errors while entering data on the SampleSheet.
    # Only one best candidate is returned (if any). 0.85 cutoff allows for
    # maximum of 2 mismatches in fcid

    potential_fcids = difflib.get_close_matches(fc_name, fcid_sheet.keys(), 1,
                                                0.85)
    if potential_fcids and potential_fcids[0] in fcid_sheet:
        return fcid_sheet[potential_fcids[0]]
    return None
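As a point of reference for the cutoff, here is a standalone sketch (flowcell IDs hypothetical) of how difflib.get_close_matches behaves at 0.85: SequenceMatcher's ratio is 2*M/T over the combined length of both strings, so a single mistyped character in a nine-character fcid still passes, while two do not.

import difflib

# Hypothetical fcid -> samplesheet path mapping.
fcid_sheet = {"B037RACXX": "/srv/samplesheets/B037RACXX.csv",
              "C064KACXX": "/srv/samplesheets/C064KACXX.csv"}

# One typo: ratio 2*8/18 ~ 0.89, above the 0.85 cutoff.
print(difflib.get_close_matches("B037RACXY", fcid_sheet.keys(), 1, 0.85))
# -> ['B037RACXX']

# Two typos: ratio 2*7/18 ~ 0.78, below the cutoff, so no match.
print(difflib.get_close_matches("B037RACYY", fcid_sheet.keys(), 1, 0.85))
# -> []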
Example No. 51
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Convert the supplied CASAVA demultiplex report into bcbb-style
    metric files, based on the configuration in the run_info_file.
    Metric files are written to the workdir.
    """

    metric_files = []
    metrics = dmx._parse_demultiplex_stats_htm(casava_report)
    with open(run_info_file) as fh:
        info = yaml.load(fh)

        fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
        for item in info:
            metrics_file = "{}_{}_{}.bc_metrics".format(
                item["lane"], fc_date, fc_name)
            multiplex = item.get("multiplex", [])
            for plex in multiplex:
                plex["lane"] = item["lane"]

            dmx._write_demultiplex_metrics(
                multiplex, metrics, os.path.join(dirs["work"], metrics_file))
            metric_files.append(metrics_file)
    return metric_files
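For orientation, a minimal sketch of the run_info shape the loop above consumes (values hypothetical): each item carries a lane and an optional multiplex list, and the metrics file is named from lane, date and flowcell name.

# Hypothetical run_info entries of the shape _casava_report_to_metrics expects.
info = [{"lane": "1",
         "multiplex": [{"barcode_id": 1, "name": "sample_A"},
                       {"barcode_id": 2, "name": "sample_B"}]},
        {"lane": "2"}]  # lane without multiplex data

fc_date, fc_name = "110106", "B037RACXX"
for item in info:
    # Mirrors the naming above, e.g. "1_110106_B037RACXX.bc_metrics"
    print("{}_{}_{}.bc_metrics".format(item["lane"], fc_date, fc_name))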
Example No. 52
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" %
                 run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir,
                                            fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane,
                       ((i, fastq_dir, fc_name, fc_date, align_dir, config,
                         config_file) for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample,
                       ((name, sample_fastq[name], sample_info[name],
                         bam_files, work_dir, config, config_file)
                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
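utils.cpmap itself is not shown in this listing; assuming it is a context-managed parallel map over a worker pool, a rough standard-library equivalent (a sketch, not bcbb's actual implementation) could look like this:

import contextlib
import multiprocessing

@contextlib.contextmanager
def cpmap(cores=1):
    # Yield a map-like callable running on `cores` worker processes; fall back
    # to the builtin map for a single core. Workers are cleaned up on exit.
    if int(cores) == 1:
        yield map
    else:
        pool = multiprocessing.Pool(int(cores))
        try:
            yield pool.imap_unordered
        finally:
            pool.terminate()

Each work item above is a single tuple, so the mapped function (process_lane, process_sample) is expected to unpack its arguments from that tuple.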
Example No. 53
def main(run_id, config_file, run_info_file=None, dryrun=False):

    assert run_id, \
        "No run id was specified"
    assert os.path.exists(config_file), \
        "The configuration file, {}, could not be found".format(config_file)

    config = load_config(config_file)
    assert "gdocs_upload" in config, \
    "The configuration file, {}, has no section specifying the Google docs details".format(config_file)

    analysis_cfg = config.get("analysis", {})
    if "store_dir" in analysis_cfg:
        archive_dir = os.path.join(analysis_cfg["store_dir"], run_id)
    else:
        archive_dir = os.getcwd()

    analysis_dir = None
    if "base_dir" in analysis_cfg:
        analysis_dir = os.path.join(analysis_cfg["base_dir"], run_id)
    if analysis_dir is None or not os.path.exists(analysis_dir):
        analysis_dir = tempfile.mkdtemp()

    dirs = {
        "work": os.path.normpath(analysis_dir),
        "flowcell": os.path.normpath(archive_dir)
    }
    assert os.path.exists(dirs["flowcell"]), \
    "The flowcell directory, {}, could not be found".format(dirs["flowcell"])
    assert os.path.exists(dirs["work"]), \
    "The work directory, {}, could not be found".format(dirs["work"])

    if run_info_file is None:
        run_info_file = os.path.join(dirs["flowcell"], "run_info.yaml")

        if not os.path.exists(run_info_file):
            # Locate the samplesheet and convert to yaml
            samplesheet = _find_samplesheet(dirs["flowcell"])
            assert samplesheet, \
                "Could not locate samplesheet in {}, aborting..".format(dirs["flowcell"])
            fh, run_info_file = tempfile.mkstemp()
            os.close(fh)
            run_info_file = ssheet.csv2yaml(samplesheet, run_info_file)

    assert os.path.exists(run_info_file), \
        "The run info configuration file, {}, could not be found".format(run_info_file)

    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
    # If we have no bc_metrics files in the workdir, we may be looking at a Casava run.
    # In that case, attempt to parse the Demultiplex_Stats.htm file and create bc_metrics files
    metric_files = glob.glob(
        os.path.join(dirs["work"], "*_barcode", "*bc[_.]metrics")) + glob.glob(
            os.path.join(dirs["work"], "*bc[_.]metrics"))
    if len(metric_files) == 0:
        casava_report = _find_casava_report(dirs["flowcell"])
        assert len(casava_report) > 0, \
            "Could not locate CASAVA demultiplex report in {}, aborting..".format(dirs["flowcell"])
        metric_files = _casava_report_to_metrics(run_info_file, casava_report,
                                                 dirs)

    assert len(metric_files) > 0, \
        "Could not locate or create required metric files, aborting.."

    print(
        "A report will be created on Google Docs based on the demultiplexed data in {}"
        .format(dirs["work"]))
    print("The configuration file is {0} and the run info file is {1}".format(
        config_file, run_info_file))
    print("The run was started on {0} and has flowcell id {1}".format(
        fc_date, fc_name))

    if not dryrun:
        create_report_on_gdocs(fc_date, fc_name, run_info_file, dirs, config)
    else:
        print("DRY-RUN: nothing uploaded")
Example No. 54
def generate_report(proj_conf):

    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs
    ###
    uppnex_proj = ''
    min_reads_per_sample = ''
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
        project_id = proj_data.project_id
        queue_date = proj_data.queue_date
        no_samples = proj_data.no_samples
        lanes_plates = proj_data.lanes_plates
        min_reads_per_sample = proj_data.min_reads_per_sample
        customer_reference = proj_data.customer_reference
        application = proj_data.application
        no_finished_samples = proj_data.no_finished_samples
    except Exception:
        print("WARNING: Could not fetch metadata from Google Docs")

    d = {
        'project_id': proj_conf['id'],
        'latex_opt': "",
        'summary': "",
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        'qualscale': proj_conf['qual_scale'],
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n   \\setcounter{totalnumber}{8}'
    d.update(latex_opt=floats_per_page)

    ## General info table
    tab = Texttable()
    if not uppnex_proj or not uppnex_proj.startswith('b201'):
        uppnex_proj = "b201YXXX"
        print("WARNING: Could not find UPPNEX project")

    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + "_" + run_name_comp[3]
    proj_level_dir = fixProjName(proj_conf['id'])
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    del_base = "/proj/"
    proj_id = proj_conf['id']
    try:
        if len(customer_reference) > 1:
            proj_id += ' (' + customer_reference + ')'
    except Exception:
        # customer_reference is undefined if the metadata fetch above failed
        pass

    if len(proj_id) > 30:
        print("Project ID + customer reference too long:", proj_id)
    tab.add_rows([["Project id:", proj_id], 
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", del_base + uppnex_proj + "/INBOX/" + proj_level_dir + "/" + simple_run_name]])
    d.update(infotable=tab.draw())

    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        main_proj = l['description'].split(',')[1].strip()
        samples = []
        if 'multiplex' in l:
            for mp in l['multiplex']:
                if 'sample_prj' in mp:
                    if mp['sample_prj'] == proj_conf['id']:
                        samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())

    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r2.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters", "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters", "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])

    # These should be moved to a cfg file. ( + perhaps provide an alternative for v1.5 FC )
    if (options.v1_5_fc):
        min_clupf = 300
    else:
        min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0  # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_err_rate = True
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:

        # Cluster densities
        clu_dens_r1 = stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 = stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 = stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 = stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1)
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2)

        # Cluster PF densities
        clu_dens_pf_r1 = stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 = stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 = stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 = stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 = stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 = stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 = stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 = stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)

        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        if float(clu_dens_pf_r1[:-1]) < min_clupf:
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf:
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        avg_error_rate = (float(err_r1) + float(err_r2)) / 2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            comm_r2 += "High error rate. "
            ok_err_r2 = False

        if comm_r1 == "":
            comm_r1 = "OK"
        if comm_r2 == "":
            comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1, clu_dens_pf_string_r1, phas_string_r1, aln_string_r1, err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2, clu_dens_pf_string_r2, phas_string_r2, aln_string_r2, err_str_r2, comm_r2])

    # Reinitialize the comments for the summary, which may cover several lanes
    comm_r1 = ""
    comm_r2 = ""

    if not ok_cludens_r1:
        comm_r1 += "Low cluster density. "
    if not ok_cludens_r2:
        comm_r2 += "Low cluster density. "
    if not ok_err_rate:
        if not ok_err_r1:
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2:
            ok_r2 = False
            comm_r2 += "High error rate. "

    if ok_r1 and ok_r2:
        comm_r1 = comm_r2 = "OK"
        d.update(summary="Successful run in terms of error rate. ")
    elif ok_r1:
        comm_r1 = "OK"
        d.update(summary="Read 2 did not pass quality criteria: " + comm_r2)
    elif ok_r2:
        comm_r2 = "OK"
        d.update(summary="Read 1 did not pass quality criteria: " + comm_r1)
    else:
        d.update(summary="Did not pass quality criteria. Read 1: " + comm_r1 + " Read 2: " + comm_r2)

    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())

    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots="\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots="\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):
        target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane', 'Sample', 'Number of sequences', 'Million sequences ordered', 'Comment'])

    run_info_yaml = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "run_info.yaml")

    if not os.path.exists(run_info_yaml):
        print("WARNING: could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return

    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False

    bc_multiplier = 0.75  # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        bc_file_name_prefix = os.path.join(
            proj_conf['analysis_dir'], proj_conf['flowcell'],
            '_'.join([l['lane'], fc_date, fc_name, "nophix_barcode"]),
            '_'.join([l['lane'], fc_date, fc_name, "nophix"]))
        bc_file = bc_file_name_prefix + ".bc_metrics"
        if not os.path.exists(bc_file):
            bc_file = bc_file_name_prefix + "_bc.metrics"
        try:
            bc_file = open(bc_file)
        except IOError:
            sys.exit("Could not find bc metrics file " + bc_file)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
        no_samples = len(bc_count) - 1
        if no_samples == 0:
            print("WARNING: did not find a BC metrics file... Skipping lane %s for %s" % (l['lane'], proj_conf['id']))
            continue

        target_yield_per_sample = ''
        try:
            min_reads_per_sample = round(float(str(min_reads_per_sample)))
            target_yield_per_sample = min_reads_per_sample * 1000000
        except ValueError:
            min_reads_per_sample = ''
            target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples

        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                projs = set()
                if 'multiplex' in entry:
                    for sample in entry['multiplex']:
                        if 'sample_prj' in sample:
                            projs.add(sample['sample_prj'])
                            if sample['sample_prj'].strip() == proj_conf['id']:
                                sample_name[sample['barcode_id']] = sample['name']
                else:
                    is_multiplexed = False
                if len(projs) > 1:
                    is_rerun = True
        samp_count = {}

        for k in bc_count.keys():
            if k.isdigit() and int(k) in sample_name:
                samp_count[sample_name[int(k)]] = bc_count[k]

        print "DEBUG: Target yield per sample = ", target_yield_per_sample
        print "DEBUG: Min reads per sample = ", min_reads_per_sample
        print "DEBUG: No samples: ", no_samples

        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample:
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else:
                ok_samples.append(k)
            if is_rerun:
                comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], min_reads_per_sample, comment])

        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample:
                    comment = 'High.'
                if is_rerun:
                    comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'], min_reads_per_sample, comment])
            except (KeyError, ValueError):
                print('WARNING: insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane:
                    comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k], min_reads_per_sample, comment])

    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = "Samples " + ", ".join(low_samples) + " yielded fewer sequences than expected. These will be re-run unless this was already a re-run and the total yield is now sufficient. "
    else:
        fail_comm = ""

    if low_yield:
        if len(ok_samples) > 0:
            ok_comm = "Samples " + ", ".join(ok_samples) + " yielded the expected number of sequences or more. "
        else:
            ok_comm = ""
    else:
        ok_comm = "All samples yielded the expected number of sequences or more. "

    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary=comm)

    d.update(yieldtable=tab.draw())
    return d
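To make the yield thresholds above concrete, here is a small worked sketch of the per-sample target (constants taken from generate_report, inputs hypothetical): when the project metadata supplies no per-sample minimum, the target falls back to an equal share of 75% of the lane target.

bc_multiplier = 0.75                  # as in generate_report
target_yield_per_lane = 143000000.0   # 60000000.0 for v1.5 flow cells

def per_sample_target(min_reads_per_sample, no_samples):
    # Mirror the fallback above: prefer the ordered per-sample minimum
    # (in millions of reads), otherwise split the discounted lane yield.
    try:
        return round(float(min_reads_per_sample)) * 1000000
    except ValueError:
        return bc_multiplier * target_yield_per_lane / no_samples

print(per_sample_target("10", 8))  # ordered minimum: 10 million reads
print(per_sample_target("", 8))    # fallback: 0.75 * 143e6 / 8 = 13406250.0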
Example No. 55
def main(flowcell_id, qual_scale, archive_dir, analysis_dir, config_file):
    if qual_scale not in ["phred64", "phred33"]:
        sys.exit("You must provide either 'phred64' or 'phred33' as the quality scale! Exiting ...")
    fp = os.path.join(archive_dir, flowcell_id, "run_info.yaml")
    with open(fp) as in_handle:
        run_info = yaml.load(in_handle)
    if config_file:
        config = load_config(config_file)
    else:
        config = {}
    project_ids = dict()
    for lane in run_info:
        (l, proj_id) = [x.strip() for x in lane['description'].split(",")]
        if proj_id in project_ids:
            if lane not in project_ids[proj_id]:
                project_ids[proj_id].append(lane)
        else:
            project_ids[proj_id] = [lane]
        # Check here if project is a "sub project" of the lane
        if 'multiplex' not in lane:
            continue
        for s in lane['multiplex']:
            if 'sample_prj' in s:
                if s['sample_prj'] in project_ids:
                    if lane not in project_ids[s['sample_prj']]:
                        project_ids[s['sample_prj']].append(lane)
                else:
                    project_ids[s['sample_prj']] = [lane]

    sphinx_defs = []
    for k in project_ids.keys():
        lanes = [x['lane'] for x in project_ids[k]]
        fc_name, fc_date = get_flowcell_info(flowcell_id)
        proj_file_tag = k + "_" + fc_date + fc_name[0]
        print("INFO: saw project %s in lanes %s" % (k, ", ".join(lanes)))
        sphinx_defs.append("('%s', '%s_delivery.tex', 'Raw data delivery note', u'SciLifeLab Stockholm', 'howto'),\n" % (proj_file_tag, proj_file_tag))
        projectfile = "%s.mako" % (proj_file_tag)
        with open(projectfile, "w") as fp:
            fp.write(TEMPLATE)
        mylookup = TemplateLookup(directories=['./'])
        tmpl = Template(filename=projectfile, lookup=mylookup)
        proj_conf = {
            'id' : k,
            'lanes' : project_ids[k],
            'archive_dir' : archive_dir, 
            'analysis_dir' : analysis_dir,
            'flowcell' : flowcell_id,
            'config' : config,
            'qual_scale': qual_scale,
            }
        d = generate_report(proj_conf)
        rstfile = "%s.rst" % (proj_file_tag)
        fp = open(rstfile, "w")
        fp.write(tmpl.render(**d))
        fp.close()

    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        print("WARNING: no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        with open(sphinxconf) as fp:
            lines = fp.readlines()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if sd not in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i+3] + sdout + lines[i+3:]
            ## Change the preamble
            i = newconf.index("#'preamble': '',\n")
            newconf = newconf[:i+1] + _latex_preamble() + newconf[i+1:]
            ## Set the logo
            i = newconf.index("#latex_logo = None\n")
            newconf = newconf[:i+1] + _latex_logo() + newconf[i+1:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()
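For orientation, each sphinx_defs string appended above expands into one latex_documents tuple in conf.py. With hypothetical values k = 'J.Doe_11_01', fc_date = '110106' and fc_name = 'AB037RACXX', proj_file_tag becomes 'J.Doe_11_01_110106A' and the resulting entry is:

# Hypothetical expansion of one sphinx_defs entry inside conf.py:
latex_documents = [
    # ... existing documents ...
    ('J.Doe_11_01_110106A', 'J.Doe_11_01_110106A_delivery.tex',
     'Raw data delivery note', u'SciLifeLab Stockholm', 'howto'),
]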