Example #1
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if fastq_dir != fc_dir:  # and not os.path.exists(fastq_dir):
        with utils.chdir(basecall_dir):
            lanes = sorted(set(f.split("_")[1]
                               for f in glob.glob("*qseq.txt")))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                  ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]

            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)

    return fastq_dir
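
A hypothetical invocation sketch (the flowcell path and config values are invented; in this pipeline the config normally comes from post_process.yaml):

config = {"postprocess_dir": ""}  # empty string keeps fastq output under the flowcell dir
fastq_dir = _generate_fastq("/data/110815_SN001_0001_AB6247ANXX", config,
                            compress_fastq=True)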
Example #2
def finished_message(fn_name, run_module, directory, files_to_copy,
                     config, config_file, pushed=False):
    """Wait for messages with the give tag, passing on to the supplied handler.
    """
    logger2.debug("Calling remote function: %s" % fn_name)
    user = getpass.getuser()
    hostname = socket.gethostbyaddr(socket.gethostname())[0]
    data = dict(
            machine_type='illumina',
            hostname=hostname,
            user=user,
            directory=directory,
            to_copy=files_to_copy
            )
    dirs = {"work": os.getcwd(),
            "config": os.path.dirname(config_file)}

    runner = messaging.runner(run_module, dirs, config, config_file, wait=False)

    if pushed:
        config["directory"] = directory
        runner(fn_name, [[config]])
    else:
        runner(fn_name, [[data]])
Example #3
def hmmscan(**kwargs):
    """Search a protein sequence against a HMMER profile HMM database."""
    logging.debug(kwargs)
    args = {'seq': kwargs.get('seq'),
            'hmmdb': kwargs.get('hmmdb')}
    args2 = {'output': 'json'}
    seq_range = kwargs.get('range', None)
    if seq_range:
        args2['range'] = seq_range
    return _hmmer("http://hmmer.janelia.org/search/hmmscan", args, args2)
Example #4
def long_term_storage(remote_info, config_file):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be set up so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    config = load_config(config_file)
    logger.info("Copying run data over to remote storage: %s" % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    _copy_for_storage(remote_info, config)
Example #5
def _calculate_md5(fastq_dir):
    """Calculate the md5sum for the fastq files
    """
    glob_str = "*_fastq.txt"
    fastq_files = glob.glob(os.path.join(fastq_dir, glob_str))

    md5sum_file = os.path.join(fastq_dir, "md5sums.txt")
    with open(md5sum_file, 'w') as fh:
        for fastq_file in fastq_files:
            logger2.debug("Calculating md5 for %s using md5sum" % fastq_file)
            cl = ["md5sum", fastq_file]
            fh.write(subprocess.check_output(cl))
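
This helper shells out to the external md5sum binary, so it only works where that tool is installed. A minimal pure-Python alternative using hashlib (a sketch, not part of the original module):

import hashlib

def _md5_digest(path, chunk_size=1024 * 1024):
    # Stream the file in chunks so large fastq files need not fit in memory.
    md5 = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()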
Example #6
def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, "configureBclToFastq.out")
    configure_err = os.path.join(fc_dir, "configureBclToFastq.err")
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats:
        cl.append("--ignore-missing-stats")
    if im_bcl:
        cl.append("--ignore-missing-bcl")
    if im_control:
        cl.append("--ignore-missing-control")

    bm = _get_bases_mask(fc_dir)
    if bm is not None:
        cl.extend(["--use-bases-mask", bm])

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        
        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files {:s}, {:s}"
                          .format(fc_dir, e.returncode, configure_out,
                                  configure_err))
            raise e
        finally:
            co.close()
            ce.close()
Example #7
    def _log_messages(self, log_handler, subject="Test email"):
        try:
            with log_handler.applicationbound():
                with logbook.Processor(lambda record: record.extra.__setitem__('run', subject)):
                    logger2.debug("DEBUG record test generated @ %s" % time.strftime("%x - %X"))
                    logger2.info("INFO record test generated @ %s" % time.strftime("%x - %X"))
                    logger2.notice("NOTICE record test generated @ %s" % time.strftime("%x - %X"))
                    logger2.warning("WARNING record test generated @ %s" % time.strftime("%x - %X"))
                    logger2.error("ERROR record test generated @ %s" % time.strftime("%x - %X"))
                    logger2.critical("CRITICAL record test generated @ %s" % time.strftime("%x - %X"))
        except Exception as e:
            return e
        return None
Example #8
def phmmer(**kwargs):
    """Search a protein sequence against a HMMER sequence database.

    Arguments:
      seq -- The sequence to search, as a FASTA string.
      seqdb -- Sequence database to search against.
      range -- A string range of results to return (e.g. "1,10" for the first ten).
      output -- The output format (defaults to JSON).
    """
    logging.debug(kwargs)
    args = {'seq': kwargs.get('seq'),
            'seqdb': kwargs.get('seqdb')}
    args2 = {'output': kwargs.get('output', 'json'),
             'range': kwargs.get('range')}
    return _hmmer("http://hmmer.janelia.org/search/phmmer", args, args2)
Example #9
    def __init__(self, fc_name, fc_date, data, fc_dir=None):
        # Extract the run_items if we are passed a dictionary
        try:
            log.debug("Try making flowcell with this data:")
            log.debug(data)
            data = data.get('details', [])
        except AttributeError:
            pass

        self.set_fc_dir(fc_dir)
        self.set_fc_date(fc_date)
        self.set_fc_name(fc_name)
        self.set_lanes(data)
        # Attempts to set the read counts on creation
        self.set_read_counts()
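
A hypothetical construction sketch; the flowcell name, date and path are invented, and run_info is assumed to be the parsed contents of run_info.yaml as in the report examples below:

import yaml

with open("run_info.yaml") as fh:
    run_info = yaml.load(fh)
fc = Flowcell("B6247ANXX", "110815", run_info,
              fc_dir="/data/110815_B6247ANXX")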
Example #10
def _copy_from_sequencer(remote_info, config):
    """Get local directory of flowcell info, or copy from sequencer.
    """
    if "fc_dir" in remote_info:
        fc_dir = remote_info["fc_dir"]
        assert os.path.exists(fc_dir)
    else:
        logger.debug("Remote host information: %s" % remote_info)
        c_host_str = _config_hosts(config)
        c_keyfile = config["analysis"].get("copy_keyfile", None)
        with fabric.settings(host_string=c_host_str, key_filename=c_keyfile):
            base_dir = config["store_dir"]
            protocol = config.get("transfer_protocol", None)

            fc_dir = remote_copy(remote_info, base_dir, protocol)

    return fc_dir
Example #11
def _clean_qseq(bc_dir, fastq_dir):
    """Remove the temporary qseq files if the corresponding fastq file
       has been created
    """
    glob_str = "*_1_fastq.txt"
    fastq_files = glob.glob(os.path.join(fastq_dir, glob_str))

    for fastq_file in fastq_files:
        try:
            lane = int(os.path.basename(fastq_file)[0])
        except ValueError:
            continue

        logger2.debug("Removing qseq files for lane %d" % lane)
        glob_str = "s_%d_*qseq.txt" % lane

        for qseq_file in glob.glob(os.path.join(bc_dir, glob_str)):
            try:
                os.unlink(qseq_file)
            except OSError:
                logger2.debug("Could not remove %s" % qseq_file)
Example #12
def _compress_fastq(fastq_dir, config):
    """Compress the fastq files using gzip
    """
    glob_str = "*_fastq.txt"
    fastq_files = glob.glob(os.path.join(fastq_dir, glob_str))
    num_cores = config["algorithm"].get("num_cores", 1)
    active_procs = []
    for fastq_file in fastq_files:
        # Sleep for one minute while waiting for an open slot
        while len(active_procs) >= num_cores:
            time.sleep(60)
            active_procs, _ = _process_status(active_procs)

        logger2.debug("Compressing %s using gzip" % fastq_file)
        cl = ["gzip", fastq_file]
        active_procs.append(subprocess.Popen(cl))

    # Wait for the last processes to finish
    while active_procs:
        time.sleep(60)
        active_procs, _ = _process_status(active_procs)
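
The loop assumes a _process_status helper that is not shown in these examples. A minimal sketch of what it could look like, assuming it simply partitions the Popen objects by whether they have exited:

def _process_status(procs):
    # Hypothetical helper: poll() returns None while a process is still running.
    running = [p for p in procs if p.poll() is None]
    finished = [p for p in procs if p.poll() is not None]
    return running, finished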
Example #13
def make_lane_items(info, fc_date, fc_name, dirs, config):
    sample_name = info.get("description", "")
    if (config["algorithm"].get("include_short_name", True) and
            info.get("name", "")):
        sample_name = "%s---%s" % (info.get("name", ""), sample_name)
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", "")
    logger.info("Processing sample: %s; lane %s; reference genome %s; " \
             "researcher %s; analysis method %s" %
             (sample_name, info["lane"], genome_build,
              info.get("researcher", ""), info.get("analysis", "")))
    lane_items = []
    if multiplex:
        logger.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
        mitems = get_multiplex_items(multiplex, info['lane'], dirs['fc_dir'], fc_name, fc_date)
        for fastq1, fastq2, mlane_name, msample in mitems:
            lane_items.append((fastq1, fastq2, genome_build, mlane_name, msample, dirs, config))
    else:
        # TODO: Not multiplex: what to do?
        pass
    return lane_items
Example #14
def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--sample-sheet", samplesheet_file])
    cl.extend(["--mismatches", str(num_mismatches)])

    options = ["--fastq-cluster-count", "0",
               "--ignore-missing-stats",
               "--ignore-missing-bcl",
               "--ignore-missing-control"]

    cl.extend(options)

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        subprocess.check_call(cl)

    # Go to <Unaligned> folder
    with utils.chdir(unaligned_dir):
        # Perform make
        cl = ["nohup", "make", "-j", str(num_cores)]
        if r1:
            cl.append("r1")

        logger2.info("Demultiplexing and converting bcl to fastq.gz")
        logger2.debug(cl)
        subprocess.check_call(cl)

    logger2.debug("Done")
Example #15
def create_report_on_gdocs(fc_date, fc_name, run_info_yaml, dirs, config):
    """Create reports on gdocs containing both demultiplexed read counts and QC data.
    """
    success = True
    try:
        # Inject the fc_date and fc_name in the email subject
        def record_processor(record):
            return record.extra.__setitem__('run',
                                            "%s_%s" % (fc_date, fc_name))

        # Parse the run_info.yaml file
        log.debug("Loading this run_info: {}".format(run_info_yaml))
        with open(run_info_yaml, "r") as fh:
            run_info = yaml.load(fh)

        # Get the gdocs account credentials
        encoded_credentials = get_credentials(config)
        if not encoded_credentials:
            log.warn("Could not find Google Docs account credentials in "
                     "configuration. No sequencing report was written")
            return False

        # Get the required parameters from the post_process.yaml configuration file
        gdocs = config.get("gdocs_upload", None)

        # Add email notification
        email = gdocs.get("gdocs_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email,
                                          'smtp_host': smtp_host,
                                          'smtp_port': smtp_port}, True)

    except Exception as e:
        # Bail out here: log_handler below would be undefined after this failure
        log.warn("Encountered exception when writing sequencing report to "
                 "Google Docs: %s" % e)
        return False

    with log_handler.applicationbound(), logbook.Processor(record_processor):
        try:
            log.info("Started creating sequencing report on Google docs for %s_%s on %s" \
                % (fc_date, fc_name, datetime.datetime.now().isoformat()))

            # Get a flowcell object
            fc = Flowcell(fc_name, fc_date, run_info, dirs.get("work", None))

            # Get the GDocs demultiplex result file title
            gdocs_dmplx_spreadsheet = gdocs.get("gdocs_dmplx_file", None)
            # Get the GDocs QC file title
            gdocs_qc_spreadsheet = gdocs.get("gdocs_qc_file", None)

            # FIXME: Make the bc stuff use the Flowcell module
            if gdocs_dmplx_spreadsheet is not None:
                # Upload the data
                bc_metrics.write_run_report_to_gdocs(fc, fc_date, fc_name,
                                                     gdocs_dmplx_spreadsheet,
                                                     encoded_credentials,
                                                     append=True)
            else:
                log.warn("Could not find Google Docs demultiplex results file \
                    title in configuration. No demultiplex counts were \
                    written to Google Docs for %s_%s" % (fc_date, fc_name))

            # Parse the QC metrics
            try:
                qc = RTAQCMetrics(dirs.get("flowcell", None))
            except Exception:
                qc = None

            if gdocs_qc_spreadsheet is not None and qc is not None:
                qc_metrics.write_run_report_to_gdocs(fc, qc,
                                                     gdocs_qc_spreadsheet,
                                                     encoded_credentials)
            else:
                log.warn("Could not find Google Docs QC file title in configuration. " \
                         "No QC data were written to Google Docs " \
                         "for %s_%s".format(fc_date, fc_name))

            # Get the projects parent folder
            projects_folder = gdocs.get("gdocs_projects_folder", None)

            # Write the bc project summary report
            if projects_folder is not None:
                create_project_report_on_gdocs(fc, qc, encoded_credentials,
                                               projects_folder)

        except Exception as e:
            success = False
            log.warn("Encountered exception when writing sequencing report "
                     "to Google Docs: {}".format(e))

        if success:
            log.info("Sequencing report successfully created on Google "
                     "docs for {}_{} on {}".format(
                         fc_date, fc_name,
                         datetime.datetime.now().isoformat()))
        else:
            log.warn("Encountered exception when writing sequencing "
                     "report for %s_%s to Google docs on %s" %
                     (fc_date, fc_name,
                      datetime.datetime.now().isoformat()))

    return success
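
A hypothetical minimal config sketch showing the keys read above (all values invented; the real values live in post_process.yaml):

config = {
    "gdocs_upload": {
        "gdocs_email_notification": "admin@example.com",
        "gdocs_dmplx_file": "Demultiplex results",
        "gdocs_qc_file": "QC metrics",
        "gdocs_projects_folder": "Projects",
    },
    "smtp_host": "smtp.example.com",
    "smtp_port": 25,
}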
Example #16
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED " \
                          "(exit code {}), please check log files {:s}, {:s}".format(fc_dir,
                                                                                     str(e.returncode),
                                                                                     configure_out,
                                                                                     configure_err))
            raise e
        
    # Go to <Unaligned> folder
    with utils.chdir(unaligned_dir):
        # Perform make
        cl = ["make", "-j", str(num_cores)]
        if r1:
            cl.append("r1")

        logger2.info("Demultiplexing and converting bcl to fastq.gz")
        logger2.debug(cl)
        
        co = open(casava_out, 'w')
        ce = open(casava_err, 'w')
        try:
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files {:s}, "
                          "{:s}".format(fc_dir, e.returncode,
                                        casava_out, casava_err))
            raise e
        finally:
            co.close()
            ce.close()
Example #17
def remote_copy(remote_info, base_dir, protocol):
    """Securely copy files between servers.
    """
    fc_dir = base_dir

    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    if protocol == "scp" or protocol == None:
        for fcopy in remote_info["to_copy"]:
            target_loc = os.path.join(fc_dir, \
            os.path.basename(remote_info['directory']), fcopy)
            if not fabric_files.exists(target_loc):
                target_dir = os.path.dirname(target_loc)
                if not fabric_files.exists(target_dir):
                    fabric.run("mkdir -p %s" % target_dir)

                cl = [
                    "scp", "-r",
                    "%s@%s:%s/%s" %
                    (remote_info["user"], remote_info["hostname"],
                     remote_info["directory"], fcopy), target_loc
                ]

                logger.debug(cl)
                fabric.run(" ".join(cl))

    elif protocol == "rsync":
        include = []
        for fcopy in remote_info['to_copy']:
            include.append("--include='%s**/*'" % (fcopy, ))
            include.append("--include='%s'" % (fcopy, ))
            # By including both these patterns we get the entire directory
            # if a directory is given, or a single file if a single file is
            # given.

        cl = ["rsync", "--checksum", "--archive", \
                "--partial", "--progress", \
                "--prune-empty-dirs", "--include='*/'", \
                " ".join(include), "--exclude='*'", \
                "%s@%s:%s" % (remote_info["user"], remote_info["hostname"], \
                remote_info["directory"]), fc_dir]

        logger.debug(cl)
        fabric.run(" ".join(cl))

    # Note: rdiff-backup doesn't have the ability to resume a partial transfer,
    # and will instead transfer the backup from the beginning if it detects a
    # partial transfer.
    elif protocol == "rdiff-backup":
        include = []
        for fcopy in remote_info['to_copy']:
            include.append("--include %s/%s" %
                           (remote_info["directory"], fcopy))

        cl = [
            "rdiff-backup", " ".join(include), "--exclude '**'",
            "%s@%s::%s" % (remote_info["user"], remote_info["hostname"],
                           remote_info["directory"]), fc_dir
        ]

        logger.debug(cl)
        fabric.run(" ".join(cl))

    fc_dir = os.path.join(fc_dir, os.path.basename(remote_info['directory']))
    return fc_dir
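
A hypothetical invocation, assuming an active Fabric host context as set up in _copy_from_sequencer above (all values invented):

remote_info = {"user": "sequser",
               "hostname": "sequencer.example.com",
               "directory": "/data/runs/110815_B6247ANXX",
               "to_copy": ["Data", "RunInfo.xml"]}
fc_dir = remote_copy(remote_info, "/archive/flowcells", "rsync")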
Example #18
                                                                                     configure_out,
                                                                                     configure_err))
            raise e
        finally:
            co.close()
            ce.close()

    # Go to <Unaligned> folder
    with utils.chdir(unaligned_dir):
        # Perform make
        cl = ["make", "-j", str(num_cores)]
        if r1:
            cl.append("r1")

        logger2.info("Demultiplexing and converting bcl to fastq.gz")
        logger2.debug(cl)

        co = open(casava_out, 'w')
        ce = open(casava_err, 'w')
        try:
            co.write("{}\n".format(" ".join(cl)))
            ce.write("{}\n".format(" ".join(cl)))
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files {:s}, "
                          "{:s}".format(fc_dir, e.returncode,
                                        casava_out, casava_err))
            raise e
        finally:
            co.close()
            ce.close()
Example #19
def _generate_fastq_with_casava_task(args):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    bp = args.get('bp')
    samples_group = args.get('samples')
    base_mask = samples_group['base_mask']
    samples = samples_group['samples']
    fc_dir = args.get('fc_dir')
    config = args.get('config')
    r1 = args.get('r1', False)
    idx_only = args.get('idx_only', False)
    ss = 'SampleSheet_{bp}bp.csv'.format(bp=str(bp))
    unaligned_folder = 'Unaligned_{bp}bp'.format(bp=str(bp))
    out_file = 'configureBclToFastq_{bp}bp.out'.format(bp=str(bp))
    err_file = 'configureBclToFastq_{bp}bp.err'.format(bp=str(bp))

    # Prepare CL arguments and call configureBclToFastq
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    out_dir = config.get("out_directory", fc_dir)
    # Append the flowcell dir to the output directory if different from the run dir
    if out_dir != fc_dir:
        out_dir = os.path.join(out_dir, os.path.basename(fc_dir))
    unaligned_dir = os.path.join(out_dir, unaligned_folder)
    samplesheet_file = os.path.join(fc_dir, ss)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, out_file)
    configure_err = os.path.join(fc_dir, err_file)
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])

    if im_stats:
        cl.append("--ignore-missing-stats")

    if im_bcl:
        cl.append("--ignore-missing-bcl")

    if im_control:
        cl.append("--ignore-missing-control")

    if base_mask is not None:
        cl.extend(["--use-bases-mask", ','.join(base_mask)])
    if r1:
        cl.append("--force")

    if r1 or idx_only:
        # Create separate samplesheet and folder
        with open(os.path.join(fc_dir, ss), 'w') as fh:
            writer = csv.DictWriter(fh, fieldnames=samples['fieldnames'],
                                    dialect='excel')
            writer.writeheader()
            writer.writerows(samples['samples'])

        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)

        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            co.write("{}\n".format(" ".join(cl)))
            ce.write("{}\n".format(" ".join(cl)))
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files {:s}, {:s}"
                          .format(fc_dir, e.returncode, configure_out,
                                  configure_err))
            raise e
        finally:
Example #20
    def http_error_302(self, req, fp, code, msg, headers):
        logging.debug(headers)
        return headers
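
Returning the headers object from http_error_302 stops urllib2 from following the redirect, so urlopen hands back the 302 response's headers instead of the target page. A sketch of wiring such a handler in (the class name is hypothetical):

import logging
import urllib2

class NoFollow302(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        logging.debug(headers)
        return headers

opener = urllib2.build_opener(NoFollow302())
response = opener.open("http://example.com/some-redirecting-url")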