def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if not fastq_dir == fc_dir:  # and not os.path.exists(fastq_dir):
        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                                     glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]
            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)

    return fastq_dir
def finished_message(fn_name, run_module, directory, files_to_copy,
                     config, config_file, pushed=False):
    """Notify that processing is finished by invoking the given remote
    function with the run details.
    """
    logger2.debug("Calling remote function: %s" % fn_name)
    user = getpass.getuser()
    hostname = socket.gethostbyaddr(socket.gethostname())[0]
    data = dict(
        machine_type='illumina',
        hostname=hostname,
        user=user,
        directory=directory,
        to_copy=files_to_copy
    )
    dirs = {"work": os.getcwd(),
            "config": os.path.dirname(config_file)}
    runner = messaging.runner(run_module, dirs, config, config_file, wait=False)
    if pushed:
        config["directory"] = directory
        runner(fn_name, [[config]])
    else:
        runner(fn_name, [[data]])
def hmmscan(**kwargs):
    """Search a protein sequence against a HMMER profile database.
    """
    logging.debug(kwargs)
    args = {'seq': kwargs.get('seq'),
            'hmmdb': kwargs.get('hmmdb')}
    args2 = {'output': 'json'}
    # Avoid shadowing the builtin `range`
    seq_range = kwargs.get('range', None)
    if seq_range:
        args2['range'] = seq_range
    return _hmmer("http://hmmer.janelia.org/search/hmmscan", args, args2)
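# The _hmmer() helper used above is not shown in this section. Below is a
# minimal sketch of what it presumably does, inferred from the
# http_error_302 override at the end of this section: POST the search
# arguments, read the job's results URL from the 302 headers rather than
# following the redirect, then GET the results with the formatting
# arguments. Every name and detail here is an assumption, not the
# original code.
import urllib
import urllib2

def _hmmer(url, args, args2):
    class _KeepRedirect(urllib2.HTTPRedirectHandler):
        # Return the 302 headers instead of following the redirect.
        def http_error_302(self, req, fp, code, msg, headers):
            return headers

    opener = urllib2.build_opener(_KeepRedirect())
    headers = opener.open(urllib2.Request(url, urllib.urlencode(args)))
    results_url = "%s?%s" % (headers['location'], urllib.urlencode(args2))
    return urllib2.urlopen(results_url).read()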
def long_term_storage(remote_info, config_file):
    """Securely copy files from the remote directory to the storage server.

    This requires ssh public keys to be set up so that no password entry
    is necessary; Fabric is used to manage setting up copies on the remote
    storage server.
    """
    config = load_config(config_file)
    logger.info("Copying run data over to remote storage: %s" % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    _copy_for_storage(remote_info, config)
def _calculate_md5(fastq_dir):
    """Calculate the md5sum for the fastq files.
    """
    glob_str = "*_fastq.txt"
    fastq_files = glob.glob(os.path.join(fastq_dir, glob_str))
    md5sum_file = os.path.join(fastq_dir, "md5sums.txt")
    with open(md5sum_file, 'w') as fh:
        for fastq_file in fastq_files:
            logger2.debug("Calculating md5 for %s using md5sum" % fastq_file)
            cl = ["md5sum", fastq_file]
            fh.write(subprocess.check_output(cl))
def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, "configureBclToFastq.out")
    configure_err = os.path.join(fc_dir, "configureBclToFastq.err")
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats:
        cl.append("--ignore-missing-stats")
    if im_bcl:
        cl.append("--ignore-missing-bcl")
    if im_control:
        cl.append("--ignore-missing-control")

    bm = _get_bases_mask(fc_dir)
    if bm is not None:
        cl.extend(["--use-bases-mask", bm])

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            subprocess.check_call(cl, stdout=co, stderr=ce)
            co.close()
            ce.close()
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files "
                          "{:s}, {:s}".format(fc_dir, str(e.returncode),
                                              configure_out, configure_err))
            raise e
def _log_messages(self, log_handler, subject="Test email"):
    try:
        with log_handler.applicationbound():
            with logbook.Processor(lambda record: record.extra.__setitem__('run', subject)):
                logger2.debug("DEBUG record test generated @ %s" % time.strftime("%x - %X"))
                logger2.info("INFO record test generated @ %s" % time.strftime("%x - %X"))
                logger2.notice("NOTICE record test generated @ %s" % time.strftime("%x - %X"))
                logger2.warning("WARNING record test generated @ %s" % time.strftime("%x - %X"))
                logger2.error("ERROR record test generated @ %s" % time.strftime("%x - %X"))
                logger2.critical("CRITICAL record test generated @ %s" % time.strftime("%x - %X"))
    except Exception as e:
        return e
    return None
def phmmer(**kwargs):
    """Search a protein sequence against a HMMER sequence database.

    Arguments:
      seq -- The sequence to search; a Fasta string.
      seqdb -- Sequence database to search against.
      range -- A string range of results to return (e.g. 1,10 for the first ten).
      output -- The output format (defaults to JSON).
    """
    logging.debug(kwargs)
    args = {'seq': kwargs.get('seq'),
            'seqdb': kwargs.get('seqdb')}
    args2 = {'output': kwargs.get('output', 'json'),
             'range': kwargs.get('range')}
    return _hmmer("http://hmmer.janelia.org/search/phmmer", args, args2)
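# Illustrative call to phmmer() above; the sequence fragment and the
# database name are placeholders, not values from the original code.
hits = phmmer(seq=">query\nMKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
              seqdb="swissprot", range="1,10")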
def __init__(self, fc_name, fc_date, data, fc_dir=None):
    # Extract the run_items if we are passed a dictionary
    try:
        log.debug("Try making flowcell with this data:")
        log.debug(data)
        d = data.get('details', [])
        data = d
    except AttributeError:
        pass
    self.set_fc_dir(fc_dir)
    self.set_fc_date(fc_date)
    self.set_fc_name(fc_name)
    self.set_lanes(data)
    # Attempts to set the read counts on creation
    self.set_read_counts()
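# Illustrative construction of a Flowcell from parsed run_info data; the
# flowcell name, date, paths and lane entries below are invented
# placeholders.
run_info = {'details': [{'lane': '1', 'description': 'Control sample'},
                        {'lane': '2', 'description': 'Test sample'}]}
fc = Flowcell("FC12345AAXX", "110601", run_info,
              fc_dir="/data/110601_FC12345AAXX")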
def _copy_from_sequencer(remote_info, config):
    """Get local directory of flowcell info, or copy from sequencer.
    """
    if "fc_dir" in remote_info:
        fc_dir = remote_info["fc_dir"]
        assert os.path.exists(fc_dir)
    else:
        logger.debug("Remote host information: %s" % remote_info)
        c_host_str = _config_hosts(config)
        c_keyfile = config["analysis"].get("copy_keyfile", None)
        with fabric.settings(host_string=c_host_str, key_filename=c_keyfile):
            base_dir = config["store_dir"]
            protocol = config.get("transfer_protocol", None)
            fc_dir = remote_copy(remote_info, base_dir, protocol)
    return fc_dir
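# _config_hosts() is not shown in this section; presumably it assembles
# Fabric's "user@host" string from the analysis configuration. A sketch
# under that assumption (the copy_user/copy_host config keys are guesses):
import getpass

def _config_hosts(config):
    copy_user = config["analysis"].get("copy_user") or getpass.getuser()
    copy_host = config["analysis"].get("copy_host", "localhost")
    return "%s@%s" % (copy_user, copy_host)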
def _clean_qseq(bc_dir, fastq_dir):
    """Remove the temporary qseq files if the corresponding fastq file
    has been created.
    """
    glob_str = "*_1_fastq.txt"
    fastq_files = glob.glob(os.path.join(fastq_dir, glob_str))
    for fastq_file in fastq_files:
        try:
            lane = int(os.path.basename(fastq_file)[0])
        except ValueError:
            continue
        logger2.debug("Removing qseq files for lane %d" % lane)
        glob_str = "s_%d_*qseq.txt" % lane
        for qseq_file in glob.glob(os.path.join(bc_dir, glob_str)):
            try:
                os.unlink(qseq_file)
            except OSError:
                logger2.debug("Could not remove %s" % qseq_file)
def _compress_fastq(fastq_dir, config):
    """Compress the fastq files using gzip.
    """
    glob_str = "*_fastq.txt"
    fastq_files = glob.glob(os.path.join(fastq_dir, glob_str))
    num_cores = config["algorithm"].get("num_cores", 1)
    active_procs = []
    for fastq_file in fastq_files:
        # Sleep for one minute while waiting for an open slot
        while len(active_procs) >= num_cores:
            time.sleep(60)
            active_procs, _ = _process_status(active_procs)
        logger2.debug("Compressing %s using gzip" % fastq_file)
        cl = ["gzip", fastq_file]
        active_procs.append(subprocess.Popen(cl))

    # Wait for the last processes to finish
    while len(active_procs) > 0:
        time.sleep(60)
        active_procs, _ = _process_status(active_procs)
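# A minimal sketch of the _process_status() helper assumed above: it
# splits the supplied subprocess.Popen objects into those still running
# and those that have finished (the caller above discards the second
# value). The exact return shape is an assumption.
def _process_status(procs):
    still_running = [p for p in procs if p.poll() is None]
    finished = [p for p in procs if p.poll() is not None]
    return still_running, finished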
def make_lane_items(info, fc_date, fc_name, dirs, config):
    sample_name = info.get("description", "")
    if (config["algorithm"].get("include_short_name", True) and
            info.get("name", "")):
        sample_name = "%s---%s" % (info.get("name", ""), sample_name)
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", "")
    logger.info("Processing sample: %s; lane %s; reference genome %s; "
                "researcher %s; analysis method %s"
                % (sample_name, info["lane"], genome_build,
                   info.get("researcher", ""), info.get("analysis", "")))
    lane_items = []
    if multiplex:
        logger.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
        mitems = get_multiplex_items(multiplex, info['lane'], dirs['fc_dir'],
                                     fc_name, fc_date)
        for fastq1, fastq2, mlane_name, msample in mitems:
            lane_items.append((fastq1, fastq2, genome_build, mlane_name,
                               msample, dirs, config))
    else:
        # TODO: Not multiplex: what to do?
        pass
    return lane_items
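# An example of the lane `info` mapping consumed by make_lane_items();
# the keys mirror a typical run_info.yaml lane entry, and every value
# here (including the multiplex layout) is an invented placeholder.
info = {"lane": "1", "name": "S1", "description": "Control sample",
        "genome_build": "hg19", "researcher": "J. Doe",
        "analysis": "standard",
        "multiplex": [{"barcode_id": "1", "sequence": "ACGTGT"}]}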
def _generate_fastq_with_casava(fc_dir, config, r1=False):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    unaligned_dir = os.path.join(fc_dir, "Unaligned")
    samplesheet_file = samplesheet.run_has_samplesheet(fc_dir, config)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--sample-sheet", samplesheet_file])
    cl.extend(["--mismatches", str(num_mismatches)])
    options = ["--fastq-cluster-count", "0",
               "--ignore-missing-stats",
               "--ignore-missing-bcl",
               "--ignore-missing-control"]
    cl.extend(options)

    if r1:
        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        subprocess.check_call(cl)

    # Go to <Unaligned> folder
    with utils.chdir(unaligned_dir):
        # Perform make
        cl = ["nohup", "make", "-j", str(num_cores)]
        if r1:
            cl.append("r1")
        logger2.info("Demultiplexing and converting bcl to fastq.gz")
        logger2.debug(cl)
        subprocess.check_call(cl)
    logger2.debug("Done")
def create_report_on_gdocs(fc_date, fc_name, run_info_yaml, dirs, config):
    """Create reports on gdocs containing both demultiplexed read counts
    and QC data.
    """
    success = True
    try:
        # Inject the fc_date and fc_name in the email subject
        def record_processor(record):
            return record.extra.__setitem__('run', "%s_%s" % (fc_date, fc_name))

        # Parse the run_info.yaml file
        log.debug("Loading this run_info: {}".format(run_info_yaml))
        with open(run_info_yaml, "r") as fh:
            run_info = yaml.load(fh)

        # Get the gdocs account credentials
        encoded_credentials = get_credentials(config)
        if not encoded_credentials:
            log.warn("Could not find Google Docs account credentials in "
                     "configuration. No sequencing report was written")
            return False

        # Get the required parameters from the post_process.yaml configuration file
        gdocs = config.get("gdocs_upload", None)

        # Add email notification
        email = gdocs.get("gdocs_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email,
                                          'smtp_host': smtp_host,
                                          'smtp_port': smtp_port}, True)
    except Exception as e:
        log.warn("Encountered exception when writing sequencing report to "
                 "Google Docs: %s" % e)
        # Bail out here: log_handler and run_info are needed below
        return False

    with log_handler.applicationbound(), logbook.Processor(record_processor):
        try:
            log.info("Started creating sequencing report on Google docs for "
                     "%s_%s on %s" % (fc_date, fc_name,
                                      datetime.datetime.now().isoformat()))

            # Get a flowcell object
            fc = Flowcell(fc_name, fc_date, run_info, dirs.get("work", None))

            # Get the GDocs demultiplex result file title
            gdocs_dmplx_spreadsheet = gdocs.get("gdocs_dmplx_file", None)
            # Get the GDocs QC file title
            gdocs_qc_spreadsheet = gdocs.get("gdocs_qc_file", None)

            # FIXME: Make the bc stuff use the Flowcell module
            if gdocs_dmplx_spreadsheet is not None:
                # Upload the data
                bc_metrics.write_run_report_to_gdocs(fc, fc_date, fc_name,
                                                     gdocs_dmplx_spreadsheet,
                                                     encoded_credentials,
                                                     append=True)
            else:
                log.warn("Could not find Google Docs demultiplex results file "
                         "title in configuration. No demultiplex counts were "
                         "written to Google Docs for %s_%s" % (fc_date, fc_name))

            # Parse the QC metrics
            try:
                qc = RTAQCMetrics(dirs.get("flowcell", None))
            except Exception:
                qc = None

            if gdocs_qc_spreadsheet is not None and qc is not None:
                qc_metrics.write_run_report_to_gdocs(fc, qc, gdocs_qc_spreadsheet,
                                                     encoded_credentials)
            else:
                log.warn("Could not find Google Docs QC file title in "
                         "configuration. No QC data were written to Google "
                         "Docs for {}_{}".format(fc_date, fc_name))

            # Get the projects parent folder
            projects_folder = gdocs.get("gdocs_projects_folder", None)

            # Write the bc project summary report
            if projects_folder is not None:
                create_project_report_on_gdocs(fc, qc, encoded_credentials,
                                               projects_folder)
        except Exception as e:
            success = False
            log.warn("Encountered exception when writing sequencing report "
                     "to Google Docs: {}".format(e))

    if success:
        log.info("Sequencing report successfully created on Google "
                 "docs for {}_{} on {}".format(fc_date, fc_name,
                                               datetime.datetime.now().isoformat()))
    else:
        log.warn("Encountered exception when writing sequencing "
                 "report for %s_%s to Google docs on %s"
                 % (fc_date, fc_name, datetime.datetime.now().isoformat()))

    return success
logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED " \ "(exit code {}), please check log files {:s}, {:s}".format(fc_dir, str(e.returncode), configure_out, configure_err)) raise e # Go to <Unaligned> folder with utils.chdir(unaligned_dir): # Perform make cl = ["make", "-j", str(num_cores)] if r1: cl.append("r1") logger2.info("Demultiplexing and converting bcl to fastq.gz") logger2.debug(cl) co = open(casava_out,'w') ce = open(casava_err,'w') try: subprocess.check_call(cl,stdout=co,stderr=ce) co.close() ce.close() except subprocess.CalledProcessError, e: logger2.error("BCL to Fastq conversion for {:s} FAILED " \ "(exit code {}), please check log files {:s}, "\ "{:s}".format(fc_dir, str(e.returncode), casava_out, casava_err)) raise e
def remote_copy(remote_info, base_dir, protocol):
    """Securely copy files between servers.
    """
    fc_dir = base_dir
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    if protocol == "scp" or protocol is None:
        for fcopy in remote_info["to_copy"]:
            target_loc = os.path.join(fc_dir,
                                      os.path.basename(remote_info['directory']),
                                      fcopy)
            if not fabric_files.exists(target_loc):
                target_dir = os.path.dirname(target_loc)
                if not fabric_files.exists(target_dir):
                    fabric.run("mkdir -p %s" % target_dir)
                cl = ["scp", "-r",
                      "%s@%s:%s/%s" % (remote_info["user"],
                                       remote_info["hostname"],
                                       remote_info["directory"], fcopy),
                      target_loc]
                logger.debug(cl)
                fabric.run(" ".join(cl))

    elif protocol == "rsync":
        include = []
        for fcopy in remote_info['to_copy']:
            include.append("--include='%s**/*'" % (fcopy,))
            include.append("--include='%s'" % (fcopy,))
        # By including both these patterns we get the entire directory
        # if a directory is given, or a single file if a single file is
        # given.
        cl = ["rsync", "--checksum", "--archive",
              "--partial", "--progress",
              "--prune-empty-dirs", "--include='*/'",
              " ".join(include), "--exclude='*'",
              "%s@%s:%s" % (remote_info["user"], remote_info["hostname"],
                            remote_info["directory"]),
              fc_dir]
        logger.debug(cl)
        fabric.run(" ".join(cl))

    # Note: rdiff-backup doesn't have the ability to resume a partial
    # transfer, and will instead transfer the backup from the beginning
    # if it detects a partial transfer.
    elif protocol == "rdiff-backup":
        include = []
        for fcopy in remote_info['to_copy']:
            include.append("--include %s/%s" % (remote_info["directory"], fcopy))
        cl = ["rdiff-backup", " ".join(include), "--exclude '**'",
              "%s@%s::%s" % (remote_info["user"], remote_info["hostname"],
                             remote_info["directory"]),
              fc_dir]
        logger.debug(cl)
        fabric.run(" ".join(cl))

    fc_dir = os.path.join(fc_dir, os.path.basename(remote_info['directory']))
    return fc_dir
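# Illustrative invocation of remote_copy() above; all host names, paths
# and the remote_info contents are placeholders. As in _copy_from_sequencer,
# the call is expected to run inside a Fabric host context.
remote_info = {"user": "sequser", "hostname": "seq1.example.com",
               "directory": "/data/110601_FC12345AAXX",
               "to_copy": ["RunInfo.xml", "Data/Intensities/BaseCalls"]}
with fabric.settings(host_string="analysis@store.example.com"):
    fc_dir = remote_copy(remote_info, "/archive", "rsync")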
    # (continuation of the configureBclToFastq error handling in
    # _generate_fastq_with_casava_task below)
        finally:
            co.close()
            ce.close()

    # Go to <Unaligned> folder
    with utils.chdir(unaligned_dir):
        # Perform make
        cl = ["make", "-j", str(num_cores)]
        if r1:
            cl.append("r1")
        logger2.info("Demultiplexing and converting bcl to fastq.gz")
        logger2.debug(cl)
        co = open(casava_out, 'w')
        ce = open(casava_err, 'w')
        try:
            co.write("{}\n".format(" ".join(cl)))
            ce.write("{}\n".format(" ".join(cl)))
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files {:s}, "
                          "{:s}".format(fc_dir, str(e.returncode),
                                        casava_out, casava_err))
            raise e
def _generate_fastq_with_casava_task(args):
    """Perform demultiplexing and generate fastq.gz files for the current
    flowcell using CASAVA (>1.8).
    """
    bp = args.get('bp')
    samples_group = args.get('samples')
    base_mask = samples_group['base_mask']
    samples = samples_group['samples']
    fc_dir = args.get('fc_dir')
    config = args.get('config')
    r1 = args.get('r1', False)
    idx_only = args.get('idx_only', False)
    ss = 'SampleSheet_{bp}bp.csv'.format(bp=str(bp))
    unaligned_folder = 'Unaligned_{bp}bp'.format(bp=str(bp))
    out_file = 'configureBclToFastq_{bp}bp.out'.format(bp=str(bp))
    err_file = 'configureBclToFastq_{bp}bp.err'.format(bp=str(bp))

    # Prepare CL arguments and call configureBclToFastq
    basecall_dir = os.path.join(fc_dir, "Data", "Intensities", "BaseCalls")
    casava_dir = config["program"].get("casava")
    out_dir = config.get("out_directory", fc_dir)
    # Append the flowcell dir to the output directory if different from the run dir
    if out_dir != fc_dir:
        out_dir = os.path.join(out_dir, os.path.basename(fc_dir))
    unaligned_dir = os.path.join(out_dir, unaligned_folder)
    samplesheet_file = os.path.join(fc_dir, ss)
    num_mismatches = config["algorithm"].get("mismatches", 1)
    num_cores = config["algorithm"].get("num_cores", 1)
    im_stats = config["algorithm"].get("ignore-missing-stats", False)
    im_bcl = config["algorithm"].get("ignore-missing-bcl", False)
    im_control = config["algorithm"].get("ignore-missing-control", False)

    # Write to log files
    configure_out = os.path.join(fc_dir, out_file)
    configure_err = os.path.join(fc_dir, err_file)
    casava_out = os.path.join(fc_dir, "bclToFastq_R{:d}.out".format(2 - int(r1)))
    casava_err = os.path.join(fc_dir, "bclToFastq_R{:d}.err".format(2 - int(r1)))

    cl = [os.path.join(casava_dir, "configureBclToFastq.pl")]
    cl.extend(["--input-dir", basecall_dir])
    cl.extend(["--output-dir", unaligned_dir])
    cl.extend(["--mismatches", str(num_mismatches)])
    cl.extend(["--fastq-cluster-count", "0"])
    if samplesheet_file is not None:
        cl.extend(["--sample-sheet", samplesheet_file])
    if im_stats:
        cl.append("--ignore-missing-stats")
    if im_bcl:
        cl.append("--ignore-missing-bcl")
    if im_control:
        cl.append("--ignore-missing-control")
    if base_mask is not None:
        cl.extend(["--use-bases-mask", ','.join(base_mask)])
    if r1:
        cl.append("--force")

    if r1 or idx_only:
        # Create separate samplesheet and folder
        with open(os.path.join(fc_dir, ss), 'w') as fh:
            samplesheet = csv.DictWriter(fh, fieldnames=samples['fieldnames'],
                                         dialect='excel')
            samplesheet.writeheader()
            samplesheet.writerows(samples['samples'])

        # Run configuration script
        logger2.info("Configuring BCL to Fastq conversion")
        logger2.debug(cl)
        co = open(configure_out, 'w')
        ce = open(configure_err, 'w')
        try:
            co.write("{}\n".format(" ".join(cl)))
            ce.write("{}\n".format(" ".join(cl)))
            subprocess.check_call(cl, stdout=co, stderr=ce)
        except subprocess.CalledProcessError as e:
            logger2.error("Configuring BCL to Fastq conversion for {:s} FAILED "
                          "(exit code {}), please check log files "
                          "{:s}, {:s}".format(fc_dir, str(e.returncode),
                                              configure_out, configure_err))
            raise e
        finally:
            co.close()
            ce.close()
def http_error_302(self, req, fp, code, msg, headers):
    logging.debug(headers)
    return headers
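# http_error_302() above reads as an override on a
# urllib2.HTTPRedirectHandler subclass, so that the 302 response to the
# HMMER search POST hands back its headers (which carry the results URL)
# instead of being followed. The enclosing class and its wiring below are
# an assumed reconstruction, not code from this section.
import logging
import urllib2

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        logging.debug(headers)
        return headers

opener = urllib2.build_opener(SmartRedirectHandler())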