def get_bases_mask(run_info_xml, sample_sheet_file):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads). Then updates this using the barcode
    information in the sample sheet file.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: name and path of sample sheet file.

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    # NB print is always called with one parenthesised argument so the
    # output is the same under the Python 2 print statement and the
    # print function
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    # Update bases mask from sample sheet, using the 'Index' value of
    # the first sample sheet line as the example barcode
    example_barcode = IlluminaData.get_casava_sample_sheet(
        sample_sheet_file)[0]['Index']
    bases_mask = IlluminaData.fix_bases_mask(bases_mask, example_barcode)
    print("Bases mask: %s (updated for barcode sequence '%s')" %
          (bases_mask, example_barcode))
    return bases_mask
def get_bases_mask(run_info_xml,sample_sheet_file):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads). Then updates this using the barcode
    information in the sample sheet file.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: name and path of sample sheet file.

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    # Single parenthesised argument: valid and equivalent for both the
    # Python 2 print statement and the print function
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    # Update bases mask from sample sheet (barcode is taken from the
    # 'Index' field of the first line)
    example_barcode = IlluminaData.get_casava_sample_sheet(
        sample_sheet_file)[0]['Index']
    bases_mask = IlluminaData.fix_bases_mask(bases_mask,example_barcode)
    print("Bases mask: %s (updated for barcode sequence '%s')" %
          (bases_mask,example_barcode))
    return bases_mask
def get_bases_mask(run_info_xml, sample_sheet_file=None):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads), and optionally updates this using the
    barcode information in the sample sheet file.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: (optional) path to sample sheet file

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    # NB print is always given one parenthesised argument so that the
    # output is identical under Python 2 and Python 3
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    if sample_sheet_file is not None:
        # Update bases mask from sample sheet, using the index
        # sequence from the first data line as the example barcode
        example_barcode = IlluminaData.samplesheet_index_sequence(
            IlluminaData.SampleSheet(sample_sheet_file).data[0])
        if example_barcode is None:
            # No index sequence: treat as an empty barcode
            example_barcode = ""
        if barcode_is_10xgenomics(example_barcode):
            # Bases mask is left untouched in this case
            print("Bases mask: barcode is 10xGenomics sample set ID")
        else:
            bases_mask = IlluminaData.fix_bases_mask(bases_mask,
                                                     example_barcode)
        print("Bases mask: %s (updated for barcode sequence '%s')" %
              (bases_mask, example_barcode))
    return bases_mask
def verify_fastq_generation(ap, unaligned_dir=None, lanes=None,
                            include_sample_dir=False):
    """Check that generated Fastqs match sample sheet predictions

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to do Fastqs verification on
      unaligned_dir (str): explicitly specify the bcl2fastq output
        directory to check
      lanes (list): specify a list of lane numbers (integers) to
        check (others will be ignored)
      include_sample_dir (bool): if True then include a
        'sample_name' directory level when checking for
        bcl2fastq2 outputs, even if one shouldn't be present

    Returns:
      True if outputs match sample sheet, False otherwise.

    Raises:
      Exception: if no bcl2fastq output directory was supplied and
        none is set in the autoprocessor parameters.

    """
    if unaligned_dir is None:
        # Fall back to the directory stored in the parameters
        if ap.params.unaligned_dir is not None:
            unaligned_dir = ap.params.unaligned_dir
        else:
            raise Exception("Bcl2fastq output directory not defined")
    # Single parenthesised argument keeps the output identical under
    # Python 2 and Python 3
    print("Checking bcl2fastq output directory '%s'" % unaligned_dir)
    bcl_to_fastq_dir = os.path.join(ap.analysis_dir, unaligned_dir)
    if not os.path.isdir(bcl_to_fastq_dir):
        # Directory doesn't exist
        return False
    # Make a temporary sample sheet to verify against (timestamped
    # name, restricted to the requested lanes)
    tmp_sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet.verify.%s.csv" % time.strftime("%Y%m%d%H%M%S"))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             tmp_sample_sheet,
                             lanes=lanes)
    # Try to create an IlluminaData object
    try:
        illumina_data = IlluminaData.IlluminaData(
            ap.analysis_dir,
            unaligned_dir=unaligned_dir)
    except IlluminaData.IlluminaDataError as ex:
        # Failed to initialise
        logger.warning("Failed to get information from %s: %s" %
                       (bcl_to_fastq_dir, ex))
        return False
    # Do check
    return IlluminaData.verify_run_against_sample_sheet(
        illumina_data,
        tmp_sample_sheet,
        include_sample_dir=include_sample_dir)
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """
    Collect Fastq files for specified lane

    Loads the bcl2fastq outputs found under 'dirn' and gathers the
    R1 and R2 Fastqs whose lane number matches 'lane', from both
    the projects and (if present) the undetermined reads.

    Arguments:
      dirn (str): path to directory to collect Fastq files from
      lane (int): lane Fastqs must have come from
      unaligned_dir (str): subdirectory of 'dirn' with outputs from
        bcl2fastq

    Returns:
      List: list of Fastqs (for single ended data) or of Fastq pairs
        (for pair ended data); each pair is a single string of the
        form 'R1,R2'.

    Raises:
      Exception: if the data cannot be loaded from 'dirn'.

    """
    try:
        illumina_data = IlluminaData.IlluminaData(
            dirn, unaligned_dir=unaligned_dir)
    except Exception as ex:
        raise Exception("Unable to read fastqs from %s: %s\n" % (dirn, ex))
    paired_end = illumina_data.paired_end
    # Fastqs for the requested lane, keyed by read number
    by_read = {1: [], 2: []}

    def _gather(samples):
        # Append matching R1 then R2 Fastqs for each sample in turn
        for sample in samples:
            for read_number in (1, 2):
                for fastq in sample.fastq_subset(read_number=read_number,
                                                 full_path=True):
                    if IlluminaData.IlluminaFastq(fastq).lane_number == lane:
                        by_read[read_number].append(fastq)

    for project in illumina_data.projects:
        _gather(project.samples)
    if illumina_data.undetermined:
        _gather(illumina_data.undetermined.samples)
    if not paired_end:
        # Single-ended: R1 files in the order they were found
        return by_read[1]
    # Paired-end: match up the sorted R1 and R2 lists
    return ["%s,%s" % (fq1, fq2)
            for fq1, fq2 in zip(sorted(by_read[1]), sorted(by_read[2]))]
def report_info(ap):
    """Generate a general report

    Generates an unstructured report on the contents of the analysis
    directory.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to be reported on

    Returns:
      String with the report text.

    """
    report = []
    report.append("Run reference: %s" % ap.run_reference_id)
    report.append("Directory : %s" % ap.analysis_dir)
    report.append("Platform : %s" % ap.metadata.platform)
    report.append("Unaligned dir: %s" % ap.params.unaligned_dir)
    if ap.readme_file:
        report.append("README.txt found: %s" % ap.readme_file)
    # Only attempt to load the bcl2fastq outputs when an unaligned dir
    # has been recorded. (Testing the path with os.path.exists() when
    # it is None raises TypeError, which previously made the 'else'
    # branch below unreachable.)
    if ap.params.unaligned_dir is not None:
        try:
            illumina_data = ap.load_illumina_data()
            report.append("\nSummary of data in '%s' dir:\n" %
                          ap.params.unaligned_dir)
            for project in illumina_data.projects:
                report.append("- %s" %
                              IlluminaData.describe_project(project))
        except IlluminaData.IlluminaDataError as ex:
            report.append("Failed to load data from %s:" %
                          ap.params.unaligned_dir)
            report.append("%s" % ex)
    else:
        report.append("No information on source fastq data (no unaligned dir "
                      "found)")
    try:
        projects = ap.get_analysis_projects()
        # Singular only when there is exactly one project
        report.append("\n%d analysis project%s:" %
                      (len(projects),
                       "s" if len(projects) != 1 else ""))
    except Exception as ex:
        # Best effort: carry on and report that nothing was found
        projects = []
        report.append("\nNo analysis projects found")
    for project in projects:
        info = project.info
        report.append("\n- %s" % project.name)
        # Underline the project name
        report.append(" %s" % ('-' * len(project.name), ))
        report.append(" User : %s" % info.user)
        report.append(" PI : %s" % info.PI)
        report.append(" Library : %s" % info.library_type)
        report.append(" SC Plat.: %s" % info.single_cell_platform)
        report.append(" Organism: %s" % info.organism)
        report.append(" Dir : %s" % os.path.basename(project.dirn))
        report.append(" #samples: %s" % len(project.samples))
        report.append(" #cells : %s" %
                      default_value(info.number_of_cells))
        report.append(" Samples : %s" % project.prettyPrintSamples())
        report.append(" QC : %s" %
                      ('ok' if verify_qc(project) else 'not verified'))
        report.append(" Comments: %s" % (project.info.comments))
    return '\n'.join(report)
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """Automatically collect Fastq files for specified lane

    Arguments:
      dirn (str): directory to load the bcl2fastq outputs from
      lane: lane of interest
      unaligned_dir (str): subdirectory of 'dirn' passed through
        to IlluminaData when loading the data

    On failure to load the data an error is written to stderr and
    the process exits with status 1.

    """
    # NOTE(review): as written here the function only loads the data
    # and neither uses 'lane' nor returns anything - confirm that the
    # rest of the collection logic hasn't been lost
    try:
        illumina_data = IlluminaData.IlluminaData(
            dirn, unaligned_dir=unaligned_dir)
    except Exception as ex:
        # 'except ... as' is valid for Python 2.6+ and Python 3
        sys.stderr.write("Unable to read fastqs from %s: %s\n" %
                         (dirn, ex))
        sys.exit(1)
def load_illumina_data(self, unaligned_dir=None):
    """
    Load and return an IlluminaData object

    Arguments:
      unaligned_dir (str): bcl2fastq output subdirectory of the
        analysis directory to load; if not supplied then the
        'unaligned_dir' value from the parameters is used

    Returns:
      IlluminaData instance, or None if no unaligned directory
        could be determined (an error is logged in that case).

    """
    target_dir = unaligned_dir
    if target_dir is None:
        # Fall back to the stored parameter value
        target_dir = self.params.unaligned_dir
    if target_dir is not None:
        return IlluminaData.IlluminaData(self.analysis_dir,
                                         unaligned_dir=target_dir)
    logging.error(
        "Unaligned directory not specified, cannot load data")
    return None
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Generates a multi-line report which gives general information
    about the run, plus one-line summaries for each project, plus
    any additional information that has been recorded.

    The general information includes:

    - Platform
    - Run name
    - Run reference id
    - Processing software
    - Assay (i.e. sequencing kit)

    For each project:

    - Project subdirectory
    - Researcher (aka user)
    - PI
    - Application (aka library type)
    - Single cell prep platform (e.g. ICell8)
    - Organism
    - Number of samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to be reported on

    Returns:
      String with the report text.

    """
    # NOTE(review): the body shown here stops after parsing the run
    # name; the code that assembles and returns the report text is
    # not present - confirm against the full definition
    # Default items to report
    report_items = [
        'Run name',
        'Reference',
        'Platform',
        'Directory',
        'Endedness',
        'Bcl2fastq',
    ]
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    # Defaults used if the run name cannot be split into components
    datestamp = None
    instrument = None
    run_number = None
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        # 'except ... as' is valid for Python 2.6+ and Python 3
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
def run_reference_id(run_name, platform=None, facility_run_number=None):
    """Return a run reference id e.g. 'HISEQ_140701/242#22'

    The run reference code is a code that identifies the sequencing
    run, and has the general form:

    PLATFORM_DATESTAMP[/INSTRUMENT_RUN_NUMBER]#FACILITY_RUN_NUMBER

    - PLATFORM is always uppercased e.g. HISEQ, MISEQ, GA2X
    - DATESTAMP is the YYMMDD code e.g. 140701
    - INSTRUMENT_RUN_NUMBER is the run number that forms part of the
      run directory e.g. for '140701_SN0123_0045_000000000-A1BCD'
      it is '45'
    - FACILITY_RUN_NUMBER is the run number that has been assigned
      by the facility

    Note that the instrument run number is only used if it differs
    from the facility run number.

    If the platform isn't supplied then the instrument name is
    used instead, e.g.:

    'SN0123_140701/242#22'

    If the run name can't be split into components then the
    general form will be:

    [PLATFORM_]RUN_NAME[#FACILITY_RUN_NUMBER]

    depending on whether platform and/or facility run number have
    been supplied. For example for a run called 'rag_05_2017':

    'MISEQ_rag_05_2017#90'

    Arguments:
      run_name (str): the run name (can be a path)
      platform (str): the platform name (optional)
      facility_run_number (int): the run number assigned by the
        local facility (can be different from the instrument
        run number) (optional)

    """
    # NOTE(review): the body shown here stops after parsing the run
    # name; the code that builds and returns the reference id is not
    # present - confirm against the full definition
    # Extract information from run name
    run_name = os.path.basename(os.path.normpath(run_name))
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    except Exception as ex:
        # 'except ... as' is valid for Python 2.6+ and Python 3
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Reset the same three names that a successful split binds
        # (the datestamp was previously reset under the misspelt name
        # 'date_stamp', leaving 'datestamp' undefined on failure)
        instrument = None
        datestamp = None
        run_number = None
def get_sequencer_platform(dirn, instrument=None, settings=None):
    """
    Return the platform for the sequencing instrument

    Attempts to identify the platform (e.g. 'hiseq', 'miseq' etc)
    for a sequencing run.

    If 'settings' is supplied then the platform is looked up
    based on the instrument names and platforms listed in the
    'sequencers' section of the configuration. If 'instrument'
    is also supplied then this is used; otherwise the instrument
    name is extracted from the supplied directory name.

    If no match can be found then there is a final attempt to
    determine the platform from the hard-coded names in the
    'bcftbx.platforms' module.

    Arguments:
      dirn (str): path to the data or analysis directory
      instrument (str): (optional) the instrument name
      settings (Settings): (optional) a Settings instance
        with the configuration loaded

    Returns:
      String: either the platform or None, if the platform
        cannot be determined.

    """
    # NB print is always given one parenthesised argument so that the
    # output is identical under Python 2 and Python 3
    # Attempt to look up the instrument name
    platform = None
    if instrument is None:
        print("Extracting instrument name from directory name")
        try:
            datestamp, instrument, run_number, \
                flow_cell_prefix, flow_cell_id = \
                IlluminaData.split_run_name_full(dirn)
        except Exception as ex:
            # Best effort: 'instrument' stays unset and the lookup
            # falls through to the directory name below
            logging.warning("Unable to extract instrument name: "
                            "%s" % ex)
    if instrument and settings:
        print("Identifying platform from instrument name")
        try:
            return settings.sequencers[instrument]
        except KeyError:
            # Instrument not listed in the settings
            logging.warning("Instrument name '%s' not found in "
                            "configuration file" % instrument)
    # Fall back to old method
    print("Identifying platform from data directory name")
    platform = platforms.get_sequencer_platform(dirn)
    if platform is None:
        logging.warning("Unable to identify platform from "
                        "directory name")
    return platform
def setup(self):
    """
    Set up the 'analyse_barcodes.py' command for the task

    Removes any existing report files from the barcode analysis
    directory, builds the command line from the task arguments
    (sample sheet, lanes, cutoff, mismatches, title and counts
    files), adds the command to the task, and stores the report
    file names in the task outputs.
    """
    # Make output filenames
    out_dir = self.args.barcode_analysis_dir
    report_file = os.path.join(out_dir, 'barcodes.report')
    xls_file = os.path.join(out_dir, 'barcodes.xls')
    html_file = os.path.join(out_dir, 'barcodes.html')
    # Remove existing copies, if found
    for stale in (report_file, xls_file, html_file):
        if os.path.exists(stale):
            os.remove(stale)
    # Build command to run the barcode analysis
    cmd = PipelineCommandWrapper(
        "Run analyse_barcodes.py to report barcodes",
        'analyse_barcodes.py',
        '--report', report_file,
        '--xls', xls_file,
        '--html', html_file)
    if self.args.sample_sheet:
        cmd.add_args('--sample-sheet', self.args.sample_sheet)
    # Explicit lanes take precedence over those in the sample sheet
    lanes = None
    if self.args.lanes:
        lanes = self.args.lanes
    elif self.args.sample_sheet:
        # Implicitly get lanes from sample sheet
        try:
            sheet_data = IlluminaData.SampleSheet(
                self.args.sample_sheet).data
            lanes = sorted(set(line['Lane'] for line in sheet_data))
        except KeyError:
            # No lanes
            lanes = None
    if lanes:
        cmd.add_args('--lanes', ','.join(str(lane) for lane in lanes))
    # Optional settings are only passed on when they have been set
    if self.args.cutoff:
        cmd.add_args('--cutoff', self.args.cutoff)
    if self.args.mismatches:
        cmd.add_args('--mismatches', self.args.mismatches)
    if self.args.title:
        cmd.add_args('--title', self.args.title)
    cmd.add_args('-c')
    cmd.add_args(*self.args.counts_files)
    self.add_cmd(cmd)
    # Update the output parameters
    self.output.report_file.set(report_file)
    self.output.xls_file.set(xls_file)
    self.output.html_file.set(html_file)
def report_projects(ap):
    """Generate one line reports suitable for pasting into spreadsheet

    Generate one-line report for each project with tab-separated
    data items, suitable for injection into a spreadsheet.

    Each line has the following information:

    - Run id e.g. HISEQ_140328
    - Run number
    - Source
    - Date
    - User
    - PI
    - Application
    - Single Cell Platform
    - Organism
    - Platform
    - #Samples
    - #Cells
    - PE (yes/no)
    - Samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to be reported on

    Returns:
      String with the report text.

    """
    # NOTE(review): the body shown here stops after parsing the run
    # name; the code that builds and returns the report lines is not
    # present - confirm against the full definition
    # Acquire data
    analysis_dir = utils.AnalysisDir(ap.analysis_dir)
    # General information
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
        # Drop the zero padding from the instrument run number
        run_number = run_number.lstrip('0')
    except Exception as ex:
        # 'except ... as' is valid for Python 2.6+ and Python 3
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Blank out the values under the names a successful split
        # binds (the datestamp was previously blanked under the
        # misspelt name 'date_stamp', leaving 'datestamp' undefined)
        datestamp = ''
        run_number = ''
def detect_unaligned_dir(self):
    """
    Attempt to detect an existing bcl2fastq output directory

    Checks the 'bcl2fastq' and 'Unaligned' subdirectories of the
    analysis directory (in that order) and returns the name of the
    first one that exists and can be loaded as IlluminaData.

    Returns:
      String: name of the detected subdirectory, or None if neither
        candidate could be loaded.

    """
    for candidate in ('bcl2fastq', 'Unaligned'):
        if not os.path.isdir(os.path.join(self.analysis_dir, candidate)):
            continue
        logging.debug(
            "Testing subdirectory '%s' to see if it has sequence data" %
            candidate)
        try:
            IlluminaData.IlluminaData(self.analysis_dir,
                                      unaligned_dir=candidate)
        except IlluminaData.IlluminaDataError:
            logging.debug("Unable to load data from %s" % candidate)
            continue
        print("Setting 'unaligned_dir' parameter to %s" % candidate)
        return candidate
    # Unable to detect existing data directory
    return None
def get_analysis_projects_from_dirs(self, pattern=None, strict=False):
    """
    Return a list of AnalysisProjects in the analysis directory

    Each subdirectory in the top level of the analysis directory
    is tested in turn; those which load as CASAVA/bcl2fastq
    outputs are rejected, and the remainder are loaded as
    AnalysisProject instances and checked.

    Unlike the `get_analysis_projects` method, no checking
    against the project metadata (typically in 'projects.info')
    is performed.

    If the 'pattern' is not None then it should be a simple
    pattern used to match against available names to select
    a subset of projects (see bcf_utils.name_matches).

    Arguments:
      pattern (str): optional pattern to select a subset
        of projects (default: select all projects)
      strict (bool): if True then apply strict checks on
        each discovered project directory before adding it
        to the list (default: don't apply strict checks)

    Returns:
      List: list of AnalysisProject instances.

    """
    logging.debug("Testing subdirectories to determine analysis projects")
    selected = []
    if pattern is None:
        # Match everything by default
        pattern = '*'
    # Try loading each subdirectory as a project
    for subdir in bcf_utils.list_dirs(self.analysis_dir):
        # Test for bcl2fastq output: a successful load means that
        # this isn't a project directory
        try:
            IlluminaData.IlluminaData(self.analysis_dir,
                                      unaligned_dir=subdir)
        except IlluminaData.IlluminaDataError:
            # Not bcl2fastq output, carry on with the checks
            pass
        except Exception as ex:
            logging.debug("Exception when attempting to load "
                          "subdir '%s' as CASAVA/bcl2fastq output "
                          "(ignored): %s" % (subdir, ex))
        else:
            logging.debug("* %s: rejected" % subdir)
            continue
        # Try loading as a project
        candidate = AnalysisProject(
            subdir, os.path.join(self.analysis_dir, subdir))
        if strict:
            # Apply strict checks
            if not candidate.is_analysis_dir:
                logging.debug("* %s: rejected (failed strict checks)" %
                              subdir)
                continue
        elif len(candidate.samples) == 0:
            # Basic check: are there any samples?
            logging.debug("* %s: rejected (no samples)" % subdir)
            continue
        # Passed checks
        logging.debug("* %s: analysis directory" % subdir)
        if bcf_utils.name_matches(candidate.name, pattern):
            selected.append(candidate)
    return selected
def update_project_metadata_file(self, unaligned_dir=None,
                                 project_metadata_file='projects.info'):
    """
    Update project metadata file from bcl2fastq outputs

    Updates the contents of the project metadata file (default:
    "projects.info") from a bcl-to-fastq output directory, by
    adding new entries for projects in the bcl-to-fastq outputs
    which don't currently appear.

    Both arguments, when supplied, are also stored in the
    parameters ('unaligned_dir' and 'project_metadata').

    Arguments:
      unaligned_dir (str): path to the bcl-to-fastq output
        directory relative to the analysis dir. Defaults to the
        unaligned dir stored in the analysis directory parameter
        file.
      project_metadata_file (str): optional, path to the project
        metadata file to update

    """
    if project_metadata_file is not None:
        self.params['project_metadata'] = project_metadata_file
    logging.debug("Project metadata file: %s" %
                  self.params.project_metadata)
    metadata_path = os.path.join(self.analysis_dir,
                                 self.params.project_metadata)
    if unaligned_dir is not None:
        self.params['unaligned_dir'] = unaligned_dir
    logging.debug("Unaligned_dir: %s" % self.params.unaligned_dir)
    illumina_data = IlluminaData.IlluminaData(
        self.analysis_dir,
        unaligned_dir=self.params.unaligned_dir)
    if not os.path.exists(metadata_path):
        # New (empty) metadata file
        logging.debug("Creating new project metadata file: %s" %
                      metadata_path)
        project_metadata = ProjectMetadataFile()
    else:
        # Load data from existing file
        logging.debug("Loading project metadata from existing file: %s" %
                      metadata_path)
        project_metadata = ProjectMetadataFile(metadata_path)
    # Get projects and samples from the bcl2fastq outputs
    projects = {}
    for project in illumina_data.projects:
        projects[project.name] = sorted(s.name for s in project.samples)
    # Add data from metadata file
    for line in project_metadata:
        listed_name = line['Project']
        was_commented = listed_name.startswith('#')
        # Uncomment project line for now
        name = listed_name.lstrip('#')
        if name in projects:
            # Already have this project from the bcl2fastq outputs
            continue
        # Project not in latest list: keep it, but comment it out
        # if already commented or if project directory doesn't exist
        if was_commented or \
           not os.path.exists(os.path.join(self.analysis_dir, name)):
            name = "#%s" % name
        projects[name] = line['Samples'].split(',')
    # Populate/update
    for name, sample_names in projects.items():
        if name in project_metadata:
            project_metadata.update_project(name,
                                            sample_names=sample_names)
        else:
            project_metadata.add_project(name, sample_names)
    # Save
    project_metadata.save(metadata_path)
    print("Updated project metadata file '%s'" %
          self.params.project_metadata)
else: n_fastqs = len(sample.fastq) if n_fastqs == 1: print "\t%s" % sample.name else: print "\t%s (%d fastqs)" % (sample.name,n_fastqs) # Print fastq names fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary:
def update_metadata(self):
    """
    Migrates and updates metadata values

    Fills in metadata items which are currently unset (None):
    values are copied from the parameter file (if there is one),
    the run name is set, instrument details are extracted from
    the run name, and the platform and sequencer model are looked
    up from the instrument name.
    """
    # Migrate missing values from parameter file
    if self.has_parameter_file:
        # Migrate relevant values across
        print("Migrating metadata values from parameter file")
        for param in ('platform', 'run_number', 'source'):
            if param in self.params and self.metadata[param] is None:
                logging.debug("Importing metadata item '%s': set to "
                              "'%s'" % (param, self.params[param]))
                print("Importing metadata item '%s'" % param)
                self.metadata[param] = self.params[param]
    # Run name
    if self.metadata.run_name is None:
        print("Attempting to set missing 'run_name' metadata item")
        self.metadata['run_name'] = self.run_name
    # Instrument-related metadata
    instrument_items = ('instrument_name',
                        'instrument_datestamp',
                        'instrument_run_number')
    if any(getattr(self.metadata, item) is None
           for item in instrument_items):
        print("Attempting to set missing instrument metadata items")
        # Extract from run name
        try:
            datestamp, instrument, run_number, \
                flow_cell_prefix, flow_cell_id = \
                IlluminaData.split_run_name_full(self.run_name)
            extracted = (('instrument_name', instrument),
                         ('instrument_datestamp', datestamp),
                         ('instrument_run_number', run_number))
            # Only fill in the items which aren't already set
            for item, value in extracted:
                if getattr(self.metadata, item) is None:
                    self.metadata[item] = value
            if self.metadata.instrument_flow_cell_id is None:
                self.metadata['instrument_flow_cell_id'] = \
                    flow_cell_prefix + flow_cell_id
        except Exception as ex:
            logging.warning(
                "Unable to extract missing instrument metadata "
                "from run name")
    # Sequencing platform
    if self.metadata.platform is None:
        # Attempt to look up the instrument name
        platform = get_sequencer_platform(
            self.analysis_dir,
            instrument=self.metadata.instrument_name,
            settings=self.settings)
        if platform:
            print("Setting 'platform' metadata item to %s" % platform)
            self.metadata['platform'] = platform
    # Sequencer model
    if self.metadata.sequencer_model is None:
        instrument_name = self.metadata.instrument_name
        if instrument_name:
            try:
                model = self.settings.sequencers[instrument_name].model
                self.metadata['sequencer_model'] = model
                print("Setting 'sequencer_model' metadata item to "
                      "'%s'" % self.metadata.sequencer_model)
            except KeyError:
                print("Unable to get sequencer model for "
                      "instrument '%s'" % instrument_name)
def demultiplex_fastq(fastq_file,barcodes,nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode, plus another for 'unbinned'
    reads. Output files are written to the current directory; if
    any of them already exists then the process exits with status 1.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode sequences to use for demultiplexing;
        each item is indexed by 'lane', 'name', 'index' and
        'matcher'
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # NB print is always given one parenthesised argument so that the
    # output is identical under Python 2 and Python 3
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Set up output files
    output_files = {}
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'],output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name,'w')
        # Check if there's anything to do
        if len(local_barcodes) == 0:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name,'w')
        # Process reads
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            matched_read = False
            this_barcode = read.seqid.index_sequence
            # Read goes to the first barcode that matches
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode,nmismatches):
                    output_files[barcode['index']].write(str(read)+'\n')
                    matched_read = True
                    break
            # Put in unbinned if no match
            if not matched_read:
                output_files['unbinned'].write(str(read)+'\n')
    finally:
        # Close every file that was opened (including 'unbinned',
        # which previously was never closed), even on error or exit
        for fp in output_files.values():
            fp.close()
    # NB the count reported is the total number of reads processed
    print("\tMatched %d reads for %s" % (nreads,
                                         os.path.basename(fastq_file)))
def make_custom_sample_sheet(input_sample_sheet, output_sample_sheet=None,
                             lanes=None, fmt=None):
    """
    Creates a corrected copy of a sample sheet file

    Loads the input sample sheet, fills in missing project names
    with the sample name, and fixes any illegal or duplicated
    names. The result is returned as a SampleSheet object.

    Optionally it can also: write the updated sample sheet data
    to a new file, switch the format, and include only a subset
    of lanes from the original file

    Arguments:
      input_sample_sheet (str): name and path of the original
        sample sheet file
      output_sample_sheet (str): (optional) name and path to
        write updated sample sheet to, or `None`
      lanes (list): (optional) list of lane numbers to keep in
        the output sample sheet; if `None` then all lanes will
        be kept (the default), otherwise lanes will be dropped
        if they don't appear in the supplied list
      fmt (str): (optional) format for the output sample sheet,
        either 'CASAVA' or 'IEM'; if this is `None` then the
        format of the original file will be used

    Returns:
      SampleSheet object with the data for the corrected
      sample sheet.

    Raises:
      Exception: if the sample sheet format is neither 'CASAVA'
        nor 'IEM'.

    """
    # Load the sample sheet data
    sheet = IlluminaData.SampleSheet(input_sample_sheet)
    # Determine the (sample, project) column names for this format
    columns_for_format = {
        'CASAVA': ('SampleID', 'SampleProject'),
        'IEM': ('Sample_ID', 'Sample_Project'),
    }
    if sheet.format not in columns_for_format:
        raise Exception("Unknown sample sheet format: %s" % sheet.format)
    sample_col, project_col = columns_for_format[sheet.format]
    # Add project names if not supplied
    for entry in sheet:
        if not entry[project_col]:
            entry[project_col] = entry[sample_col]
    # Fix other problems
    sheet.fix_illegal_names()
    sheet.fix_duplicated_names()
    # Select subset of lanes if requested
    if lanes is not None:
        logging.debug("Updating to include only specified lanes: %s" %
                      ','.join(str(lane) for lane in lanes))
        index = 0
        while index < len(sheet):
            entry = sheet[index]
            if entry['Lane'] not in lanes:
                # Deleting shifts the next line into this position,
                # so don't advance the index
                del sheet[index]
                continue
            logging.debug("Keeping %s" % entry)
            index += 1
    # Write out new sample sheet
    if output_sample_sheet is not None:
        sheet.write(output_sample_sheet, fmt=fmt)
    return sheet
def fetch_value(ap, project, field):
    """
    Return the value of the supplied field

    Looks up the value for the named field using the data in the
    supplied AutoProcessor and AnalysisProject instances. Values
    which are unset (or otherwise false) are returned as empty
    strings.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to be reported on
      project (AnalysisProject): project to report on
      field (str): field name to return value of

    Returns:
      String: value of supplied field.

    Raises:
      KeyError: if the field name isn't recognised.

    """
    def _or_blank(value):
        # Falsy values are reported as an empty string
        return value if value else ''

    def _str_or_blank(value):
        # As above, but with the value converted to a string
        return str(value) if value else ''

    # Convenience variable for project info (None if not available)
    info = getattr(project, 'info', None)
    # Generate value for supplied field name
    if field == 'datestamp':
        return IlluminaData.split_run_name(ap.run_name)[0]
    if field == 'run_id':
        return ap.run_reference_id
    if field == 'run_number':
        return _str_or_blank(ap.metadata.run_number)
    if field in ('source', 'data_source'):
        return _or_blank(ap.metadata.source)
    if field in ('analysis_dir', 'path'):
        return ap.params.analysis_dir
    if field in ('project', 'project_name'):
        return project.name
    if field == 'user':
        return _or_blank(info.user)
    if field in ('PI', 'pi'):
        return _or_blank(info.PI)
    if field in ('application', 'library_type'):
        return _or_blank(info.library_type)
    if field == 'single_cell_platform':
        return _or_blank(info.single_cell_platform)
    if field == 'organism':
        return _or_blank(info.organism)
    if field in ('sequencer_platform', 'platform'):
        # Platform is always reported in upper case
        return _str_or_blank(ap.metadata.platform).upper()
    if field == 'sequencer_model':
        return _or_blank(ap.metadata.sequencer_model)
    if field in ('no_of_samples', '#samples'):
        return str(len(project.samples))
    if field in ('no_of_cells', '#cells'):
        return _str_or_blank(info.number_of_cells)
    if field == 'paired_end':
        return 'yes' if ap.paired_end else 'no'
    if field in ('sample_names', 'samples'):
        return project.prettyPrintSamples()
    if field in ('null', ''):
        return ''
    raise KeyError("'%s': unrecognised field for reporting" % field)
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Builds a multi-line plain text report made up of a title, a
    block of general information about the run, a one-line summary
    for each project, and any comments recorded against the
    projects.

    The general information block reports:

    - Run name
    - Run reference id
    - Platform
    - Sequencer model
    - Analysis directory
    - Endedness (paired or single end)
    - Processing software (bcl2fastq, plus cellranger and
      cellranger-atac when these appear in the metadata)

    Each project line reports:

    - Project name
    - Researcher (aka user)
    - Organism
    - Application (aka library type), with the single cell
      platform appended when one is set
    - Number of samples (and number of cells, when set)
    - PI

    If the analysis directory has no projects then the projects
    and sample counts from the unaligned directory are listed
    instead (when that data can be loaded).

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # Processing software items: name in the report -> metadata key
    software_items = {
        'Bcl2fastq': 'bcl2fastq',
        'Cellranger': 'cellranger',
        'Cellranger-Atac': 'cellranger-atac',
    }
    # Items which are always reported
    report_items = ['Run name',
                    'Reference',
                    'Platform',
                    'Sequencer',
                    'Directory',
                    'Endedness',
                    'Bcl2fastq']
    # Gather the run information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    run_name = ap.run_name
    datestamp, instrument, run_number = None, None, None
    try:
        datestamp, instrument, run_number = \
            IlluminaData.split_run_name(run_name)
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
    if ap.metadata.platform is None:
        platform = 'unknown'
    else:
        platform = ap.metadata.platform.upper()
    # Run number from the metadata takes precedence over the
    # one extracted from the run name
    if ap.metadata.run_number is not None:
        run_number = ap.metadata.run_number
    # Processing software information
    try:
        processing_software = ast.literal_eval(
            ap.metadata.processing_software)
    except ValueError:
        processing_software = dict()
    if not processing_software:
        # Nothing recorded: fall back to the legacy metadata items
        legacy_items = (('bcl2fastq', 'bcl2fastq_software'),
                        ('cellranger', 'cellranger_software'))
        for pkg, legacy_item in legacy_items:
            try:
                processing_software[pkg] = ast.literal_eval(
                    getattr(ap.metadata, legacy_item))
            except ValueError:
                pass
    # Optional packages are only reported if they were recorded
    for pkg in ('cellranger', 'cellranger-atac'):
        if pkg in processing_software:
            report_items.append(pkg.title())
    # Assemble the report line-by-line
    report = []
    # Title, underlined with '=' characters
    if datestamp and instrument and run_number:
        title = "%s run #%s datestamped %s" % (platform,
                                               run_number,
                                               datestamp)
    else:
        title = "%s" % os.path.basename(ap.analysis_dir)
    report.append("%s\n%s" % (title, '=' * len(title)))
    # General information, with the item names padded to line
    # up the values
    field_width = max(len(name) for name in report_items)
    for item in report_items:
        if item in software_items:
            pkg = software_items[item]
            if pkg in processing_software:
                pkg_info = processing_software[pkg]
                value = "%s %s" % (pkg_info[1], pkg_info[2])
            else:
                value = 'Unknown'
        elif item == 'Run name':
            value = run_name
        elif item == 'Reference':
            value = ap.run_reference_id
        elif item == 'Platform':
            value = platform
        elif item == 'Sequencer':
            value = ap.metadata.sequencer_model
        elif item == 'Directory':
            value = ap.params.analysis_dir
        elif item == 'Endedness':
            if analysis_dir.paired_end:
                value = 'Paired end'
            else:
                value = 'Single end'
        else:
            raise Exception("Unknown reporting item '%s'" % item)
        report.append("%s: %s" % (item.ljust(field_width), value))
    report.append("")
    # Projects
    rows = []
    comments = bcf_utils.OrderedDictionary()
    n_projects = analysis_dir.n_projects
    if n_projects == 0:
        # No projects - try loading data from unaligned dir
        try:
            illumina_data = ap.load_illumina_data()
            report.append("No projects found; '%s' directory contains "
                          "the following data:\n" %
                          ap.params.unaligned_dir)
            for project in illumina_data.projects:
                n_samples = len(project.samples)
                rows.append(("- '%s':" % project.name,
                             "%s sample%s" % (n_samples,
                                              '' if n_samples == 1
                                              else 's')))
            report.append(utils.pretty_print_rows(rows))
        except IlluminaData.IlluminaDataError as ex:
            report.append("No projects found")
    else:
        report.append("%d project%s:" % (n_projects,
                                         '' if n_projects == 1 else 's'))
        data_items = ('user',
                      'PI',
                      'library_type',
                      'single_cell_platform',
                      'number_of_cells',
                      'organism')
        for project in analysis_dir.projects:
            project_data = {'project': project.name}
            for item in data_items:
                value = project.info[item]
                # Placeholder values are flagged as unspecified
                if value in ('.', '?'):
                    value = '<unspecified %s>' % item.lower()
                project_data[item] = value
            library = project_data['library_type']
            platform_name = project_data['single_cell_platform']
            if platform_name is not None:
                library += " (%s)" % platform_name
            n_samples = len(project.samples)
            samples = "%d sample%s" % (n_samples,
                                       '' if n_samples == 1 else 's')
            n_cells = project_data['number_of_cells']
            if n_cells is not None:
                n_cells = int(n_cells)
                samples += "/%d cell%s" % (n_cells,
                                           '' if n_cells == 1 else 's')
            rows.append(("- '%s':" % project_data['project'],
                         project_data['user'],
                         project_data['organism'],
                         library,
                         samples,
                         "(PI %s)" % project_data['PI']))
            if project.info.comments:
                comments[project.name] = project.info.comments
        report.append(utils.pretty_print_rows(rows))
    # Additional comments/notes
    if comments:
        width = max(len(name) for name in comments)
        report.append("")
        report.append("Additional notes/comments:")
        for name in comments:
            lines = bcf_utils.split_into_lines(comments[name], 70 - width)
            for i, line in enumerate(lines):
                if i == 0:
                    # First line carries the project name
                    report.append("- %s: %s" % (name.ljust(width), line))
                else:
                    # Continuation lines are indented
                    report.append(" %s %s" % (' ' * width, line))
    return '\n'.join(report)
p.add_option_group(deprecated_options) # Process command line options, args = p.parse_args() if len(args) != 1: p.error("input is a single SampleSheet.csv file") if options.miseq: logging.warning( "--miseq option no longer necessary; MiSEQ-style sample sheets " "are now converted automatically") # Get input sample sheet file samplesheet = args[0] if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the data as CSV data = IlluminaData.get_casava_sample_sheet(samplesheet) # Remove lanes if options.lanes is not None: lanes = parse_lane_expression(options.lanes) print "Keeping lanes %s, removing the rest" % ','.join( [str(x) for x in lanes]) new_data = IlluminaData.CasavaSampleSheet() for line in data: if line['Lane'] in lanes: print "Keeping %s" % line new_data.append(tabdata="%s" % line) data = new_data # Update the SampleID and SampleProject fields for sample_id in options.sample_id: lanes, name = parse_name_expression(sample_id) for line in data:
def check_barcode_collisions(sample_sheet_file, nmismatches):
    """
    Check sample sheet for barcode collisions

    Check barcode index sequences within each lane (or across all
    samples, if no lane information is present) and find any which
    differ in fewer bases than a threshold number which is
    calculated as:

    less than 2 times the number of mismatches plus 1

    (as is stated in the output from bcl2fastq v2.)

    Barcodes are compared position-by-position; if two barcodes
    have different lengths then only the leading bases which they
    have in common are compared.

    Samples which have no index sequence can't be compared in
    this way and so are left out of the collision check.

    Pairs of barcodes which are too similar (i.e. which collide)
    are reported as a list of tuples, e.g.

    [('ATTCCT','ATTCCG'),...]

    Arguments:
      sample_sheet_file (str): path to a SampleSheet.csv file
        to analyse for barcode collisions
      nmismatches (int): maximum number of mismatches to allow

    Returns:
      List: list of pairs of colliding barcodes (with each
        pair wrapped in a tuple), or an empty list if no
        collisions were detected.
    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(sample_sheet_file)
    has_lanes = sample_sheet.has_lanes
    # Collect the index sequences (barcodes) for each lane
    barcodes = {}
    for line in sample_sheet:
        # Lane (all samples are put into a single "lane" if the
        # sample sheet doesn't define any)
        lane = line['Lane'] if has_lanes else 1
        # Index sequence
        try:
            # Try dual-indexed IEM4 format
            indx = "%s%s" % (line['index'].strip(),
                             line['index2'].strip())
        except KeyError:
            # Try single indexed IEM4 (no index2)
            try:
                indx = line['index'].strip()
            except KeyError:
                # Try CASAVA format
                try:
                    indx = line['Index'].strip()
                except KeyError:
                    # No index columns
                    indx = ""
        if not indx:
            # No sequence to compare: previously these were stored
            # as None, which made the comparison below fail with a
            # TypeError
            continue
        barcodes.setdefault(lane, []).append(indx)
    # Mismatch threshold
    mismatch_threshold = 2 * nmismatches + 1
    # Check each pair of barcodes within each lane for collisions
    collisions = []
    for lane in barcodes:
        seqs = barcodes[lane]
        for i, seq1 in enumerate(seqs[:-1]):
            for seq2 in seqs[i + 1:]:
                ndiff = sum(1 for c1, c2 in zip(seq1, seq2) if c1 != c2)
                if ndiff < mismatch_threshold:
                    collisions.append((seq1, seq2))
    return collisions
"IEM sample sheet to older format)") p.add_argument('sample_sheet',metavar="SAMPLE_SHEET", help="input sample sheet file") # Process command line args = p.parse_args() if args.miseq: logging.warning("--miseq option no longer necessary; " "MiSEQ-style sample sheets are now converted " "automatically") # Get input sample sheet file samplesheet = args.sample_sheet if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the sample sheet data = IlluminaData.SampleSheet(samplesheet) if data.format is None: logging.error("Unable to determine samplesheet format") sys.exit(1) print "Sample sheet format: %s" % data.format # Remove lanes if args.lanes is not None: if not data.has_lanes: logging.error("sample sheet doesn't define any lanes") sys.exit(1) lanes = parse_lanes(args.lanes) print "Keeping lanes %s, removing the rest" % \ ','.join([str(x) for x in lanes]) i = 0 while i < len(data): line = data[i]
help="check CASAVA outputs against those expected for SAMPLE_SHEET") p.add_option("--stats", action="store_true", dest="stats", help="Report statistics (read counts etc) for fastq files") # Parse command line options, args = p.parse_args() # Get data directory name if len(args) != 1: p.error("expected one argument (location of Illumina analysis dir)") illumina_analysis_dir = os.path.abspath(args[0]) # Populate Illumina data object try: illumina_data = IlluminaData.IlluminaData( illumina_analysis_dir, unaligned_dir=options.unaligned_dir) except IlluminaData.IlluminaDataError, ex: logging.error("Failed to collect data: %s", ex) sys.exit(1) # Check there's at least one thing to do if not (options.report or options.summary or options.list or options.sample_sheet or options.merge_fastqs): options.report = True # List option if options.list: for project in illumina_data.projects: n_samples = len(project.samples) print "Project: %s (%d sample%s)" % (project.name, n_samples, 's' if n_samples != 1 else '')
"'NY_ChIP-seq'. Use multiple --expt=... to set the types for different " "projects") p.add_option("--keep-names",action="store_true",dest="keep_names",default=False, help="preserve the full names of the source fastq files when creating links") p.add_option("--merge-replicates",action="store_true",dest="merge_replicates",default=False, help="create merged fastq files for each set of replicates detected") # Parse command line options,args = p.parse_args() # Get data directory name if len(args) != 1: p.error("expected one argument (location of Illumina analysis dir)") illumina_analysis_dir = os.path.abspath(args[0]) # Populate Illumina data object illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir, unaligned_dir=options.unaligned_dir) # Assign experiment types for expt in options.expt_type: name,type_ = expt.split(':') illumina_data.get_project(name).expt_type = type_ # Create and populate per-project directory structure for project in illumina_data.projects: create_analysis_dir(project, top_dir=illumina_analysis_dir, merge_replicates=options.merge_replicates, keep_names=options.keep_names, dry_run=options.dry_run)
def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    An empty 'ScriptCode' subdirectory is also created.

    Project directory names are made up of the project name and then
    the experiment type, or just the project name if experiment type
    is not set.

    Arguments:
      project         : populated IlluminaProject object
      top_dir         : parent directory to create analysis subdirectory
                        under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                        each sample by merging multiple FASTQs together
      keep_names      : if True then links to FASTQ files will have the
                        same names as the original files; by default links
                        use the shortest unique name
      dry_run         : if True then report what would be done but don't
                        actually perform any action

    Returns:
      Name of the project directory.

    """
    # Default to the current directory (os.path.join can't handle None)
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir, mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                if keep_names:
                    fastq_ln = os.path.join(project_dir, fastq)
                else:
                    fastq_ln = os.path.join(project_dir, fastq_names[fastq])
                if os.path.exists(fastq_ln):
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file, os.path.basename(fastq_ln)))
                else:
                    print("Linking to %s" % fastq)
                    if not dry_run:
                        bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            replicates = {}
            # Gather replicates to be merged: FASTQs with the same
            # sample name, barcode and read number are replicates
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name, []).append(
                    os.path.join(sample.dirn, fastq))
            # Sort into order
            for name in replicates:
                replicates[name].sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in replicates:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped for a dry run, like the other
            # actions)
            if not dry_run:
                for name in replicates:
                    merged_fastq = os.path.join(project_dir, name + '.fastq')
                    bcf_utils.concatenate_fastq_files(merged_fastq,
                                                      replicates[name])
    # Return directory name
    return project_dir
def main():
    """
    Run the bcl to fastq conversion from the command line

    Parses the command line (ILLUMINA_RUN_DIR, OUTPUT_DIR and an
    optional SAMPLE_SHEET, plus options), locates the bcl2fastq
    software and determines its version, reports the settings and
    then runs the conversion function which matches the version.

    Returns:
      Integer: 1 if the bcl2fastq software couldn't be found or
        its version isn't supported, otherwise the status returned
        by the conversion function (non-zero is logged as a
        failure).
    """
    p = optparse.OptionParser(
        usage="%prog [OPTIONS] ILLUMINA_RUN_DIR OUTPUT_DIR [ SAMPLE_SHEET ]",
        version="%prog "+__version__,
        description="Wrapper to automate the Illumina bcl to fastq "
        "conversion process. It will either run the CASAVA/bcl2fastq v1.8 "
        "configureBclToFastq.pl/make pipeline or bcl2fastq v2 directly, "
        "depending on which software package is detected. ILLUMINA_RUN_DIR "
        "is the top-level directory of the Illumina run to be processed; "
        "output will be written to OUTPUT_DIR. Optionally a SAMPLE_SHEET "
        "file can also be specified, otherwise the SampleSheet.csv file in "
        "the BaseCalls directory will be used (if present).")
    # Options common to both bcl2fastq/bcl2fastq v2
    p.add_option('--nmismatches',action="store",dest="nmismatches",
                 default=None,
                 help="set number of mismatches to allow; recommended "
                 "values are 0 for samples without multiplexing, 1 for "
                 "multiplexed samples with tags of length 6 or longer "
                 "(CASAVA/bcl2fastq v1.8 --mismatches option, bcl2fastq "
                 "v2 --barcode-mismatches option)")
    p.add_option('--use-bases-mask',action="store",dest="bases_mask",
                 default=None,
                 help="specify a bases-mask string to tell CASAVA how "
                 "to use each cycle (the supplied value is passed "
                 "to the --use-bases-mask option)")
    p.add_option('--nprocessors',action="store",dest="nprocessors",
                 default=None,
                 help="set the number of processors to use (defaults to "
                 "1; for CASAVA/bcl2fastq v1.8 this is passed to the "
                 "-j option of the 'make' step after running "
                 "configureBcltoFastq.pl, for bcl2fastq v2 this is "
                 "the maximum number of CPUs that should be used by "
                 "the -r, -d, -p and -w options)")
    p.add_option('--ignore-missing-bcl',action="store_true",
                 dest="ignore_missing_bcl",default=False,
                 help="interpret missing bcl files as no call "
                 "(CASAVA/bcl2fastq v1.8 --ignore-missing-bcl option, "
                 "bcl2fastq v2 --ignore-missing-bcls option)")
    p.add_option('--bcl2fastq_path',action="store",
                 dest="bcl2fastq_path",default=None,
                 help="explicitly specify the path to the CASAVA or "
                 "bcl2fastq software to use.")
    # CASAVA/bcl2fastq 1.8.* only
    casava = optparse.OptionGroup(p,'CASAVA/bcl2fastq v1.8 only')
    casava.add_option('--ignore-missing-stats',action="store_true",
                      dest="ignore_missing_stats",default=False,
                      help="fill in with zeroes when *.stats files are missing "
                      "(see the CASAVA user guide for details of how "
                      "--ignore-missing-stats works)")
    casava.add_option('--ignore-missing-control',action="store_true",
                      dest="ignore_missing_control",default=False,
                      help="interpret missing control files as not-set control "
                      "bits (see the CASAVA user guide for details of how "
                      "--ignore-missing-control works)")
    p.add_option_group(casava)
    # bcl2fastq 2 only
    bcl2fastq2 = optparse.OptionGroup(p,'bcl2fastq v2 only')
    bcl2fastq2.add_option('--no-lane-splitting',action="store_true",
                          dest="no_lane_splitting",default=False,
                          help="Don't split output FASTQ files by lane")
    # The group must be registered with the parser, otherwise its
    # options are accepted but never appear in the --help output
    p.add_option_group(bcl2fastq2)
    # Adapter trimming (bcl2fastq 2 only)
    adapter_trimming = optparse.OptionGroup(
        p,'Adapter trimming (bcl2fastq v2 only)')
    adapter_trimming.add_option('--minimum-trimmed-read-length',action="store",
                                dest="minimum_trimmed_read_length",default=35,
                                help="Minimum read length after adapter "
                                "trimming. bcl2fastq trims the adapter from "
                                "the read down to this value; if there is more "
                                "adapter match below this length then those "
                                "bases are masked not trimmed (i.e. replaced "
                                "by N rather than removed) (default: 35)")
    adapter_trimming.add_option('--mask-short-adapter-reads',action="store",
                                dest="mask_short_adapter_reads",default=22,
                                help="minimum length of unmasked bases that "
                                "a read can be after adapter trimming; reads "
                                "with fewer ACGT bases will be completely "
                                "masked with Ns (default: 22)")
    p.add_option_group(adapter_trimming)
    # Advanced options
    advanced = optparse.OptionGroup(p,'Advanced options')
    advanced.add_option('--platform',action="store",
                        dest="platform",default=None,
                        help="Explicitly specify platform; only use this if "
                        "the platform can't be read from the instrument name")
    p.add_option_group(advanced)
    options,args = p.parse_args()
    if not (2 <= len(args) <= 3):
        p.error("input is an input directory, output directory and an "
                "optional sample sheet")
    # Acquire bcl2fastq software
    bcl2fastq = available_bcl2fastq_versions(paths=(options.bcl2fastq_path,))
    if not bcl2fastq:
        logging.error("No bcl2fastq software found")
        return 1
    bcl2fastq_exe = bcl2fastq[0]
    # Determine bcl2fastq version
    bcl2fastq_info = bcl_to_fastq_info(bcl2fastq_exe)
    if bcl2fastq_info[0] is None:
        logging.error("No bcl2fastq software found")
        return 1
    print("Using conversion software from %s" %
          os.path.dirname(bcl2fastq_info[0]))
    # Return with error code if no version detected
    bcl2fastq_package = bcl2fastq_info[1]
    bcl2fastq_version = bcl2fastq_info[2]
    if bcl2fastq_version is None:
        logging.error("Cannot determine bcl2fastq software version")
        return 1
    print("Package: %s" % bcl2fastq_package)
    print("Version: %s" % bcl2fastq_version)
    # Match the detected version against the supported versions
    known_version = None
    for version in BCL2FASTQ_VERSIONS:
        if bcl2fastq_version.startswith("%s." % version):
            known_version = version
            break
    if known_version is None:
        # Unimplemented version
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    # Locate run directory (and strip any trailing slash)
    illumina_run_dir = os.path.abspath(args[0].rstrip(os.sep))
    if not os.path.isdir(illumina_run_dir):
        logging.error("%s: doesn't exist or is not a directory" %
                      illumina_run_dir)
        sys.exit(1)
    illumina_run = IlluminaData.IlluminaRun(illumina_run_dir,
                                            options.platform)
    # Output directory
    output_dir = os.path.abspath(args[1].rstrip(os.sep))
    # Sample sheet
    if len(args) == 3:
        sample_sheet = os.path.abspath(args[2])
    else:
        sample_sheet = illumina_run.sample_sheet_csv
    # Bases mask
    # NOTE(review): the value derived from RunInfo.xml is only used
    # for reporting; the conversion functions below are passed
    # 'options.bases_mask' - confirm this is intended
    if options.bases_mask is not None:
        bases_mask = options.bases_mask
    else:
        bases_mask = IlluminaData.IlluminaRunInfo(
            illumina_run.runinfo_xml).bases_mask
    # Report settings
    print("Illumina run directory : %s" % illumina_run.run_dir)
    print("Basecalls directory : %s" % illumina_run.basecalls_dir)
    print("Platform : %s" % illumina_run.platform)
    print("Bcl file extension : %s" % illumina_run.bcl_extension)
    print("SampleSheet.csv file : %s" % sample_sheet)
    print("Output dir : %s" % output_dir)
    print("Nmismatches : %s" % options.nmismatches)
    print("Bases mask : %s" % bases_mask)
    print("Nprocessors : %s" % options.nprocessors)
    print("Ignore missing bcl : %s" % options.ignore_missing_bcl)
    if known_version == '1.8':
        print("Ignore missing stats : %s" % options.ignore_missing_stats)
        print("Ignore missing control : %s" % options.ignore_missing_control)
    elif known_version in ('2.17','2.20',):
        print("No lane splitting : %s" % options.no_lane_splitting)
        print("Min trimmed read length : %s" %
              options.minimum_trimmed_read_length)
        print("Mask short adapter reads: %s" %
              options.mask_short_adapter_reads)
    # Run bclToFastq conversion based on the version
    if known_version in ('1.8',):
        # 1.8.* pipeline
        status = run_bcl2fastq_1_8(
            illumina_run.basecalls_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            nprocessors=options.nprocessors,
            force=True,
            ignore_missing_bcl=options.ignore_missing_bcl,
            ignore_missing_stats=options.ignore_missing_stats,
            ignore_missing_control=options.ignore_missing_control
        )
    elif known_version in ('2.17',):
        # bcl2fastq 2.17.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors = int(options.nprocessors)
            loading_threads = min(4,nprocessors)
            writing_threads = min(4,nprocessors)
            # NOTE(review): this always evaluates to 'nprocessors'
            # (20% of N can't exceed N) - confirm whether a lower
            # bound such as 1 was intended
            demultiplexing_threads = max(int(float(nprocessors)*0.2),
                                         nprocessors)
            processing_threads = nprocessors
            print("Explicitly setting number of threads for each stage:")
            print("Loading (-r) : %d" % loading_threads)
            print("Demultiplexing (-d): %d" % demultiplexing_threads)
            print("Processing (-p) : %d" % processing_threads)
            print("Writing (-w) : %d" % writing_threads)
        else:
            # Use the defaults
            loading_threads = None
            demultiplexing_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_17(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            demultiplexing_threads=demultiplexing_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    elif known_version in ('2.20',):
        # bcl2fastq 2.20.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors = int(options.nprocessors)
            loading_threads = min(4,nprocessors)
            writing_threads = min(4,nprocessors)
            processing_threads = nprocessors
            print("Explicitly setting number of threads for each stage:")
            print("Loading (-r) : %d" % loading_threads)
            print("Processing (-p) : %d" % processing_threads)
            print("Writing (-w) : %d" % writing_threads)
        else:
            # Use the defaults
            loading_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_20(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    else:
        # Version is listed as supported but has no conversion
        # function here; without this 'status' would be undefined
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    print("bclToFastq returncode: %s" % status)
    if status != 0:
        logging.error("bclToFastq failure")
    return status
p.add_option('-N','--nprocessors',action="store",dest="cores",default=1,type='int', help="spread work across multiple processors/cores (default is 1)") options,args = p.parse_args() # Check arguments if not args and options.counts_file_in is None: p.error("Need to supply at least one input Fastq file, a bclToFastq output " "directory, or a counts file from a previous run (if using -c)") if options.report_file is not None: print "Writing report to %s" % options.report_file fp = open(options.report_file,'w') else: fp = sys.stdout # Handle input sample sheet if options.sample_sheet is not None: print "Loading sample sheet data from %s" % options.sample_sheet sample_sheet = IlluminaData.get_casava_sample_sheet(options.sample_sheet) # Process according to inputs if options.counts_file_in: # Use counts from a previously generated file counts_file = options.counts_file_in print "Loading counts from %s" % counts_file counts = dict() for line in open(counts_file,'r'): seq = line.split('\t')[1] count = int(line.split('\t')[2]) counts[seq] = count report(counts,nseqs=options.n,cutoff=options.cutoff,fp=fp) # Match barcodes to index sequences in sample sheet if options.sample_sheet: if options.lanes is not None: lanes = [int(lane) for lane in options.lanes.split(',')]
"when creating links") p.add_argument("--merge-replicates",action="store_true", dest="merge_replicates",default=False, help="create merged fastq files for each set of " "replicates detected") p.add_argument('illumina_data_dir', help="top-level directory containing the 'Unaligned' " "directory with the fastq.gz files") # Parse command line args = p.parse_args() # Get data directory name illumina_analysis_dir = os.path.abspath(args.illumina_data_dir) # Populate Illumina data object illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir, unaligned_dir=args.unaligned_dir) # Assign experiment types for expt in args.expt_type: name,type_ = expt.split(':') illumina_data.get_project(name).expt_type = type_ # Create and populate per-project directory structure for project in illumina_data.projects: create_analysis_dir(project, top_dir=illumina_analysis_dir, merge_replicates=args.merge_replicates, keep_names=args.keep_names, dry_run=args.dry_run)
def __init__(self, unaligned_dir=None):
    """
    Create a new AnalyseBarcodes pipeline instance

    Defines the pipeline parameters, loads the bcl2fastq
    outputs from 'unaligned_dir', and builds the pipeline
    tasks: set up the barcode analysis and counts directories,
    count the barcodes for each project (and for the
    undetermined reads, if present), list the resulting counts
    files, and then analyse the counts and report the results.

    The pipeline outputs are 'report_file', 'xls_file' and
    'html_file'.

    Arguments:
      unaligned_dir (str): path to the directory with
        outputs from bcl2fastq

    Raises:
      OSError: if 'unaligned_dir' doesn't exist.
    """
    # Initialise the pipeline superclass
    Pipeline.__init__(self, name="Analyse Barcodes")

    # Define parameters
    self.add_param('barcode_analysis_dir', type=str)
    self.add_param('counts_dir', type=str)
    self.add_param('title', type=str)
    self.add_param('lanes', type=list)
    self.add_param('sample_sheet', type=str)
    self.add_param('bases_mask', type=str)
    self.add_param('mismatches', type=int)
    self.add_param('cutoff', type=float)
    self.add_param('force', type=bool, value=False)

    # Load data from bcl2fastq output
    if not os.path.exists(unaligned_dir):
        raise OSError("'%s': not found" % unaligned_dir)
    # Normalise the path before splitting it: with a trailing
    # slash the basename would otherwise be an empty string (and
    # the "parent" would be the unaligned dir itself)
    unaligned_dir = os.path.abspath(unaligned_dir)
    analysis_dir = os.path.dirname(unaligned_dir)
    unaligned_dir = os.path.basename(unaligned_dir)
    illumina_data = IlluminaData.IlluminaData(analysis_dir,
                                              unaligned_dir=unaligned_dir)

    ####################
    # Build the pipeline
    ####################

    # Setup barcode analysis and counts directories
    setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
        "Setup barcode analysis directory",
        self.params.barcode_analysis_dir,
        self.params.counts_dir,
        force=self.params.force)
    self.add_task(setup_barcode_analysis_dir)

    # Generate counts for Fastqs in each project
    count_tasks = []
    for project in illumina_data.projects:
        count_barcodes = CountBarcodes("Count barcodes in '%s'" %
                                       project.name,
                                       project,
                                       self.params.counts_dir,
                                       lanes=self.params.lanes)
        self.add_task(count_barcodes,
                      requires=(setup_barcode_analysis_dir, ))
        count_tasks.append(count_barcodes)

    # Generate counts for undetermined Fastqs
    if illumina_data.undetermined is not None:
        count_barcodes = CountBarcodes("Count barcodes in 'undetermined'",
                                       illumina_data.undetermined,
                                       self.params.counts_dir,
                                       lanes=self.params.lanes,
                                       use_project_name="undetermined")
        self.add_task(count_barcodes,
                      requires=(setup_barcode_analysis_dir, ))
        count_tasks.append(count_barcodes)

    # List the counts files (once all the counting has finished)
    list_counts_files = ListBarcodeCountFiles(
        "Fetch the barcode counts files",
        self.params.counts_dir)
    self.add_task(list_counts_files, requires=count_tasks)

    # Analyse counts and report the results
    report_barcodes = ReportBarcodeAnalysis(
        "Report barcode analysis",
        list_counts_files.output.counts_files,
        self.params.barcode_analysis_dir,
        sample_sheet=self.params.sample_sheet,
        lanes=self.params.lanes,
        mismatches=self.params.mismatches,
        cutoff=self.params.cutoff,
        title=self.params.title)
    self.add_task(report_barcodes, requires=(list_counts_files, ))

    # Add final outputs to the pipeline
    self.add_output('report_file', report_barcodes.output.report_file)
    self.add_output('xls_file', report_barcodes.output.xls_file)
    self.add_output('html_file', report_barcodes.output.html_file)
"if required)") p.add_option_group(deprecated_options) # Process command line options,args = p.parse_args() if len(args) != 1: p.error("input is a single SampleSheet.csv file") if options.miseq: logging.warning("--miseq option no longer necessary; MiSEQ-style sample sheets " "are now converted automatically") # Get input sample sheet file samplesheet = args[0] if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the data as CSV data = IlluminaData.get_casava_sample_sheet(samplesheet) # Remove lanes if options.lanes is not None: lanes = parse_lane_expression(options.lanes) print "Keeping lanes %s, removing the rest" % ','.join([str(x) for x in lanes]) new_data = IlluminaData.CasavaSampleSheet() for line in data: if line['Lane'] in lanes: print "Keeping %s" % line new_data.append(tabdata="%s" % line) data = new_data # Update the SampleID and SampleProject fields for sample_id in options.sample_id: lanes,name = parse_name_expression(sample_id) for line in data: if line['Lane'] in lanes: