def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    An empty 'ScriptCode' subdirectory is also created inside the
    project directory.

    Project directory names are made up of the project name and then
    the experiment type, or just the project name if experiment type
    is not set.

    Arguments:
      project         : populated IlluminaProject object
      top_dir         : parent directory to create analysis subdirectory
                        under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                        each sample by merging multiple FASTQs together
      keep_names      : if True then links to FASTQ files will have the
                        same names as the original files; by default links
                        use the shortest unique name
      dry_run         : if True then report what would be done but don't
                        actually perform any action (no directories, links
                        or merged files are created)

    Returns:
      Name of the project directory.

    """
    # Honour the documented default: os.path.join() cannot handle None
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir, project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            # 0o775 is the same value as the legacy literal 0775
            bcf_utils.mkdir(project_dir, mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir, "ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir, mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn, fastq)
                if keep_names:
                    link_name = fastq
                else:
                    link_name = fastq_names[fastq]
                fastq_ln = os.path.join(project_dir, link_name)
                if os.path.exists(fastq_ln):
                    # Never clobber an existing link/file
                    logging.error("Failed to link to %s: %s already exists",
                                  fastq_file, os.path.basename(fastq_ln))
                else:
                    print("Linking to %s" % fastq)
                    if not dry_run:
                        bcf_utils.mklink(fastq_file, fastq_ln, relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            # Gather replicates to be merged, keyed on the name that
            # the merged file will be given
            replicates = {}
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name, []).append(
                    os.path.join(sample.dirn, fastq))
            # Sort each group once, after all the files are collected
            for fastqs in replicates.values():
                fastqs.sort()
            # Fixed ordering makes the report and merge order deterministic
            names = sorted(replicates)
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in names:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped for a dry run, when the project
            # directory may not even exist)
            if not dry_run:
                for name in names:
                    merged_fastq = os.path.join(project_dir, name + '.fastq')
                    bcf_utils.concatenate_fastq_files(merged_fastq,
                                                      replicates[name])
    # Return directory name
    return project_dir
"if required)") p.add_option_group(deprecated_options) # Process command line options,args = p.parse_args() if len(args) != 1: p.error("input is a single SampleSheet.csv file") if options.miseq: logging.warning("--miseq option no longer necessary; MiSEQ-style sample sheets " "are now converted automatically") # Get input sample sheet file samplesheet = args[0] if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the data as CSV data = IlluminaData.get_casava_sample_sheet(samplesheet) # Remove lanes if options.lanes is not None: lanes = parse_lane_expression(options.lanes) print "Keeping lanes %s, removing the rest" % ','.join([str(x) for x in lanes]) new_data = IlluminaData.CasavaSampleSheet() for line in data: if line['Lane'] in lanes: print "Keeping %s" % line new_data.append(tabdata="%s" % line) data = new_data # Update the SampleID and SampleProject fields for sample_id in options.sample_id: lanes,name = parse_name_expression(sample_id) for line in data: if line['Lane'] in lanes:
"'NY_ChIP-seq'. Use multiple --expt=... to set the types for different " "projects") p.add_option("--keep-names",action="store_true",dest="keep_names",default=False, help="preserve the full names of the source fastq files when creating links") p.add_option("--merge-replicates",action="store_true",dest="merge_replicates",default=False, help="create merged fastq files for each set of replicates detected") # Parse command line options,args = p.parse_args() # Get data directory name if len(args) != 1: p.error("expected one argument (location of Illumina analysis dir)") illumina_analysis_dir = os.path.abspath(args[0]) # Populate Illumina data object illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir, unaligned_dir=options.unaligned_dir) # Assign experiment types for expt in options.expt_type: name,type_ = expt.split(':') illumina_data.get_project(name).expt_type = type_ # Create and populate per-project directory structure for project in illumina_data.projects: create_analysis_dir(project, top_dir=illumina_analysis_dir, merge_replicates=options.merge_replicates, keep_names=options.keep_names, dry_run=options.dry_run)
"if required)") p.add_option_group(deprecated_options) # Process command line options,args = p.parse_args() if len(args) != 1: p.error("input is a single SampleSheet.csv file") if options.miseq: logging.warning("--miseq option no longer necessary; MiSEQ-style sample sheets " "are now converted automatically") # Get input sample sheet file samplesheet = args[0] if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the data as CSV data = IlluminaData.get_casava_sample_sheet(samplesheet) # Update the SampleID and SampleProject fields for sample_id in options.sample_id: lanes,name = parse_name_expression(sample_id) for line in data: if line['Lane'] in lanes: print "Setting SampleID for lane %d: '%s'" % (line['Lane'],name) line['SampleID'] = name # Update the SampleProject field for sample_project in options.sample_project: lanes,name = parse_name_expression(sample_project) for line in data: if line['Lane'] in lanes: print "Setting SampleProject for lane %d: '%s'" % (line['Lane'],name) line['SampleProject'] = name # Fix spaces
p.add_option('--ignore-warnings',action="store_true",dest="ignore_warnings",default=False, help="ignore warnings about spaces and duplicated sampleID/sampleProject " "combinations when writing new samplesheet.csv file") # Process command line options,args = p.parse_args() if len(args) != 1: p.error("input is a single SampleSheet.csv file") # Get input sample sheet file samplesheet = args[0] if not os.path.isfile(samplesheet): logging.error("sample sheet '%s': not found" % samplesheet) sys.exit(1) # Read in the data as CSV if options.miseq: # Input sample sheet is from MiSEQ data = IlluminaData.convert_miseq_samplesheet_to_casava(samplesheet) else: # Standard CASAVA sample sheet data = IlluminaData.CasavaSampleSheet(samplesheet) # Update the SampleID and SampleProject fields for sample_id in options.sample_id: lanes,name = parse_name_expression(sample_id) for line in data: if line['Lane'] in lanes: print "Setting SampleID for lane %d: '%s'" % (line['Lane'],name) line['SampleID'] = name # Update the SampleProject field for sample_project in options.sample_project: lanes,name = parse_name_expression(sample_project) for line in data: if line['Lane'] in lanes:
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set
    of barcode/index sequences.

    Produces a file for each barcode, plus another for 'unbinned'
    reads. Output file names are relative, so the files are written
    to the current working directory. Only barcodes assigned to the
    lane of the input file are used; if there are none then nothing
    is written.

    Each read goes to the file for the first barcode (in the order
    supplied) that matches its index sequence, or to the 'unbinned'
    file if none match.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode definitions to use for demultiplexing;
        each is a dictionary with the keys 'name', 'index', 'lane'
        and 'matcher' (an object with a 'match(sequence,nmismatches)'
        method)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value

    Exits the program with status 1 if any of the output files
    already exists; no output files are created in that case.

    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Weed out barcodes that aren't associated with this lane
    local_barcodes = [barcode for barcode in barcodes
                      if barcode['lane'] == info.lane_number]
    # Check if there's anything to do
    if not local_barcodes:
        return
    # Work out the output file names up front: one per barcode...
    outputs = []
    for barcode in local_barcodes:
        output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
            barcode['name'],
            barcode['index'],
            info.lane_number,
            info.read_number,
            info.set_number)
        print("\t%s\t%s" % (barcode['index'], output_file_name))
        outputs.append((barcode['index'], output_file_name))
    # ...plus one for unbinned reads
    unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
        info.lane_number,
        info.read_number,
        info.set_number)
    outputs.append(('unbinned', unbinned_file_name))
    # Refuse to overwrite existing files. All names are checked before
    # anything is opened so that bailing out doesn't leave behind empty
    # files which would then block the next attempt
    for _, file_name in outputs:
        if os.path.exists(file_name):
            print("\t%s: already exists,exiting" % file_name)
            sys.exit(1)
    # Set up output files, tracking every handle so that all of them
    # (including 'unbinned') get closed even if processing fails
    output_files = {}
    opened_files = []
    nreads = 0
    nmatched = 0
    try:
        for key, file_name in outputs:
            fp = open(file_name, 'w')
            opened_files.append(fp)
            output_files[key] = fp
        # Process reads
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            this_barcode = read.seqid.index_sequence
            # Put in unbinned if no match
            destination = 'unbinned'
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    destination = barcode['index']
                    nmatched += 1
                    break
            output_files[destination].write(str(read) + '\n')
    finally:
        # Close files
        for fp in opened_files:
            fp.close()
    # Report the number of reads assigned to a barcode (not the total
    # number of reads read from the input)
    print("\tMatched %d reads for %s" % (nmatched,
                                         os.path.basename(fastq_file)))
# Assemble the barcode definitions, starting with any given directly
# on the command line in the form NAME:BARCODE:LANE
barcodes = []
for barcode_info in options.barcode_info:
    name, barcode, lane = barcode_info.split(':')
    print("Assigning barcode '%s' in lane %s to %s" % (barcode, lane, name))
    barcodes.append(dict(name=name,
                         index=barcode,
                         matcher=BarcodeMatcher(barcode),
                         lane=int(lane)))
# Add the definitions from the sample sheet (if one was supplied)
if options.sample_sheet is not None:
    print("Reading data from sample sheet %s" % options.sample_sheet)
    sample_sheet = IlluminaData.CasavaSampleSheet(options.sample_sheet)
    for line in sample_sheet:
        name = line['SampleID']
        # Trim trailing 'N's, then trailing '-'s, then trailing 'N's
        # again from the index sequence
        barcode = line['Index'].rstrip('N').rstrip('-').rstrip('N')
        lane = line['Lane']
        print("Assigning barcode '%s' in lane %s to %s" %
              (barcode, lane, name))
        barcodes.append(dict(name=name,
                             index=barcode,
                             matcher=BarcodeMatcher(barcode),
                             lane=int(lane)))
# Nothing can be done without at least one barcode definition
if not barcodes:
    p.error("need at least one --barcode and/or --samplesheet assignment")
else: n_fastqs = len(sample.fastq) if n_fastqs == 1: print "\t%s" % sample.name else: print "\t%s (%d fastqs)" % (sample.name,n_fastqs) # Print fastq names fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: