def check_sample_def(sample_def):
    """Validate every entry of a sample definition list.

    The gem-group check runs once on the whole list. Then, for each entry,
    the FASTQ folder must be an absolute, existing, accessible and non-empty
    directory, every lane must satisfy ``is_int`` and the sample indices
    check must pass.

    Args:
        sample_def: iterable of dict-like entries; the keys "read_path" and
            "lanes" are read from each entry.

    Raises:
        PreflightException: if a FASTQ folder or lane value is invalid.
            Failures of the tk_preflight checks are handed to ``check``;
            presumably it raises as well -- confirm against its definition.
    """
    hostname = socket.gethostname()

    check(tk_preflight.check_gem_groups(sample_def))

    print("Checking FASTQ folder...")
    # A distinct loop name keeps the ``sample_def`` parameter intact (the
    # previous loop rebound it to the current entry).
    for sample_item in sample_def:
        read_path = sample_item.get("read_path")
        # A missing/empty path would otherwise fail with an AttributeError on
        # ``startswith`` instead of a readable preflight error.
        if not read_path:
            raise PreflightException(
                "Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            raise PreflightException(
                "Specified FASTQ folder must be an absolute path: %s"
                % read_path)
        if not os.path.exists(read_path):
            raise PreflightException(
                "On machine: %s, specified FASTQ folder does not exist: %s"
                % (hostname, read_path))
        # os.listdir below needs read permission in addition to search
        # permission, so test both here.
        if not os.access(read_path, os.R_OK | os.X_OK):
            raise PreflightException(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            raise PreflightException(
                "Specified FASTQ folder is empty: " + read_path)

        lanes = sample_item["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    raise PreflightException(
                        "Lanes must be a comma-separated list of numbers.")

        check(tk_preflight.check_sample_indices(sample_item))
def main(args, outs):
    """Run the preflight checks for this stage and exit on the first failure.

    Checks, in order: a read group is given when BAM output is requested,
    the sample id is well formed, every sample_def entry points at a usable
    FASTQ folder with valid library id, lanes and sample indices, and the
    open-file-handle limit is acceptable. Finally the package versions are
    logged.

    Args:
        args: stage arguments; reads output_format, read_group, sample_id
            and sample_def.
        outs: stage outputs (not used here).
    """
    machine = socket.gethostname()
    name_pattern = r"^[\w-]+$"

    bam_without_read_group = (args.output_format == 'bam'
                              and args.read_group is None)
    if bam_without_read_group:
        martian.exit(
            "Please specify a read_group to populate the @RG field of the BAM file")

    if args.sample_id is not None and not re.match(name_pattern, args.sample_id):
        martian.exit(
            "Sample name may only contain letters, numbers, underscores, and dashes: "
            + args.sample_id)

    for entry in args.sample_def:
        fastq_dir = entry["read_path"]

        # Folder checks: absolute, present, searchable, non-empty.
        if not fastq_dir.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" % fastq_dir)
        if not os.path.exists(fastq_dir):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s"
                % (machine, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            martian.exit(
                "On machine: %s, longranger does not have permission to open FASTQ folder: %s"
                % (machine, fastq_dir))
        if not os.listdir(fastq_dir):
            martian.exit("Specified FASTQ folder is empty: " + fastq_dir)

        lib_name = entry.get("library_id")
        if lib_name is not None and not re.match(name_pattern, lib_name):
            martian.exit(
                "Library name may only contain letters, numbers, underscores, and dashes: "
                + lib_name)

        lane_list = entry["lanes"]
        if lane_list is not None:
            for lane_value in lane_list:
                if not tk_preflight.is_int(lane_value):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

        indices_ok, indices_msg = tk_preflight.check_sample_indices(entry)
        if not indices_ok:
            martian.exit(indices_msg)

    # Check open file handles limit
    fh_ok, fh_msg = tk_preflight.check_open_fh()
    if not fh_ok:
        martian.exit(fh_msg)

    versions = tk_preflight.record_package_versions()
    martian.log_info(versions)
def main_bcl_processor(sample_id, sample_def, chemistry_arg, custom_chemistry_def):
    """Build the FASTQ chunks of one sample_def entry in BCL_PROCESSOR mode.

    For every sample index of the entry, the chemistry is inferred, the FASTQ
    files of each read type are collected, and chunks are constructed when
    the collected lists validate. Sample indices without input FASTQs are
    skipped.

    Args:
        sample_id: passed through to construct_chunks.
        sample_def: dict-like entry; 'read_path' and 'lanes' are read.
        chemistry_arg: chemistry argument handed to the inference helper.
        custom_chemistry_def: chemistry used when the inferred name is the
            custom chemistry name.

    Returns:
        The list of chunks gathered over all sample indices.
    """
    si_strings, err = tk_preflight.check_sample_indices(sample_def)
    if si_strings is None:
        martian.exit(err)

    fastq_dir = sample_def['read_path']
    lane_list = sample_def['lanes']
    find_fastqs = tk_fasta.find_input_fastq_files_10x_preprocess

    collected = []
    for si in si_strings:
        # Determine the read-type => fastq filename mapping
        try:
            chem_name = cr_chem.infer_sc3p_chemistry_bcl_processor(
                chemistry_arg, fastq_dir, si, lane_list)
        except cr_chem.NoInputFastqsException:
            continue

        chem = (custom_chemistry_def
                if chem_name == cr_chem.CUSTOM_CHEMISTRY_NAME
                else cr_chem.get_chemistry(chem_name))

        type_map = cr_chem.get_read_type_map(
            chem, tk_constants.BCL_PROCESSOR_FASTQ_MODE)

        # Collect the fastq files for each read type
        files_by_type = {}
        for dest_type in cr_constants.FASTQ_READ_TYPES:
            source_type = type_map[dest_type]
            files_by_type[dest_type] = find_fastqs(
                fastq_dir, source_type, si, lane_list)

        fill_in_missing_reads(files_by_type)
        if not validate_fastq_lists(files_by_type):
            continue

        collected += construct_chunks(files_by_type,
                                      sample_id,
                                      sample_def,
                                      reads_interleaved=True,
                                      chemistry=chem)

    return collected
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    The function validates the sample_def entries, locates the FASTQ files of
    every entry (BCL_PROCESSOR or ILMN_BCL2FASTQ layout), estimates the
    amount of input data, derives the per-chunk subsample rate from the
    requested downsampling, and writes a JSON summary of the downsampling
    decision to ``outs.downsample_info``.

    Args:
        args: stage arguments; reads downsample, sample_def, input_mode and
            sample_id. Entries of sample_def whose gem_group is null are
            updated in place to gem_group 1 when all entries are null.
        outs: stage outputs; chunks, read_groups are assigned and the file
            at downsample_info is written.
    """

    def check_key(n, dict_in, name, tys):
        """Exit unless dict_in has `name` and its exact type is in `tys`."""
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" % (n, name))

        if type(dict_in[name]) not in tys:
            martian.exit("Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
                         % (n, name, str(tys), type(dict_in[name])))

    global_subsample_rate = 1.0
    downsample_gigabases = False
    downsample_reads = False
    if args.downsample is not None:
        # Exactly one downsampling option must be specified. This is input
        # validation, so it must not be an assert (asserts vanish under -O).
        supplied = [option for option in ("gigabases", "subsample_rate", "target_reads")
                    if args.downsample.get(option) is not None]
        if len(supplied) != 1:
            martian.exit("Exactly one of gigabases, subsample_rate or target_reads must be "
                         "specified for downsampling, got: %s" % str(supplied))

        if args.downsample.get('subsample_rate') is not None:
            global_subsample_rate = args.downsample['subsample_rate']
            if not (global_subsample_rate <= 1.0):
                martian.exit("Downsample subsample_rate must not exceed 1.0, got: %s"
                             % str(global_subsample_rate))
        elif args.downsample.get('target_reads') is not None:
            downsample_reads = True
        else:
            downsample_gigabases = True

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all(x is None for x in gem_groups)
    all_int = all(type(x) is int for x in gem_groups)

    if not (all_null or all_int):
        martian.exit("Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer")

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0
    total_seq_reads = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # Check if subsample_rate exists in sample_def
        if 'subsample_rate' in read_chunk:
            subsample_rate = global_subsample_rate * read_chunk['subsample_rate']
        else:
            subsample_rate = global_subsample_rate

        # Optional in-read barcode description, copied onto every chunk.
        bc_in_read = {}
        if read_chunk.get('bc_in_read') is not None:
            bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
            bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # Which index read carries the sample index and which the barcode.
        # The choice depends only on the entry, so it is made once here.
        # TODO confirm that this works with cellranger
        if read_chunk.get('barcode_read') == 'I1':
            si_read, bc_read = ("I2", "I1")
        else:
            si_read, bc_read = ("I1", "I2")

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name
        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r,
                        'read2': None,
                        'reads_interleaved': True,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': subsample_rate,
                        'read_group': rg_string,
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            sample_names = read_chunk['sample_names']

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit("Read1, Read2, and Barcode files are mismatched. Exiting pipline")

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r1,
                        'read2': r2,
                        'reads_interleaved': False,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': subsample_rate,
                        'read_group': rg_string,
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" % (float(total_seq_bases)/1e9))
    martian.log_info(" Predict %d total reads" % total_seq_reads)

    if len(chunks) == 0:
        martian.exit("No input FASTQs were found for the requested parameters.")

    if downsample_gigabases and args.downsample.get('gigabases') is not None:
        # Calculate global downsample rate. An estimate of zero input bases
        # (e.g. empty FASTQs) means there is nothing to downsample; keep the
        # rate at 1.0 instead of dividing by zero.
        requested_bases = float(args.downsample['gigabases']) * 1e9
        if total_seq_bases > 0:
            global_subsample_rate = min(1.0, requested_bases / float(total_seq_bases))
        else:
            global_subsample_rate = 1.0
        martian.log_info("Input data downsampling: Requested: %.2f GB, Estimated Input: %.2f GB, Downsample Rate: %.3f"
                         % (float(args.downsample['gigabases']), float(total_seq_bases)/1e9, global_subsample_rate))
        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate
    elif downsample_reads:
        # Same zero-input guard as above, for the read-count estimate.
        if total_seq_reads > 0:
            global_subsample_rate = min(1.0, float(args.downsample['target_reads'])/float(total_seq_reads))
        else:
            global_subsample_rate = 1.0
        martian.log_info("Input data downsampling: Requested: %.2f M reads, Estimated Input: %.2f M reads, Downsample Rate: %.3f"
                         % (float(args.downsample['target_reads'])/1e6, float(total_seq_reads)/1e6, global_subsample_rate))
        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    # log info about input vs requested GB
    # first, set defaults
    available_gb = float(total_seq_bases)/1e9
    requested_gb = None
    available_reads = total_seq_reads
    requested_reads = None
    requested_rate = None
    post_downsample_gb = requested_gb
    downsample_succeeded = True

    if args.downsample is not None and args.downsample.get('gigabases') is not None:
        requested_gb = float(args.downsample['gigabases'])
        post_downsample_gb = min(available_gb, requested_gb)
        if available_gb < requested_gb:
            martian.log_info("Downsample requested more GB than was available; will not downsample.")
            downsample_succeeded = False

    elif args.downsample is not None and args.downsample.get('subsample_rate') is not None:
        requested_rate = float(args.downsample['subsample_rate'])
        post_downsample_gb = available_gb * requested_rate

    elif args.downsample is not None and args.downsample.get('target_reads') is not None:
        requested_reads = float(args.downsample['target_reads'])

    downsample_info = {
        'available_gb': available_gb,
        'requested_gb': requested_gb,
        'available_reads': available_reads,
        'requested_reads': requested_reads,
        'requested_rate': requested_rate,
        'post_downsample_gb': post_downsample_gb,
        'downsample_succeeded': downsample_succeeded,
    }
    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)
def main(args, outs):
    """Run the preflight checks for this stage and exit on the first failure.

    Checks, in order: sample id, FASTQ inputs of every sample_def entry,
    reference data, sample sex against the reference's sex chromosome
    listing, open-file-handle limit, targets and target blacklist BED files,
    restrict_locus format, pre-called VCF, vc_mode (including the GATK jar),
    VC/SV ground truth files and sv_min_qv. Finally the package versions are
    logged.

    Args:
        args: stage arguments; reads sample_id, sample_def, reference_path,
            sex, check_executables, targets, target_blacklist,
            restrict_locus, vc_precalled, vc_mode, vc_ground_truth,
            sv_min_qv and sv_ground_truth.
        outs: stage outputs (not used here).
    """
    hostname = socket.gethostname()
    name_pattern = r"^[\w-]+$"

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match(name_pattern, args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # FASTQ input
    for sample_def in args.sample_def:
        #if not tk_preflight.check_is_chromium(sample_def):
        #    martian.exit("This version of Longranger does not support GemCode data. Please use Longranger 1.2 instead.")
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        # os.listdir below needs read permission as well as search
        # permission, so test both.
        if not os.access(read_path, os.R_OK | os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(name_pattern, library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    # Reference
    MAX_CONTIGS = 1000
    ok, msg = tk_preflight.check_refdata(args.reference_path, MAX_CONTIGS)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # Sex (given reference)
    if args.sex is not None:
        if args.sex.lower() not in ["m", "male", "f", "female"]:
            martian.exit("Sex of sample must be 'm', 'male', 'f', or 'female'.")
    else:
        # No sex given: the reference must list the sex chromosomes.
        # Load the male chromosomes once and reuse the result.
        male_chrom = tenkit.reference.load_male_chromosomes(args.reference_path)
        if male_chrom is None:
            martian.exit("Must specify sex of sample, or use a reference package that includes a sex_chromosomes.tsv file.\nFor more details, see http://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references")

        ref = tenkit.reference.open_reference(args.reference_path)
        for m in male_chrom:
            if m not in ref:
                martian.exit("Reference issue in sex_chromosomes.tsv. Male-specific chromosome '%s' does not exist in reference" % m)

        auto_chrom = tenkit.reference.load_autosomal_chromosomes(args.reference_path)
        if auto_chrom is None:
            martian.exit("No autosomal chromosome listed in sex_chromosomes.tsv. Please list an autosomal chromosome to use as a reference for sex determination")
        for a in auto_chrom:
            if a not in ref:
                martian.exit("Reference issue in sex_chromosomes.tsv. Autosomal chromosome '%s' does not exist in reference" % a)

    # Open file handles limit - per LONGRANGER-1758, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    # Targets
    if args.targets is not None:
        tk_preflight.check_file("targets", args.targets, hostname)
        tk_preflight.check_bed(args.targets, args.reference_path)
        if args.target_blacklist is None:
            print("\nWARNING: You selected targeted mode but did not provide a --cnvfilter.\nPlease note this may result in a high number of false positive CNV calls.\nFor more details, see http://support.10xgenomics.com/genome-exome/software\n")

    # Target blacklist
    if args.target_blacklist is not None:
        tk_preflight.check_file("cnvfilter", args.target_blacklist, hostname)
        tk_preflight.check_bed(args.target_blacklist, args.reference_path)

    # Restrict locus
    if tenkit.reference.is_tenx(args.reference_path):
        if args.restrict_locus is not None:
            if not re.match(r"^chr[A-Za-z0-9]{1,2}:[0-9]+\.\.[0-9]+$", args.restrict_locus):
                martian.exit("restrict_locus must be of the form 'chrXX:start..end'.")

    # Pre-called
    if args.vc_precalled is not None:
        tk_preflight.check_file("pre-called VCF", args.vc_precalled, hostname)
        check_vcf(args.vc_precalled, args)

    # VC mode
    # A null vc_mode is reported like any other malformed value instead of
    # raising a TypeError inside re.match.
    if args.vc_mode is None or not re.match(r"^(disable|freebayes|gatk:/.*\.jar|precalled:/.*\.vcf)$", args.vc_mode):
        martian.exit("vc_mode must be of the form 'freebayes', 'gatk:/path/to/gatk_jar_file.jar', 'disable'.")
    if args.vc_precalled is None and args.vc_mode == "disable":
        martian.exit("Because you have not provided a pre-called VCF file, variant calling cannot be disabled.")

    # Split only on the first colon so that a path containing ':' is kept
    # whole.
    vc_args = args.vc_mode.split(":", 1)
    vc_mode = vc_args[0]
    if vc_mode == "precalled":
        if args.vc_precalled is not None:
            martian.exit("Please specify a pre-called VCF file using only one method.")
        precalled_vars_path = vc_args[1]
        tk_preflight.check_file("pre-called VCF", precalled_vars_path, hostname)
        check_vcf(precalled_vars_path, args)
    elif vc_mode == "gatk":
        jar_path = vc_args[1]
        if not jar_path.startswith('/'):
            martian.exit("Specified GATK jar file must be an absolute path: %s" % jar_path)
        if not os.path.exists(jar_path):
            martian.exit("On machine: %s, specified GATK jar file does not exist: %s" % (hostname, jar_path))
        if os.path.isdir(jar_path):
            martian.exit("Please specify a GATK jar file, not a folder.")
        if args.check_executables:
            check_gatk(jar_path, hostname)
            check_gatk_ref(args.reference_path)

    # VC ground truth
    if args.vc_ground_truth is not None:
        tk_preflight.check_file("VCF ground truth", args.vc_ground_truth, hostname)
        check_vcf(args.vc_ground_truth, args)

    # SV min QV
    if args.sv_min_qv is not None and args.sv_min_qv < 0:
        martian.exit("sv_min_qv must be a positive integer.")

    # SV ground truth
    if args.sv_ground_truth is not None:
        tk_preflight.check_file("SV ground truth", args.sv_ground_truth, hostname)

    martian.log_info(tk_preflight.record_package_versions())
def join(args, outs, chunk_defs, chunk_outs):
    """Run the preflight checks for this stage and exit on the first failure.

    Checks, in order: sample id, force_cells, the downsample argument,
    fastq_mode, the FASTQ inputs of every sample_def entry (including that
    input files can actually be found), trim_def, factorization, the
    reference, and -- when args.check_executables is set -- the file handle
    limit. Finally the package versions are logged.

    Args:
        args: stage arguments; reads sample_id, force_cells, downsample,
            fastq_mode, sample_def, trim_def, factorization, reference_path
            and check_executables.
        outs: stage outputs (not used here).
        chunk_defs: chunk definitions (not used here).
        chunk_outs: chunk outputs (not used here).
    """
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells)

    # downsample
    if args.downsample is not None:
        if len(args.downsample) == 0:
            martian.exit("downsample must be a non-empty dictionary.")
        keys = list(args.downsample)
        if len(keys) > 1:
            martian.exit("Please supply either subsample_rate or gigabases but not both.")
        key = keys[0]
        if key not in ['subsample_rate', 'gigabases']:
            martian.exit("Please supply either subsample_rate or gigabases as the downsample argument. '%s' is invalid" % key)
        value = args.downsample[key]
        try:
            # Compare the converted number, not the raw value: a numeric
            # string would otherwise be compared as a string. The negated
            # ">=" also rejects NaN.
            bad_value = not (float(value) >= 1e-12)
        except (TypeError, ValueError):
            # TypeError covers null and other non-numeric types.
            bad_value = True
        if bad_value:
            martian.exit("Command line argument for downsampling must be a positive number")

    # FASTQ mode
    if args.fastq_mode is not None:
        if args.fastq_mode not in ['ILMN_BCL2FASTQ', 'BCL_PROCESSOR']:
            martian.exit("Unsupported fastq_mode. Options are ILMN_BCL2FASTQ and BCL_PROCESSOR, provided: {}".format(args.fastq_mode))

    # FASTQ input (sample_def)
    hostname = socket.gethostname()
    name_pattern = r"^[\w-]+$"
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def.get("read_path")
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs in each entry of 'sample_def' argument")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        # os.listdir below needs read permission as well as search
        # permission, so test both.
        if not os.access(read_path, os.R_OK | os.X_OK):
            martian.exit("On machine: %s, cellranger-atac does not have permission to open FASTQ folder: %s" % (
                hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(name_pattern, library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, BCL2FASTQ_SEQNAMES["R1"], sample_name, lanes)
                r2 = find_func(read_path, BCL2FASTQ_SEQNAMES["R2"], sample_name, lanes)
                # Every read-1 file needs a matching read-2 file.
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # trim_def['R1'] and ['R2'] must be identical.
    if args.trim_def is not None:
        if len(args.trim_def) == 0:
            martian.exit("trim_def must be a non-empty dictionary.")
        if "R1" not in args.trim_def or "R2" not in args.trim_def:
            martian.exit("trim_def must have R1, R2 fields.")
        if args.trim_def["R1"] != args.trim_def["R2"]:
            martian.exit("trim_def['R1'] and trim_def['R2'] must be identical.")

    # factorization.
    check_factorization(args.factorization)

    #
    # Reference
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # usability and format check
    check_reference_format(args.reference_path)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """Run the preflight checks for this stage and exit on the first failure.

    Checks the barcode whitelist and per-sample barcode settings, the FASTQ
    folder of every sample_def entry, the open-file-handle limit (only when
    args.check_executables is set), then gathers the input FASTQ files for
    the configured input_mode, validates them, logs and reports the
    estimated mean read length, and finally range-checks the downsampling
    parameters.

    Args:
        args: stage arguments; reads barcode_whitelist, sample_def,
            check_executables, input_mode and downsample.
        outs: stage outputs (not used here).
    """
    machine = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for entry in args.sample_def:
        if not (entry.get("bc_length", 0) == 16
                and entry.get("bc_in_read", 3) in [1, 2]):
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print("Checking FASTQ folder...")
    name_pattern = r"^[\w-]+$"
    for entry in args.sample_def:
        fastq_dir = entry["read_path"]
        if not fastq_dir:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not fastq_dir.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" % fastq_dir)
        if not os.path.exists(fastq_dir):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s"
                % (machine, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (machine, fastq_dir))
        if not os.listdir(fastq_dir):
            martian.exit("Specified FASTQ folder is empty: " + fastq_dir)

        lib_name = entry.get("library_id")
        if lib_name is not None and not re.match(name_pattern, lib_name):
            martian.exit(
                "Library name may only contain letters, numbers, underscores, and dashes: "
                + lib_name)

        lane_list = entry["lanes"]
        if lane_list is not None:
            for lane_value in lane_list:
                if not is_int(lane_value):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit - per SUPERNOVA-152, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        fh_ok, fh_msg = tk_preflight.check_open_fh()
        if not fh_ok:
            martian.exit(fh_msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for position, item in enumerate(args.sample_def):
            check_key(position, item, "sample_indices", [list, type(None)])
            check_key(position, item, "read_path", [str, unicode])
            check_key(position, item, "lanes", [list, type(None)])

        locate = tk_fasta.find_input_fastq_files_10x_preprocess
        for item in args.sample_def:
            si_strings, si_msg = tk_preflight.check_sample_indices(item)
            if si_strings is None:
                martian.exit(si_msg)
            item_path = item['read_path']
            item_lanes = item['lanes']
            for si in si_strings:
                fastq_files.extend(locate(item_path, "RA", si, item_lanes))
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for position, item in enumerate(args.sample_def):
            check_key(position, item, "read_path", [str, unicode])
            check_key(position, item, "lanes", [list, type(None)])
            check_key(position, item, "sample_names", [list, type(None)])

        locate = tk_fasta.find_input_fastq_files_bcl2fastq_demult
        for item in args.sample_def:
            names = item['sample_names']
            item_path = item['read_path']
            item_lanes = item['lanes']
            for name in names:
                fastq_files.extend(locate(item_path, "R1", name, item_lanes))
                fastq_files.extend(locate(item_path, "R3", name, item_lanes))
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if not fastq_files:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices.")

    ## make sure they are okay first
    check_fastqs(fastq_files)

    # Accumulate the per-file estimates; the mean read length is the
    # unweighted average over files.
    read_total = 0.0
    length_sum = 0.0
    for fastq_path in fastq_files:
        file_reads, file_avg_len = estimate_read_count_and_length(
            fastq_path, num_reads=1000)
        read_total += file_reads
        length_sum += file_avg_len
    mean_length = length_sum / len(fastq_files)
    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f"
        % (mean_length, read_total))

    alert_logger = alerts.AlertLogger(stage="preflight")
    alert_logger.issue("mean_read_length", mean_length)

    # verify type and range for downsampling parameters
    # Note that non-numerical values for bc_subsample_rate and target_reads in mro trickle down as 'None'
    if args.downsample is not None:
        bc_rate = args.downsample.get("bc_subsample_rate", None)
        if bc_rate is not None:
            if not isinstance(bc_rate, (float, int)):
                martian.exit(
                    "Specified barcode fraction: %s is not a fraction. Please specify a valid float between 0 and 1."
                    % str(bc_rate))
            if bc_rate <= 0 or bc_rate > 1:
                martian.exit(
                    "Specified barcode fraction: %s is not between 0 and 1. Please specify a valid float between 0 and 1."
                    % str(bc_rate))
            if abs(bc_rate) < 1e-5:
                martian.exit(
                    "Specified barcode fraction: %s is too close to 0 and thus impractical."
                    % str(bc_rate))

        max_reads = args.downsample.get("target_reads", None)
        if max_reads is not None:
            if not isinstance(max_reads, (int, float)):
                martian.exit(
                    "Specified maxreads: %s is not a number. Please specify an integer larger than one for maxreads"
                    % str(max_reads))
            if max_reads < 1:
                martian.exit(
                    "Specified maxreads: %s is less than one. Please specify an integer larger than one for maxreads"
                    % str(max_reads))
def main(args, outs):
    """Run the preflight checks for this stage and exit on the first failure.

    Checks, in order: sample id, force_cells range, ploidy bounds in
    cnv_params, the downsample argument, the FASTQ inputs of every
    sample_def entry (including that input files can actually be found), the
    reference (structure, contig definitions, number and length of primary
    contigs) and -- when args.check_executables is set -- the
    open-file-handle limit. Finally the package versions are logged.

    Args:
        args: stage arguments; reads sample_id, force_cells, cnv_params,
            downsample, sample_def, fastq_mode, reference_path and
            check_executables.
        outs: stage outputs (not used here).
    """
    hostname = socket.gethostname()
    name_pattern = r"^[\w-]+$"

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match(name_pattern, args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
                                         args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer"
                     " <= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be a "
                         "positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be a "
                         "positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None
                and max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "
                         "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample) > 0:
        keys = list(args.downsample)
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but not "
                         "both.")
        key = keys[0]
        value = args.downsample[key]
        # Maps the MRO key to the name of the command line option.
        param_map = {"target_reads": "maxreads", "gigabases": "downsample"}
        try:
            # Compare the converted number, not the raw value: a numeric
            # string would otherwise be compared as a string. The negated
            # ">=" also rejects NaN.
            bad_value = not (float(value) >= 1e-12)
        except (TypeError, ValueError):
            # TypeError covers null and other non-numeric types.
            bad_value = True
        if bad_value:
            # Fall back to the raw key so an unexpected key is reported
            # instead of raising a KeyError.
            cs_key = param_map.get(key, key)
            martian.exit("Command line argument %s must be a positive number" %
                         cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        # os.listdir below needs read permission as well as search
        # permission, so test both.
        if not os.access(read_path, os.R_OK | os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(name_pattern, library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                # Every read-1 file needs a matching read-2 file.
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    contig_defs_json_path = os.path.join(args.reference_path, "fasta",
                                         "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta",
                              "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
                                                  faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    # Any failure while opening the reference is turned into a stage exit
    # carrying the original error text.
    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s" % str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks
    chrom_length_dict = ref.get_contig_lengths()
    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "
                     "Every primary contig must be at least %d bases "
                     "in length." % (",".join(offending_contigs_exit),
                                     contig_length_exit,
                                     contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "
                      "Every primary contig is recommended to be at least %d bases "
                      "in length." % (",".join(offending_contigs_warn),
                                      contig_length_warn,
                                      contig_length_warn))

    # Open file handles limit
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """Run the cellranger preflight checks over the stage arguments.

    The checks run in this order: gem groups, every sample_def entry (FASTQ
    folder, lanes, sample indices), presence and validity of the reference(s),
    chemistry settings, the open file handle limit (only when
    args.check_executables is set) and mutually exclusive optional arguments.
    Every failed check is reported through martian.exit. Package versions are
    recorded at the end.

    Args:
        args: stage arguments; reads sample_def, reference_path,
            vdj_reference_path, chemistry_name, custom_chemistry_def,
            check_executables, recovered_cells and force_cells.
        outs: unused.
    """

    def exit_unless_ok(result):
        # Each checker called through this helper yields an (ok, message) pair.
        passed, message = result
        if not passed:
            martian.exit(message)

    machine = socket.gethostname()

    print("Checking sample info...")
    exit_unless_ok(tk_preflight.check_gem_groups(args.sample_def))

    print("Checking FASTQ folder...")
    for entry in args.sample_def:
        fastq_dir = entry["read_path"]

        if not fastq_dir.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                fastq_dir)
        if not os.path.exists(fastq_dir):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (machine, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            martian.exit(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (machine, fastq_dir))
        if not os.listdir(fastq_dir):
            martian.exit("Specified FASTQ folder is empty: " + fastq_dir)

        # A null lane list means "no lane restriction"; nothing to validate.
        lane_list = entry["lanes"]
        for lane in (lane_list if lane_list is not None else ()):
            if not is_int(lane):
                martian.exit(
                    "Lanes must be a comma-separated list of numbers.")

        exit_unless_ok(tk_preflight.check_sample_indices(entry))

    if not (args.reference_path is not None
            or args.vdj_reference_path is not None):
        martian.exit(
            "Must specify either reference_path or vdj_reference_path.")

    print("Checking transcriptome...")
    if args.reference_path is not None:
        exit_unless_ok(cr_preflight.check_refdata(args.reference_path))
    if args.vdj_reference_path is not None:
        exit_unless_ok(vdj_preflight.check_refdata(args.vdj_reference_path))

    print("Checking chemistry...")
    exit_unless_ok(cr_chem.check_chemistry_defs())
    exit_unless_ok(cr_chem.check_chemistry_arg(args.chemistry_name))
    if args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME:
        exit_unless_ok(cr_chem.check_chemistry_def(args.custom_chemistry_def))

    # Open file handles limit - per CELLRANGER-824, only check this on the
    # execution machine. args.check_executables tells us whether we are on it.
    if args.check_executables:
        print("Checking system environment...")
        exit_unless_ok(tk_preflight.check_open_fh())

    print("Checking optional arguments...")
    both_cell_counts_given = (args.recovered_cells is not None
                              and args.force_cells is not None)
    if both_cell_counts_given:
        martian.exit(
            "Cannot specify both --force-cells and --expect-cells (or --cells) in the same run."
        )

    cr_preflight.record_package_versions()
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    For every entry of args.sample_def the matching FASTQ files are located
    (by sample index in BCL_PROCESSOR mode, by sample name in ILMN_BCL2FASTQ
    mode), their size is estimated with fastq_data_estimate, and one chunk
    dict is produced per interleaved file or per R1/R2 pair.

    Args:
        args: stage arguments; reads sample_def, sample_id, input_mode and
            downsample (None, or a mapping that may carry 'subsample_rate'
            and/or 'gigabases').
        outs: stage outputs; chunks and read_groups are assigned and the
            downsampling summary is written as JSON to the path held in
            outs.downsample_info.
    """

    def rate_or_one(mapping, key):
        # A key that is present but null means the same as a missing key:
        # no subsampling. Multiplying by None would raise a TypeError.
        rate = mapping.get(key)
        return 1.0 if rate is None else rate

    def read_group_for(library_id, gem_group, flowcell, lane):
        # Without a sample id every chunk gets the same placeholder group.
        if args.sample_id is None:
            return 'None:None:None:None:None'
        fields = [args.sample_id, library_id, gem_group, flowcell, lane]
        return ':'.join(str(item) for item in fields)

    validate_input(args)

    if args.downsample is not None:
        global_subsample_rate = rate_or_one(args.downsample, 'subsample_rate')
    else:
        global_subsample_rate = 1.0

    # Predicted input bases
    total_seq_bases = 0

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        subsample_rate = global_subsample_rate * rate_or_one(
            read_chunk, 'subsample_rate')

        # Only forwarded to the chunks when the barcode lives inside a read.
        bc_in_read = {}
        if read_chunk.get('bc_in_read', None) is not None:
            bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
            bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded', False)
        if unbarcoded:
            martian.log_info('Flagged as unbarcoded: processing as bulk data')

        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and
        # labels FASTQs by sample index; whereas ILMN_BCL2FASTQ uses R1/R2 and
        # labels by sample name
        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            # First pass: estimate how much sequence this entry contributes.
            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                for read in find_func(path, "RA", sample_index, lanes):
                    _, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases

            martian.log_info("Input data: Predict %f GB from %s" %
                             (sample_seq_bases / 1e9, path))
            total_seq_bases += sample_seq_bases

            # Second pass: build one chunk per interleaved FASTQ.
            for sample_index in sample_index_strings:
                read_paths = find_func(path, "RA", sample_index, lanes)

                # cell barcodes and sample indices are embedded in the index reads
                si_read, bc_read = ("I1", "I2")

                # allow empty sample index case if all reads in lane are same sample
                sis = find_func(path, si_read, sample_index, lanes)
                if sis is None or len(sis) == 0:
                    sis = [None] * len(read_paths)

                barcodes = find_func(path, bc_read, sample_index, lanes)
                if unbarcoded or len(barcodes) == 0:
                    barcodes = [None] * len(read_paths)

                # calculate chunks
                for r, b, si in zip(read_paths, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = read_group_for(library_id, gem_group,
                                               flowcell, lane)
                    new_chunk = {
                        'read1': r,
                        'read2': None,
                        'reads_interleaved': True,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': subsample_rate,
                        'read_group': rg_string,
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            r1_read = BCL2FASTQ_SEQNAMES["read1"]
            r2_read = BCL2FASTQ_SEQNAMES["read2"]
            si_read = BCL2FASTQ_SEQNAMES["sample_index"]
            bc_read = BCL2FASTQ_SEQNAMES["barcode"]
            sample_names = read_chunk["sample_names"]

            # First pass: estimate how much sequence this entry contributes.
            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                for seq_name in (r1_read, r2_read):
                    for read_fn in find_func(path, seq_name, sample_name,
                                             lanes):
                        _, predicted_seq_bases = fastq_data_estimate(read_fn)
                        sample_seq_bases += predicted_seq_bases

            martian.log_info("Input data: Predict %f GB from %s" %
                             (sample_seq_bases / 1e9, path))
            total_seq_bases += sample_seq_bases

            # Second pass: build one chunk per R1/R2 pair.
            for sample_name in sample_names:
                r1_reads = find_func(path, r1_read, sample_name, lanes)
                r2_reads = find_func(path, r2_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                sis = find_func(path, si_read, sample_name, lanes)
                if sis is None or len(sis) == 0:
                    sis = [None] * len(r1_reads)

                barcodes = find_func(path, bc_read, sample_name, lanes)
                if unbarcoded or len(barcodes) == 0:
                    martian.log_info(
                        'No barcodes available: ignoring sc processing')
                    barcodes = [None] * len(r1_reads)

                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit(
                        "Read1, Read2, and Barcode files are mismatched. Exiting pipeline"
                    )

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = read_group_for(library_id, gem_group,
                                               flowcell, lane)
                    new_chunk = {
                        'read1': r1,
                        'read2': r2,
                        'reads_interleaved': False,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': subsample_rate,
                        'read_group': rg_string,
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" %
                     (total_seq_bases / 1e9))

    if len(chunks) == 0:
        martian.exit(
            "No input FASTQs were found for the requested parameters.")

    # A gigabase target only applies when no explicit rate was requested; the
    # rate is then derived from the estimated input size.
    if args.downsample is not None and args.downsample.get('subsample_rate', None) is None \
            and args.downsample.get('gigabases', None) is not None:
        global_subsample_rate = min(
            1.0, args.downsample['gigabases'] * 1e9 / total_seq_bases)
        martian.log_info(
            "Input data downsampling: Requested: %.2f GB, Estimated Input: %.2f GB, Downsample Rate: %.3f"
            % (args.downsample['gigabases'], total_seq_bases / 1e9,
               global_subsample_rate))
        for chunk in chunks:
            chunk['subsample_rate'] *= global_subsample_rate

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    downsample_info = get_downsample_info(args.downsample, total_seq_bases)
    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)
def check_sample_def(sample_def, feature_ref=None, pipeline=None):
    """Validate every entry of the sample_def list.

    Raises PreflightException on the first problem found: a duplicated entry,
    an unusable FASTQ folder, non-integer lanes, or a library_type that does
    not suit the given pipeline. Gem groups and sample indices are validated
    by passing the tk_preflight results to check().

    Args:
        sample_def: list of sample definition dicts.
        feature_ref: optional feature reference; when given (count pipeline
            only), a non-gene-expression library_type must match the
            feature_type of at least one entry in feature_ref.feature_defs.
        pipeline: pipeline identifier compared against
            cr_constants.PIPELINE_COUNT and cr_constants.PIPELINE_VDJ.
    """
    machine = socket.gethostname()

    check(tk_preflight.check_gem_groups(sample_def))

    # Check uniqueness of sample_def entries: after sorting, duplicates are
    # neighbours, so comparing adjacent pairs is enough.
    identities = [(sd.get("read_path"), sd.get("sample_names"),
                   sd.get("sample_indices"), sd.get("lanes"))
                  for sd in sample_def]
    identities.sort()
    for current, following in zip(identities, identities[1:]):
        if current == following:
            parts = [
                "Duplicated entry in the input FASTQ data. Please use a unique combination of fastq path and sample name.",
                "\nPath: %s" % current[0],
                "\nNote in demux mode, a unique combination fastq path, sample indices, and lanes is required.",
            ]
            raise PreflightException("".join(parts))

    print("Checking FASTQ folder...")
    for entry in sample_def:
        fastq_dir = entry["read_path"]

        if not fastq_dir.strip():
            raise PreflightException(
                "Empty fastq path specifed. Please specify an absolute path.")
        if not fastq_dir.startswith('/'):
            raise PreflightException(
                "Specified FASTQ folder must be an absolute path: %s" %
                fastq_dir)
        if not os.path.exists(fastq_dir):
            raise PreflightException(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (machine, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            raise PreflightException(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (machine, fastq_dir))
        if not os.listdir(fastq_dir):
            raise PreflightException("Specified FASTQ folder is empty: " +
                                     fastq_dir)

        # A null lane list carries no lane numbers to validate.
        lane_list = entry["lanes"]
        for lane in (lane_list if lane_list is not None else ()):
            if not is_int(lane):
                raise PreflightException(
                    "Lanes must be a comma-separated list of numbers.")

        check(tk_preflight.check_sample_indices(entry))

        library_type = entry.get("library_type", None)

        if pipeline == cr_constants.PIPELINE_COUNT:
            custom_prefix = cellranger.rna.library.CUSTOM_LIBRARY_TYPE_PREFIX
            options = ", ".join(("'%s'" % x for x in ALLOWED_LIBRARY_TYPES))
            allowed_hint = (
                "\nThe 'library_type' field in the libraries csv"
                " must be one of %s, or start with '%s'") % (options,
                                                             custom_prefix)

            # Check for empty library_type
            if library_type == '':
                raise PreflightException(
                    "library_type field may not be an empty string." +
                    allowed_hint)

            # Check for a valid library_type
            if library_type is not None \
                    and library_type not in ALLOWED_LIBRARY_TYPES \
                    and not library_type.startswith(custom_prefix):
                raise PreflightException(
                    ("Unknown library_type: '%s'." % library_type) +
                    allowed_hint)

            # Check that the library_type exists in the feature_ref
            needs_feature_lookup = (
                feature_ref is not None and library_type is not None
                and library_type != cr_constants.GENE_EXPRESSION_LIBRARY_TYPE)
            if needs_feature_lookup:
                declared = any(x.feature_type == library_type
                               for x in feature_ref.feature_defs)
                if not declared:
                    msg = "You declared a library with library_type = '%s', but there are no features declared with that feature_type in the feature reference." % library_type
                    msg += "\nCheck that the 'library_type' field in the libraries csv matches at least 1 entry in the 'feature_type' field in the feature reference csv"
                    raise PreflightException(msg)

        elif pipeline == cr_constants.PIPELINE_VDJ:
            # library type can be missing, or VDJ
            if library_type is not None \
                    and library_type != lib_constants.VDJ_LIBRARY_TYPE:
                msg = "You declared a library with library_type = '%s'. For the vdj pipeline, the library_type field in sample_def must be missing or '%s'" % (
                    library_type, lib_constants.VDJ_LIBRARY_TYPE)
                raise PreflightException(msg)
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    The sample_def entries are validated, their FASTQ files are located and
    sized, one chunk dict is produced per interleaved file or R1/R2 pair, and
    a read-level downsampling rate is derived from args.downsample.

    Args:
        args: stage arguments; reads sample_def, input_mode, sample_id,
            downsample (None or a mapping), downsample_overage and
            barcode_whitelist. gem_group values that are all null are
            rewritten to 1 in place.
        outs: stage outputs; assigns chunks, read_groups,
            requested_read_pairs and barcode_whitelist_path, and writes the
            downsampling summary as JSON to the path in outs.downsample_info.
    """

    def check_key(n, dict_in, name, tys):
        # Exact type match on purpose: isinstance would let bool pass as int.
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" %
                         (n, name))

        if not (type(dict_in[name]) in tys):
            martian.exit(
                "Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
                % (n, name, str(tys), type(dict_in[name])))

    def index_read_names(read_chunk):
        # Returns (sample index read, barcode read); swapped when the entry
        # declares that the barcode is on I1.
        # TODO confirm that this works with cellranger
        if read_chunk.get('barcode_read') == 'I1':
            return ("I2", "I1")
        return ("I1", "I2")

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all(x is None for x in gem_groups)
    all_int = all(type(x) is int for x in gem_groups)

    if not (all_null or all_int):
        martian.exit(
            "Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer"
        )

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    # Fallback read lengths for ILMN_BCL2FASTQ, mirroring the default the
    # BCL_PROCESSOR branch uses. Without them, an entry with no R1/R2 files
    # raised a NameError before the "No input FASTQs" message could be shown.
    read_length1 = 100
    read_length2 = 100

    for (idx, read_chunk) in enumerate(args.sample_def):
        # Each sample_def entry can have a separate pre-applied downsampling rate
        # We adjust the estimated data in that chunk to account for this
        # subsampling
        chunk_subsample_rate = read_chunk.get('subsample_rate', 1.0)

        # Only forwarded to the chunks when the barcode lives inside a read.
        bc_in_read = {}
        if read_chunk.get('bc_in_read') is not None:
            bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
            bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and
        # labels FASTQs by sample index; whereas ILMN_BCL2FASTQ uses R1/R2 and
        # labels by sample name
        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            read_length = 100  # Should be overwritten below
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index,
                                  lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = 2 * read_length

            martian.log_info(
                "Input data: Predict %f GB from %s. (%d bp per read pair)" %
                (float(sample_seq_bases) / 1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index,
                                  lanes)
                si_read, bc_read = index_read_names(read_chunk)
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell,
                                                      lane)
                    new_chunk = {
                        'read1': r,
                        'read2': None,
                        'reads_interleaved': True,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string,
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            sample_names = read_chunk['sample_names']
            if sample_names is None:
                # check_key accepts a null value, but without names there is
                # nothing to look up and iterating None would raise.
                martian.exit(
                    "Entry %d in sample_def missing required field: sample_names"
                    % idx)

            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length1 = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length2 = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = read_length1 + read_length2

            martian.log_info(
                "Input data: Predict %f GB from %s. (%d bp per read pair)" %
                (float(sample_seq_bases) / 1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                si_read, bc_read = index_read_names(read_chunk)
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit(
                        "Read1, Read2, and Barcode files are mismatched. Exiting pipeline"
                    )

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell,
                                                      lane)
                    new_chunk = {
                        'read1': r1,
                        'read2': r2,
                        'reads_interleaved': False,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string,
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" %
                     (float(total_seq_bases) / 1e9))

    if len(chunks) == 0:
        martian.exit(
            "No input FASTQs were found for the requested parameters.")

    #
    # Downsampling setup
    #

    # The total available input raw gigabases of input data (est_gb), and the
    # base pairs per read pair (bp_per_read_pair) are estimated above.
    # bp_per_read_pair holds the value of the last sample_def entry processed.
    est_gb = float(total_seq_bases) / 1e9

    downsample = args.downsample if args.downsample is not None else {}

    # Possible BC subsampling -- try to get the requested amount of data
    # _after_ bc subsampling
    est_gb_post_bc = est_gb * downsample.get("bc_subsample_rate", 1.0)

    # Aim high to ensure that we won't be left with too few reads
    # if the rest of pipeline can trim this down for us.
    fudge_factor = args.downsample_overage

    # NOTE(review): downsample_succeeded is True when the request EXCEEDS the
    # estimated input, which reads as inverted -- kept as is, confirm intent.
    downsample_succeeded = True

    if "gigabases" in downsample:
        read_sample_rate = min(
            1.0, fudge_factor * downsample['gigabases'] / est_gb_post_bc)
        requested_read_pairs = int(1e9 * downsample['gigabases'] /
                                   bp_per_read_pair)
        downsample_succeeded = downsample['gigabases'] > est_gb_post_bc

    elif "target_reads" in downsample:
        requested_read_pairs = int(downsample['target_reads'] / 2)
        est_read_pair_post_bc = 1e9 * est_gb_post_bc / bp_per_read_pair
        read_sample_rate = min(
            1.0, fudge_factor * requested_read_pairs / est_read_pair_post_bc)
        downsample_succeeded = requested_read_pairs > est_read_pair_post_bc

    elif "subsample_rate" in downsample:
        read_sample_rate = min(
            1.0, downsample["subsample_rate"] /
            downsample.get("bc_subsample_rate", 1.0))
        requested_read_pairs = None
    else:
        read_sample_rate = 1.0
        requested_read_pairs = None

    martian.log_info("Downsampling request: %s" % str(downsample))
    martian.log_info("Base pairs per read pair: %s" % bp_per_read_pair)
    martian.log_info(
        "Estimated Input: %.2f GB, Initial Downsample Rate: %.3f. Requested total reads: %s"
        % (est_gb, read_sample_rate, str(requested_read_pairs)))

    # Copy over the per-chunk subsample rates (read_sample_rate is always a
    # number here: every branch above assigns one).
    for chunk in chunks:
        chunk['subsample_rate'] = chunk.get('subsample_rate',
                                            1.0) * read_sample_rate
        if "bc_subsample_rate" in downsample:
            chunk["bc_subsample_rate"] = downsample["bc_subsample_rate"]

    outs.requested_read_pairs = requested_read_pairs

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    if requested_read_pairs is not None:
        post_downsample_gb = float(
            requested_read_pairs * bp_per_read_pair) / 1e9
    else:
        post_downsample_gb = None

    downsample_info = {
        'available_gb': est_gb,
        'requested_gb': downsample.get('gigabases', None),
        'requested_rate': read_sample_rate,
        'post_downsample_gb': post_downsample_gb,
        'downsample_succeeded': downsample_succeeded,
    }

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)

    # Give out full path to BC whitelist
    if args.barcode_whitelist:
        outs.barcode_whitelist_path = BARCODE_LOCATION + "/" + args.barcode_whitelist + ".txt"
    else:
        outs.barcode_whitelist_path = None
def main(args, outs):
    """Run the supernova preflight checks over the stage arguments.

    Verifies that a barcode whitelist is given, that every sample_def entry
    declares a 16-base barcode on read 1 or 2, that each FASTQ folder is
    usable, that library ids and lanes are well formed and that the open
    file handle limit check passes. It then collects the input FASTQ files
    for the configured input_mode, validates them with check_fastqs and
    estimates the average read length: below 125 bases the run is stopped,
    below 149 bases an alarm is raised. Failures are reported through
    martian.exit.

    Args:
        args: stage arguments; reads barcode_whitelist, sample_def and
            input_mode.
        outs: unused.
    """
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read",
                                                  3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print("Checking FASTQ folder...")
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            # Raw string: same pattern, without relying on "\w" surviving as
            # an unrecognised escape sequence.
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess
        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_index in sample_index_strings:
                fastq_files.extend(
                    find_func(path, main_read_type, sample_index, lanes))

    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
        for (idx, read_chunk) in enumerate(args.sample_def):
            sample_names = read_chunk['sample_names']
            if sample_names is None:
                # check_key accepts a null value, but without names there is
                # nothing to look up and iterating None would raise.
                martian.exit(
                    "Entry %d in sample_def missing required field: sample_names"
                    % idx)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_name in sample_names:
                fastq_files.extend(find_func(path, "R1", sample_name, lanes))
                # NOTE(review): other stages look up "R2" for the second
                # read; confirm that "R3" is intended here.
                fastq_files.extend(find_func(path, "R3", sample_name, lanes))
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    ## make sure they are okay first
    check_fastqs(fastq_files)

    # Unweighted mean of the per-file average read lengths, each estimated
    # from a sample of reads.
    total_reads = 0.0
    read_length_sum = 0.0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        read_length_sum += avg_read_len_fn
    global_avg = read_length_sum / len(fastq_files)

    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    exit_msg = "We observe many reads shorter than 125 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly, and the algorithm has not been tested on short reads. Because reads are too short, execution will be terminated."
    warn_msg = "We observe many reads shorter than 150 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly."
    if global_avg < 125:
        martian.exit(exit_msg)
    elif global_avg < 149:
        martian.alarm(warn_msg)