# Imports assumed by the functions below; helper functions such as getreads,
# getreads_subset, getreads_regex, find_program, strip_ngs_extensions,
# AnalysisFastq, ICell8WellList, BufferedOutputFiles, ProgressChecker,
# reverse_complement, update_fastq_read_index and report, plus the
# __version__ and __description__ metadata, come from the surrounding package.
import argparse
import logging
import os
import random
import re
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict

def get_fastq_lanes(fastq):
    """
    Return number of reads and list of lanes present in a Fastq file

    Arguments:
      fastq (str): path to Fastq file (can be gzipped)

    Returns:
      Tuple: tuple (n,lanes) where ``n`` is the number of
        reads and ``lanes`` is a list of integer lane numbers.
    """
    # Lane is the fourth colon-delimited field of the read header
    regex = re.compile(r"^([^:]*:){3}(\d*):")
    nreads = 0
    lanes = set()
    for read in getreads(fastq):
        nreads += 1
        try:
            lane = regex.match(''.join(read)).group(2)
            lanes.add(int(lane))
        except AttributeError:
            raise Exception("Failed to find lane in read %s: "
                            "not a valid Fastq file?" % '\n'.join(read))
    return (nreads, sorted(list(lanes)))
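# Example usage (a minimal sketch; 'sample_R1.fastq.gz' is a hypothetical
# file name):
#
#     nreads, lanes = get_fastq_lanes("sample_R1.fastq.gz")
#     print("%d reads from lanes %s" % (nreads, lanes))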
def fastq_strand(argv, working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for FASTQ or FASTQ pair, by running STAR using "
        "one or more genome indexes")
    p.add_argument("--version", action="version", version=__version__)
    p.add_argument("r1", metavar="READ1", default=None,
                   help="R1 Fastq file")
    p.add_argument("r2", metavar="READ2", default=None, nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g", "--genome",
                   dest="star_genomedirs", metavar="GENOMEDIR",
                   default=None, action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset", type=int, default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o", "--outdir", default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c", "--conf", metavar="FILE", default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specified on the command "
                   "line will be ignored")
    p.add_argument("-n", type=int, default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts", action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output", action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print("READ1\t: %s" % args.r1)
    print("READ2\t: %s" % args.r2)
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print("STAR\t: %s" % star_exe)
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print("Conf file\t: %s" % args.conf)
        star_genomedirs = []
        with open(args.conf, 'r') as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                name, star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print("Genomes:")
    for genome in star_genomedirs:
        print("- %s" % genome)
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" % outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir,
        os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
    if not os.path.isdir(working_dir):
        raise Exception("Bad working directory: %s" % working_dir)
    print("Working directory: %s" % working_dir)
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print("%d reads" % nreads)
    if args.subset == 0:
        print("Using all read pairs in Fastq files")
        subset = nreads
    elif args.subset > nreads:
        print("Actual number of read pairs smaller than requested subset")
        subset = nreads
    else:
        subset = args.subset
        print("Using random subset of %d read pairs" % subset)
    if subset == nreads:
        subset_indices = list(range(nreads))
    else:
        subset_indices = random.sample(range(nreads), subset)
    fqs_in = [fq for fq in (args.r1, args.r2) if fq is not None]
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir, os.path.basename(fq))
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset, 'w') as fp:
            for read in getreads_subset(os.path.abspath(fq),
                                        subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(
            outdir,
            "STAR.%s.outputs" % os.path.basename(
                strip_ngs_extensions(args.r1)))
        print("Output from STAR will be copied to %s" % star_output_dir)
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir, i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir, backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file
    # NB open in text mode so strings can be written under Python 3
    with tempfile.TemporaryFile(mode='w+') as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode', 'alignReads',
                '--genomeLoad', 'NoSharedMemory',
                '--genomeDir', os.path.abspath(star_genomedir)])
            star_cmd.extend(['--readFilesIn', fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode', 'GeneCounts',
                '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMstrandField', 'intronMotif',
                '--outFileNamePrefix', prefix,
                '--runThreadN', str(args.n)])
            print("Running %s" % ' '.join(star_cmd))
            try:
                subprocess.check_output(star_cmd, cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep, "_"))
                print("Copying STAR outputs to %s" % genome_dir)
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir, f),
                                    os.path.join(genome_dir, f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" %
                                star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i, line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print("Sums:")
            print("- col2: %d" % sum_col2)
            print("- col3: %d" % sum_col3)
            print("- col4: %d" % sum_col4)
            if sum_col2 > 0:
                forward_1st = float(sum_col3)/float(sum_col2)*100.0
                reverse_2nd = float(sum_col4)/float(sum_col2)*100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print("Strand percentages:")
            print("- 1st forward: %.2f%%" % forward_1st)
            print("- 2nd reverse: %.2f%%" % reverse_2nd)
            # Append to output file
            data = [name,
                    "%.2f" % forward_1st,
                    "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2, sum_col3, sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile, 'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__,
                                                  "STAR",
                                                  subset))
            columns = ["Genome", "1st forward", "2nd reverse"]
            if args.counts:
                columns.extend(["Unstranded",
                                "1st read strand aligned",
                                "2nd read strand aligned"])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
def main(args=None):
    # Command line processing
    if args is None:
        args = sys.argv[1:]
    p = argparse.ArgumentParser(description=__description__)
    p.add_argument('--version', action='version',
                   version="%(prog)s "+__version__)
    p.add_argument('-m', '--match', action='store', dest='pattern',
                   default=None,
                   help="extract records that match Python regular "
                   "expression PATTERN")
    p.add_argument('-n', action='store', dest='n', default=None,
                   help="extract N random reads from the input file(s). "
                   "If multiple files are supplied (e.g. R1/R2 pair) then "
                   "the same subsets will be extracted for each. "
                   "(Optionally a percentage can be supplied instead e.g. "
                   "'50%%' to extract a subset of half the reads.)")
    p.add_argument('-s', '--seed', action='store', dest='seed', default=None,
                   help="specify seed for random number generator (used "
                   "for -n option; using the same seed should produce the "
                   "same 'random' sample of reads)")
    p.add_argument('infiles', metavar='infile', nargs='+',
                   help="input FASTQ, CSFASTA, or QUAL file")
    args = p.parse_args(args)
    # Pattern matching option
    if args.pattern is not None:
        if args.n is not None:
            p.error("Need to supply only one of -n or -m options")
        print("Extracting reads matching '%s'" % args.pattern)
        for f in args.infiles:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_regex.fq'
            print("Extracting to %s" % outfile)
            with open(outfile, 'w') as fp:
                for read in getreads_regex(f, args.pattern):
                    fp.write('\n'.join(read) + '\n')
    else:
        if args.n is None:
            p.error("Need to supply one of -n or -m options")
        # Seed random number generator
        if args.seed is not None:
            random.seed(args.seed)
        # Count the reads
        nreads = sum(1 for i in getreads(args.infiles[0]))
        print("Number of reads: %s" % nreads)
        if len(args.infiles) > 1:
            print("Verifying read numbers match between files")
            for f in args.infiles[1:]:
                if sum(1 for i in getreads(f)) != nreads:
                    print("Inconsistent numbers of reads between files")
                    sys.exit(1)
        # Generate a subset of read indices to extract
        try:
            nsubset = int(args.n)
        except ValueError:
            if str(args.n).endswith('%'):
                nsubset = int(float(args.n[:-1])*nreads/100.0)
            else:
                p.error("Bad value for -n: '%s'" % args.n)
        if nsubset > nreads:
            print("Requested subset (%s) is larger than file (%s)" %
                  (nsubset, nreads))
            sys.exit(1)
        print("Generating set of %s random indices" % nsubset)
        subset_indices = random.sample(range(nreads), nsubset)
        # Extract the reads to separate files
        for f in args.infiles:
            if f.endswith('.gz'):
                outfile = os.path.basename(os.path.splitext(f[:-3])[0])
            else:
                outfile = os.path.basename(os.path.splitext(f)[0])
            outfile += '.subset_%s.fq' % nsubset
            print("Extracting to %s" % outfile)
            with open(outfile, 'w') as fp:
                for read in getreads_subset(f, subset_indices):
                    fp.write('\n'.join(read) + '\n')
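# Example usage (a minimal sketch; the Fastq file names are hypothetical):
#
#     # Same random 50% subset from an R1/R2 pair, reproducible via the seed
#     main(["-n", "50%", "-s", "1234",
#           "sample_R1.fastq.gz", "sample_R2.fastq.gz"])
#
#     # Extract only the records matching a regular expression
#     main(["-m", ":1101:", "sample_R1.fastq.gz"])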
def assign_reads(args):
    """
    Assign reads to samples from batched ICELL8 ATAC Fastqs

    Intended to be invoked via 'map' or similar function

    Arguments are supplied in a single list which should
    contain the following items:

    - R1 Fastq: path to R1 Fastq file
    - R2 Fastq: path to R2 Fastq file
    - I1 Fastq: path to I1 Fastq file
    - I2 Fastq: path to I2 Fastq file
    - well list: path to the well list file
    - mode: either 'samples' or 'barcodes'
    - swap_i1_and_i2: boolean indicating whether I1 and I2
      Fastqs should be swapped for matching
    - reverse_complement: either None, 'i1', 'i2' or 'both'
    - rewrite_fastq_headers: boolean indicating whether to
      write the matching ICELL8 barcodes into the Fastq
      read headers on output
    - working_dir: working directory to write batches to
    - unassigned: 'sample name' to associate with unassigned
      reads (used as a basename for output files)

    In 'samples' mode assignment is done to samples only;
    in 'barcodes' mode assignment is done to samples and
    barcodes.

    Arguments:
      args (list): list containing the arguments supplied to
        the read assigner

    Returns:
      Tuple: tuple consisting of (batch id,barcode_counts,
        unassigned_barcodes_file).
    """
    # Unpack arguments
    (fastq_r1,
     fastq_r2,
     fastq_i1,
     fastq_i2,
     well_list_file,
     mode,
     swap_i1_and_i2,
     reverse_complement_index,
     rewrite_fastq_headers,
     working_dir,
     unassigned) = args
    # Batch ID is the trailing part of the name
    batch_id = AnalysisFastq(fastq_i1).extras.strip('_')
    # Label is sample name plus batch name
    label = "%s/%s" % (AnalysisFastq(fastq_i1).sample_name, batch_id)
    report("[%s] Assigning reads from R1/R2 Fastq pairs based on I1/I2 "
           "Fastqs:" % label)
    report("[%s] -- R1: %s" % (label, os.path.basename(fastq_r1)))
    report("[%s] -- R2: %s" % (label, os.path.basename(fastq_r2)))
    report("[%s] -- I1: %s" % (label, os.path.basename(fastq_i1)))
    report("[%s] -- I2: %s" % (label, os.path.basename(fastq_i2)))
    report("[%s] -- Well list: %s" % (label,
                                      os.path.basename(well_list_file)))
    report("[%s] Mode is '%s'" % (label, mode))
    if swap_i1_and_i2:
        report("[%s] Swapping I1 and I2 Fastqs for matching to well list" %
               label)
    if rewrite_fastq_headers:
        report("[%s] Rewriting Fastq read headers to include well list "
               "barcodes" % label)
    # Check mode
    if mode not in ("samples", "barcodes"):
        report("[%s] Unrecognised mode!" % label, fp=sys.stderr)
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    os.mkdir(os.path.join(working_dir, batch_id))
    # Read well list file to get barcodes and lookups
    well_list = ICell8WellList(well_list_file)
    sample_lookup = defaultdict(lambda: unassigned)
    barcode_lookup = defaultdict(lambda: unassigned)
    for sample in well_list.samples():
        barcode_lookup[sample] = list()
    barcodes = well_list.barcodes()
    for barcode in barcodes:
        sample = well_list.sample(barcode)
        sample_lookup[barcode] = sample
        barcode_lookup[sample].append(barcode)
    # Generate adjusted versions of barcodes for matching
    # against barcodes derived from Fastqs
    fastq_barcode_lookup = defaultdict(lambda: None)
    for barcode in barcodes:
        i1, i2 = barcode.split('+')
        if reverse_complement_index:
            if reverse_complement_index in ('i1', 'both'):
                # Reverse complement the I1 part of each barcode
                i1 = reverse_complement(i1)
            if reverse_complement_index in ('i2', 'both'):
                # Reverse complement the I2 part of each barcode
                i2 = reverse_complement(i2)
        if swap_i1_and_i2:
            i2, i1 = i1, i2
        fastq_barcode_lookup["%s+%s" % (i1, i2)] = barcode
    # File to write unassigned barcodes to
    unassigned_barcodes_file = os.path.join(working_dir,
                                            batch_id,
                                            "unassigned_barcodes.txt")
    # Set up output files for samples
    samples = well_list.samples()
    samples.insert(0, unassigned)
    fpp = BufferedOutputFiles()
    for read in ('R1', 'R2', 'I1', 'I2'):
        for index, sample in enumerate(samples):
            if mode == 'samples':
                # Output files will only have sample names
                name = "%s_%s" % (sample, read)
                filen = "%s_S%d_%s_001.fastq" % (sample, index, read)
                fpp.open(name,
                         os.path.join(working_dir, batch_id, filen))
            elif mode == 'barcodes':
                # Output files will have sample name plus barcode
                if sample != unassigned:
                    # Standard samples
                    for barcode in barcode_lookup[sample]:
                        name = "%s_%s_%s" % (sample, barcode, read)
                        filen = "%s_S%d_%s_%s_001.fastq" % \
                                (sample, index, barcode, read)
                        fpp.open(name,
                                 os.path.join(working_dir, batch_id, filen))
                else:
                    # Unassigned reads
                    name = "%s_%s" % (sample, read)
                    filen = "%s_S%d_%s_001.fastq" % (sample, index, read)
                    fpp.open(name,
                             os.path.join(working_dir, batch_id, filen))
    barcode_counts = {unassigned: 0, }
    for barcode in well_list.barcodes():
        barcode_counts[barcode] = 0
    # Examine indices and assign reads
    ii = 0
    progress = ProgressChecker(every=1000000)
    if mode == 'samples':
        # Assigning reads to samples
        with open(unassigned_barcodes_file, "w") as fp:
            for r1, r2, i1, i2 in zip(getreads(fastq_r1),
                                      getreads(fastq_r2),
                                      getreads(fastq_i1),
                                      getreads(fastq_i2)):
                # Get barcodes to match against adjusted
                # versions from well list
                fastq_barcode = "%s+%s" % (i1[1], i2[1])
                # Get "real" barcode
                barcode = fastq_barcode_lookup[fastq_barcode]
                # Add to counts
                try:
                    barcode_counts[barcode] += 1
                except KeyError:
                    barcode_counts[unassigned] += 1
                # Determine sample
                sample = sample_lookup[barcode]
                # Rewrite read headers to include well list barcode
                if rewrite_fastq_headers and barcode:
                    r1 = update_fastq_read_index(r1, barcode)
                    r2 = update_fastq_read_index(r2, barcode)
                    i1 = update_fastq_read_index(i1, barcode)
                    i2 = update_fastq_read_index(i2, barcode)
                # Write the reads to the appropriate destinations
                fpp.write("%s_R1" % sample, '\n'.join(r1))
                fpp.write("%s_R2" % sample, '\n'.join(r2))
                fpp.write("%s_I1" % sample, '\n'.join(i1))
                fpp.write("%s_I2" % sample, '\n'.join(i2))
                # Write Fastq version of unassigned barcode to file
                if sample == unassigned:
                    fp.write("%s\n" % fastq_barcode)
                # Report progress
                ii += 1
                if progress.check(ii):
                    report("[%s]...%d reads examined" % (label, ii))
    elif mode == 'barcodes':
        # Assigning reads to barcodes
        with open(unassigned_barcodes_file, "w") as fp:
            for r1, r2, i1, i2 in zip(getreads(fastq_r1),
                                      getreads(fastq_r2),
                                      getreads(fastq_i1),
                                      getreads(fastq_i2)):
                # Get barcodes to match against adjusted
                # versions from well list
                fastq_barcode = "%s+%s" % (i1[1], i2[1])
                # Get "real" barcode
                barcode = fastq_barcode_lookup[fastq_barcode]
                # Add to counts
                try:
                    barcode_counts[barcode] += 1
                except KeyError:
                    barcode_counts[unassigned] += 1
                # Determine sample
                sample = sample_lookup[barcode]
                # Rewrite read headers to include well list barcode
                if rewrite_fastq_headers and barcode:
                    r1 = update_fastq_read_index(r1, barcode)
                    r2 = update_fastq_read_index(r2, barcode)
                    i1 = update_fastq_read_index(i1, barcode)
                    i2 = update_fastq_read_index(i2, barcode)
                # Write the reads to the appropriate destinations
                if sample != unassigned:
                    # Assign to sample and barcode
                    fpp.write("%s_%s_R1" % (sample, barcode), '\n'.join(r1))
                    fpp.write("%s_%s_R2" % (sample, barcode), '\n'.join(r2))
                    fpp.write("%s_%s_I1" % (sample, barcode), '\n'.join(i1))
                    fpp.write("%s_%s_I2" % (sample, barcode), '\n'.join(i2))
                else:
                    # Unassigned reads
                    fpp.write("%s_R1" % sample, '\n'.join(r1))
                    fpp.write("%s_R2" % sample, '\n'.join(r2))
                    fpp.write("%s_I1" % sample, '\n'.join(i1))
                    fpp.write("%s_I2" % sample, '\n'.join(i2))
                    # Write Fastq version of unassigned barcode to file
                    fp.write("%s\n" % fastq_barcode)
                # Report progress
                ii += 1
                if progress.check(ii):
                    report("[%s]...%d reads examined" % (label, ii))
    report("[%s] Finished processing batch %s" % (label, batch_id))
    # Close files
    fpp.close()
    # Remove original files
    for fq in (fastq_r1, fastq_r2, fastq_i1, fastq_i2):
        report("[%s] Removing %s" % (label, fq))
        os.remove(fq)
    # Return tuple with batch ID, barcode counts and
    # file with list of unassigned barcodes
    return (batch_id, barcode_counts, unassigned_barcodes_file)
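# Example of driving assign_reads() over several batches via a process pool,
# as the docstring suggests (a minimal sketch: the batch file names, well
# list path and working directory are hypothetical; 'samples' mode with no
# index swapping, reverse complementing or header rewriting is assumed):
#
#     from multiprocessing import Pool
#     batches = [("Sample_B000.r1.fastq", "Sample_B000.r2.fastq",
#                 "Sample_B000.i1.fastq", "Sample_B000.i2.fastq",
#                 "wells.txt", "samples", False, None, False,
#                 "/tmp/work", "Unassigned"),
#                ("Sample_B001.r1.fastq", "Sample_B001.r2.fastq",
#                 "Sample_B001.i1.fastq", "Sample_B001.i2.fastq",
#                 "wells.txt", "samples", False, None, False,
#                 "/tmp/work", "Unassigned")]
#     with Pool(2) as p:
#         results = p.map(assign_reads, batches)
#     for batch_id, counts, unassigned_file in results:
#         print("%s: %d reads" % (batch_id, sum(counts.values())))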