def main(assay: Assay, orig_fastq_dirs: Sequence[Path], adj_fastq_dir: Path, threads: int):
    """Trim every FASTQ pair in parallel, one numbered output directory per pair.

    The pairs are read from ``adj_fastq_dir`` when the assay reports that
    barcode adjustment was performed, and from ``orig_fastq_dirs`` otherwise.
    Each pair is submitted to ``trim_reads`` in a worker process together with
    its own output directory ``OUTPUT_PATH / <n>``, where ``n`` counts pairs
    from 1 in the order they are found.

    Args:
        assay: Assay description; ``barcode_adj_performed`` and
            ``barcode_adj_r1_r2`` select where the FASTQ pairs come from.
        orig_fastq_dirs: Input directories, searched only when no barcode
            adjustment was performed.
        adj_fastq_dir: Directory holding barcode-adjusted FASTQ files.
        threads: Maximum number of worker processes.

    Raises:
        Exception: Whatever a ``trim_reads`` call raised in its worker
            process; the first failure (in submission order) is re-raised.
    """
    fastq_pairs: Iterable[Sequence[Path]]
    if assay.barcode_adj_performed:
        if assay.barcode_adj_r1_r2:
            fastq_pairs = find_grouped_fastq_files(adj_fastq_dir, 2)
        else:
            fastq_pairs = [find_adj_fastq_files(adj_fastq_dir)]
    else:
        fastq_pairs = chain.from_iterable(
            find_grouped_fastq_files(fastq_dir, 2) for fastq_dir in orig_fastq_dirs
        )

    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = []
        for i, (r1_fastq_file, r2_fastq_file) in enumerate(fastq_pairs, 1):
            subdir = OUTPUT_PATH / str(i)
            subdir.mkdir(exist_ok=True, parents=True)
            future = executor.submit(
                trim_reads,
                r1_fastq_file,
                r2_fastq_file,
                subdir,
            )
            futures.append(future)

        wait(futures)
        # wait() only blocks until the futures finish; it does not surface
        # exceptions. Ask each future for its result so that a failed trim
        # aborts this step instead of being silently ignored.
        for future in futures:
            future.result()
def main(
    assay: Assay,
    orig_fastq_dirs: Sequence[Path],
    trimmed_fastq_dir: Path,
    expected_cell_count: Optional[int],
    threads: Optional[int],
):
    """Assemble the Salmon command line for all trimmed FASTQ pairs and run it.

    The command starts from ``SALMON_COMMAND`` (formatted with the assay's
    ``salmon_option`` and the thread count) and is extended with
    assay-specific options and one ``-1``/``-2`` argument pair per FASTQ pair
    found in ``trimmed_fastq_dir``.

    Args:
        assay: Assay description controlling Salmon options and how FASTQ
            files are located.
        orig_fastq_dirs: Original input directories; used to look up the
            Slide-seq barcode file and the expected cell count.
        trimmed_fastq_dir: Directory containing the trimmed FASTQ files.
        expected_cell_count: Fallback passed to ``--forceCells`` when
            ``read_expected_cell_count`` yields nothing truthy.
        threads: Thread count for Salmon; a falsy value means 1.

    Raises:
        ValueError: If no FASTQ files are found, or if the assay is Slide-seq
            and ``orig_fastq_dirs`` does not contain exactly one directory.
        subprocess.CalledProcessError: If Salmon exits with a non-zero status.
    """
    threads = threads or 1

    command = [
        piece.format(
            salmon_option=assay.salmon_option,
            threads=threads,
        )
        for piece in SALMON_COMMAND
    ]

    fastq_pairs: Iterable[Sequence[Path]]
    if assay.barcode_adj_performed:
        if assay.barcode_adj_r1_r2:
            fastq_pairs = list(find_grouped_fastq_files(trimmed_fastq_dir, 2))
        else:
            # NOTE(review): this relies on find_adj_fastq_files yielding
            # (R1, R2) pairs rather than returning a single pair -- confirm.
            fastq_pairs = list(find_adj_fastq_files(trimmed_fastq_dir))
    else:
        fastq_pairs = list(find_grouped_fastq_files(trimmed_fastq_dir, 2))

    if not fastq_pairs:
        raise ValueError("No FASTQ files found")

    if assay.keep_all_barcodes:
        command.extend(["--keepCBFraction", "1"])

    # hack
    if assay == Assay.SLIDESEQ:
        # Don't support multiple input directories for Slide-seq; this will
        # likely cause significantly incorrect results due to barcode overlap
        # between multiple input data sets
        if len(orig_fastq_dirs) != 1:
            raise ValueError("Need exactly 1 input directory for Slide-seq")
        barcode_file = find_slideseq_barcode_file(orig_fastq_dirs[0])
        command.extend(["--whitelist", fspath(barcode_file)])

    # A truthy count read from the input directories takes precedence over
    # the explicitly supplied one.
    maybe_cell_count = read_expected_cell_count(orig_fastq_dirs) or expected_cell_count
    if maybe_cell_count is not None:
        command.extend(["--forceCells", str(maybe_cell_count)])

    for r1_fastq_file, r2_fastq_file in fastq_pairs:
        fastq_extension = [
            "-1",
            r1_fastq_file,
            "-2",
            r2_fastq_file,
        ]
        command.extend(fastq_extension)

    print("Running:", " ".join(str(x) for x in command))
    env = environ.copy()
    # Necessary for Singularity; this environment variable isn't
    # set by that container runtime but is required to run Salmon
    env["LD_LIBRARY_PATH"] = "/usr/local/lib"
    # The adjusted environment only takes effect if it is handed to the
    # child process explicitly.
    check_call(command, env=env)
def main(threads: int, directory: Path):
    """Run Salmon on each R1/R2 pair in ``directory`` and tag its outputs.

    For every pair, the Salmon command is built from ``SALMON_COMMAND``
    (formatted with ``threads``) plus the two FASTQ paths and executed,
    ``TAR_AND_ZIP_COMMAND`` is run, and three files under ``out/`` are renamed
    to carry the sample id derived from the R1 file.

    Args:
        threads: Thread count substituted into the Salmon command.
        directory: Directory searched for grouped FASTQ pairs.

    Raises:
        subprocess.CalledProcessError: If either external command fails.
    """
    for read1, read2 in find_grouped_fastq_files(directory, 2):
        salmon_call = [piece.format(threads=threads) for piece in SALMON_COMMAND]
        salmon_call += ['-1', fspath(read1), '-2', fspath(read2)]

        print('Running:', salmon_call)
        check_call(salmon_call)

        # Tar and zip the auxiliary files.
        check_call(TAR_AND_ZIP_COMMAND)

        # Tag the output files with the sample id.
        sample_id = get_sample_id_from_r1(read1)
        renames = (
            ("out/quant.sf", f"out/{sample_id}-quant.sf"),
            ("out/cmd_info.json", f"out/{sample_id}-cmd_info.json"),
            ("out/aux_files.tar.gz", f"out/{sample_id}-aux_files.tar.gz"),
        )
        for untagged, tagged in renames:
            rename_file(untagged, tagged)
def main(directories: Iterable[Path], assay: Assay):
    """Decompress each group of FASTQ files and append it to the merged files.

    ``CONCAT_OUTPUT_DIR`` is created first. Every group found in
    ``directories`` (grouped by ``assay.fastq_count``) must contain two or
    three files; each file is passed to ``decompress_concat_fastq`` together
    with the merged output it belongs to.

    Args:
        directories: Directories searched for grouped FASTQ files.
        assay: Assay description; ``fastq_count`` sets the group size.

    Raises:
        ValueError: If a group holds neither two nor three files.
    """
    CONCAT_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

    for directory in directories:
        for fastq_group in find_grouped_fastq_files(directory, assay.fastq_count):
            num_fastqs = len(fastq_group)

            if num_fastqs == 2:
                destinations = (MERGED_FASTQ_R1, MERGED_FASTQ_R2)
            elif num_fastqs == 3:
                # For assays that use three files, flipping R2 and R3 here is
                # deliberate. In the input data the barcodes are in R2, and R1
                # and R3 are the two reads which should be aligned to the
                # genome. Inside this pipeline it is more straightforward to
                # rename these: R1 -> R1, R2 -> barcode, R3 -> R2
                destinations = (
                    MERGED_FASTQ_R1,
                    MERGED_FASTQ_BARCODE,
                    MERGED_FASTQ_R2,
                )
            else:
                raise ValueError(
                    f"Could not unzip and concatenate fastqs becuase there are {num_fastqs} of them"
                )

            for fastq, destination in zip(fastq_group, destinations):
                decompress_concat_fastq(fastq, destination)
def main(assay: Assay, fastq_dirs: Iterable[Path], output_filename_prefix: str, output_dir: Path):
    """Convert every FASTQ pair found in ``fastq_dirs`` via ``convert_fastq``.

    All R1 files are converted to ``<prefix>_R1.fastq`` and all R2 files to
    ``<prefix>_R2.fastq`` inside ``output_dir``.

    NOTE(review): every pair targets the same two output paths; whether
    ``convert_fastq`` appends to or overwrites an existing file is not visible
    here -- confirm before relying on multi-pair input.

    Args:
        assay: Assay description; ``fastq_count`` sets the group size.
        fastq_dirs: Directories searched for grouped FASTQ files.
        output_filename_prefix: Prefix of the two output file names.
        output_dir: Directory receiving the converted files.
    """
    r1_target = output_dir / f"{output_filename_prefix}_R1.fastq"
    r2_target = output_dir / f"{output_filename_prefix}_R2.fastq"

    for fastq_dir in fastq_dirs:
        for fastq_r1, fastq_r2 in find_grouped_fastq_files(fastq_dir, assay.fastq_count):
            convert_fastq(fastq_r1, r1_target)
            convert_fastq(fastq_r2, r2_target)
def main(assay: Assay, orig_fastq_dir: Iterable[Path], adj_fastq_dir: Path, threads: int):
    """Assemble the Salmon command line for all FASTQ pairs and run it.

    The command starts from ``SALMON_COMMAND`` (formatted with the assay's
    ``salmon_option`` and ``threads``). FASTQ pairs are taken from
    ``adj_fastq_dir`` when the assay reports that barcode adjustment was
    performed and from the original directories otherwise; each pair adds a
    ``-1``/``-2`` argument pair.

    Args:
        assay: Assay description controlling Salmon options and how FASTQ
            files are located.
        orig_fastq_dir: Original input directories, searched only when no
            barcode adjustment was performed.
        adj_fastq_dir: Directory holding barcode-adjusted FASTQ files.
        threads: Thread count substituted into the Salmon command.

    Raises:
        subprocess.CalledProcessError: If Salmon exits with a non-zero status.
    """
    command = [
        piece.format(
            salmon_option=assay.salmon_option,
            threads=threads,
        )
        for piece in SALMON_COMMAND
    ]

    fastq_pairs: List[Sequence[Path]]
    if assay.barcode_adj_performed:
        if assay.barcode_adj_r1_r2:
            fastq_pairs = list(find_grouped_fastq_files(adj_fastq_dir, 2))
        else:
            fastq_pairs = [find_adj_fastq_files(adj_fastq_dir)]
    else:
        fastq_pairs = list(
            chain.from_iterable(
                find_grouped_fastq_files(fastq_dir, 2) for fastq_dir in orig_fastq_dir
            )
        )

    if assay.keep_all_barcodes:
        command.extend(['--keepCBFraction', '1'])

    for r1_fastq_file, r2_fastq_file in fastq_pairs:
        fastq_extension = [
            '-1',
            r1_fastq_file,
            '-2',
            r2_fastq_file,
        ]
        command.extend(fastq_extension)

    print('Running:', ' '.join(str(x) for x in command))
    env = environ.copy()
    # Necessary for Singularity; this environment variable isn't
    # set by that container runtime but is required to run Salmon
    env['LD_LIBRARY_PATH'] = '/usr/local/lib'
    # The adjusted environment only takes effect if it is handed to the
    # child process explicitly.
    check_call(command, env=env)
def main(
    fastq_dirs: Iterable[Path],
    output_dir: Path = Path(),
    barcode_filename: Path = BARCODE_ALLOWLIST_FILE,
    n6_dt_mapping_file: Path = N6_DT_MAPPING_FILE,
):
    """Correct cell barcodes and write the usable reads to two FASTQ files.

    For each (transcript, barcode+UMI) FASTQ pair found in ``fastq_dirs``, the
    barcode segments of every read are reverse-complemented and passed through
    a ``bu.BarcodeCorrecter`` built from the allowlist with
    ``edit_distance=1``. A read is kept only when every segment yields a
    truthy correction. For kept reads the third corrected segment is replaced
    via the N6/dT mapping, the segments are reverse-complemented back and
    joined with the UMI, and the barcode quality is replaced by
    ``BARCODE_QUAL_DUMMY``. Per-pair read counts are printed afterwards.

    Args:
        fastq_dirs: Directories searched for FASTQ pairs.
        output_dir: Directory in which ``BARCODE_UMI_FASTQ_PATH`` and
            ``TRANSCRIPT_FASTQ_PATH`` are written.
        barcode_filename: File read by ``read_barcode_allowlist``.
        n6_dt_mapping_file: File read by ``read_n6_dt_mapping``.
    """
    barcode_allowlist = read_barcode_allowlist(barcode_filename)
    correcter = bu.BarcodeCorrecter(barcode_allowlist, edit_distance=1)

    n6_dt_mapping = read_n6_dt_mapping(n6_dt_mapping_file)

    buf = output_dir / BARCODE_UMI_FASTQ_PATH
    trf = output_dir / TRANSCRIPT_FASTQ_PATH

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, 2) for fastq_dir in fastq_dirs
    )

    with open(buf, "w") as cbo, open(trf, "w") as tro:
        for transcript_fastq, barcode_umi_fastq in all_fastqs:
            usable_count = 0
            # Stays 0 when the pair contains no reads at all.
            i = 0
            print("Correcting barcodes in", transcript_fastq, "and", barcode_umi_fastq)
            transcript_reader = fastq_reader(transcript_fastq)
            barcode_umi_reader = fastq_reader(barcode_umi_fastq)
            for i, (tr, br) in enumerate(zip(transcript_reader, barcode_umi_reader), 1):
                barcode_pieces = [revcomp(br.seq[s]) for s in BARCODE_SEGMENTS]
                rc_corrected = [correcter.correct(barcode) for barcode in barcode_pieces]
                if all(rc_corrected):
                    rc_corrected[2] = n6_dt_mapping[rc_corrected[2]]
                    corrected = [revcomp(rc_bc) for rc_bc in rc_corrected]
                    usable_count += 1
                    umi_seq = br.seq[UMI_SEGMENT]
                    umi_qual = br.qual[UMI_SEGMENT]
                    new_seq = "".join(corrected + [umi_seq])
                    new_qual = BARCODE_QUAL_DUMMY + umi_qual
                    new_br = Read(
                        read_id=br.read_id,
                        seq=new_seq,
                        unused=br.unused,
                        qual=new_qual,
                    )
                    print(tr.serialize(), file=tro)
                    print(new_br.serialize(), file=cbo)

            # An empty FASTQ pair has no meaningful proportion; report NaN
            # instead of failing with ZeroDivisionError.
            proportion = usable_count / i if i else float("nan")
            print("Total count:", i)
            print("Usable count:", usable_count)
            print("Proportion:", proportion)
def main(assay: Assay, fastq_dirs: Iterable[Path], output_filename_prefix, output_dir: Path):
    """Attach barcode (and UMI) sequences to reads and write R1/R2 FASTQ files.

    Every group of (fastq1, fastq2, barcode) files found in ``fastq_dirs`` is
    read in lockstep. The barcode pieces and UMI are extracted according to
    the assay, and each read triple is handed to
    ``create_and_output_barcode_adjusted_reads`` together with the two open
    output files ``<prefix>_R1.fastq`` and ``<prefix>_R2.fastq``.

    Args:
        assay: Assay being processed; ``fastq_count`` sets the group size and
            the assay selects how barcodes are extracted.
        fastq_dirs: Directories searched for grouped FASTQ files.
        output_filename_prefix: Prefix of the two output file names.
        output_dir: Directory receiving the output files.

    Raises:
        ValueError: If a read is encountered for an assay other than
            SNARE-seq or snRNA-seq (``Assay.SNARESEQ`` / ``Assay.SNSEQ``).
    """
    baraddedf1 = output_dir / f"{output_filename_prefix}_R1.fastq"
    baraddedf2 = output_dir / f"{output_filename_prefix}_R2.fastq"

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, assay.fastq_count) for fastq_dir in fastq_dirs
    )

    with open(baraddedf1, "w") as barf1addedout, open(baraddedf2, "w") as barf2addedout:
        for fastq1_file, fastq2_file, barcode_file in all_fastqs:
            print("Adding barcodes to", fastq1_file, "and", fastq2_file, "using", barcode_file)
            fastq1_reader = fastq_reader(fastq1_file)
            fastq2_reader = fastq_reader(fastq2_file)
            barcode_reader = fastq_reader(barcode_file)
            for f1r, f2r, bar in zip(fastq1_reader, fastq2_reader, barcode_reader):
                if assay == Assay.SNARESEQ:
                    # This code assumes the barcode fastq is the third fastq; e.g. '...R3...'
                    barcode_pieces = [revcomp(bar.seq[s]) for s in SNARESEQ_BARCODE_SEGMENTS]
                    # For some reason the barcode pieces are reversed in the list
                    # so get them back in the right order
                    barcode_pieces.reverse()
                    umi_seq = bar.seq[UMI_SEGMENT]
                elif assay == Assay.SNSEQ:
                    barcode_pieces = [f2r.seq[SNSEQ_BARCODE_SEGMENT]]
                    umi_seq = ""
                else:
                    # Previously this only printed the message and then fell
                    # through to use the unbound barcode variables below,
                    # which failed with a NameError; fail explicitly instead.
                    raise ValueError("Could not adjust barcodes for assay {}".format(assay))

                create_and_output_barcode_adjusted_reads(
                    f1r, f2r, umi_seq, barcode_pieces, barf1addedout, barf2addedout
                )
def main(directories: Iterable[Path]):
    """Describe every R1/R2 FASTQ pair as a file bundle and save the list.

    One bundle (a dict with ``fastq_r1`` and ``fastq_r2`` file entries) is
    built per pair found in ``directories``. The full list is pretty-printed
    to stdout and written as JSON to ``input.json`` in the current working
    directory.

    Args:
        directories: Directories searched for grouped FASTQ pairs.
    """

    def _file_entry(fastq_file):
        # Entry describing a single file by its filesystem path.
        return {"class": "File", "path": fspath(fastq_file)}

    sequence_file_bundles = [
        {"fastq_r1": _file_entry(read1), "fastq_r2": _file_entry(read2)}
        for directory in directories
        for read1, read2 in find_grouped_fastq_files(directory, 2)
    ]

    print("Sequence file bundles:")
    pprint(sequence_file_bundles)

    with open("input.json", "w") as bundle_file:
        json.dump(sequence_file_bundles, bundle_file)