def main(run: str, debug: bool = False):
    """
    For a completed RUN, check for a barcode swap by counting the overlap
    of exact sequences.
    """
    create_logger(debug=debug)
    config = get_config()

    run_dir = config.workflow_dir / run
    manifest_file = run_dir / "manifest.yaml"

    if not run_dir.exists():
        log.error(f"Could not find {run_dir}, has this run completed?")
        return
    elif not manifest_file.exists():
        log.error(f"Could not find {manifest_file}, has this run completed?")
        return

    manifest = Manifest.from_file(manifest_file)

    seq_barcodes = dict()
    bead_barcodes = dict()

    log.info("Reading sequenced barcodes and bead barcodes")
    for library in manifest.libraries:
        if library.merged.selected_cells.exists():
            with gzip.open(library.merged.selected_cells, "rt") as fh:
                seq_barcodes[library.name] = {line.strip() for line in fh}
        else:
            log.debug(f"No file {library.merged.selected_cells}, skipping")

        if library.bead_barcodes.exists():
            with library.bead_barcodes.open("r") as fh:
                bead_barcodes[library.name] = {
                    line.strip().replace(",", "") for line in fh
                }
        else:
            log.debug(f"No file {library.bead_barcodes}, skipping")

    log.debug(f"Read data for {len(seq_barcodes)} libraries")

    # pad columns to the longest library name
    max_len = max(map(len, bead_barcodes))

    # print a table of overlaps: sequenced barcodes (rows) vs bead barcodes (columns)
    print(
        f"{'library':{max_len}s}",
        *(f"{n:{max_len}s}" for n in sorted(bead_barcodes)),
        sep="\t",
    )
    for lib_a in sorted(seq_barcodes):
        print(f"{lib_a:{max_len}s}", end="\t")
        for lib_b in sorted(bead_barcodes):
            print(
                f"{len(seq_barcodes[lib_a] & bead_barcodes[lib_b]):{max_len}d}",
                end="\t",
            )
        print()
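

# A minimal sketch (not part of the pipeline) of how to read the table printed
# above: each row holds one library's sequenced barcodes intersected with every
# library's bead barcodes. When libraries and pucks are matched correctly the
# diagonal entry should dominate its row; a barcode swap shows up as an
# off-diagonal count larger than the diagonal one. The toy sets here are
# hypothetical.
def _overlap_table_example():
    seq = {"lib_a": {"AAAA", "CCCC", "GGGG"}, "lib_b": {"TTTT", "ACGT"}}
    bead = {"lib_a": {"TTTT", "ACGT", "AAAA"}, "lib_b": {"AAAA", "CCCC", "GGGG"}}

    for lib_x in sorted(seq):
        row = [len(seq[lib_x] & bead[lib_y]) for lib_y in sorted(bead)]
        print(lib_x, *row, sep="\t")
    # lib_a overlaps lib_b's bead barcodes more than its own -> likely swap
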
def main(
    flowcell: str,
    lane: int,
    library_index: int,
    manifest_file: str,
    debug: bool = False,
    log_file: str = None,
):
    create_logger(debug=debug, log_file=log_file)
    config = get_config()

    log.debug(f"Reading manifest from {manifest_file}")
    manifest = Manifest.from_file(Path(manifest_file))

    # task array is 1-indexed
    sample = manifest.get_sample(library_index - 1, flowcell, lane)
    if sample is None:
        return
    else:
        for barcode_ubam in sample.barcode_ubams:
            if not barcode_ubam.exists():
                raise ValueError(f"{barcode_ubam} does not exist")

    # create a subdirectory for alignment
    (sample.lane_dir / "alignment").mkdir(exist_ok=True)

    bead_structure = sample.get_bead_structure()
    xc_range = ":".join(f"{i}-{j}" for c, i, j in bead_structure if c == "C")
    xm_range = ":".join(f"{i}-{j}" for c, i, j in bead_structure if c == "M")

    # if multiple barcodes correspond to one sample, merge them
    if len(sample.barcode_ubams) > 1:
        cmd = config.picard_cmd("MergeSamFiles", manifest.tmp_dir)
        cmd.extend(
            [
                "--OUTPUT",
                sample.raw_ubam,
                "--SORT_ORDER",
                "unsorted",
                "--ASSUME_SORTED",
                "true",
            ]
        )
        for ubam_file in sample.barcode_ubams:
            cmd.extend(["--INPUT", ubam_file])

        run_command(cmd, "MergeBamFiles", sample)
    else:
        # otherwise, just rename the file
        os.rename(sample.barcode_ubams[0], sample.raw_ubam)

    procs = []

    # tag reads with the cellular barcode (XC tag)
    cmd = config.dropseq_cmd(
        "TagBamWithReadSequenceExtended",
        sample.raw_ubam,
        "/dev/stdout",
        manifest.tmp_dir,
    )
    cmd.extend(
        [
            f"SUMMARY={sample.cellular_tagged_summary}",
            f"BASE_RANGE={xc_range}",
            f"BASE_QUALITY={sample.base_quality}",
            "BARCODED_READ=1",
            "DISCARD_READ=false",
            "TAG_NAME=XC",
            "NUM_BASES_BELOW_QUALITY=1",
        ]
    )

    procs.append(
        start_popen(
            cmd,
            "TagBamWithReadSequenceExtended (Cellular)",
            sample,
            lane,
        )
    )

    # tag reads with the molecular barcode / UMI (XM tag)
    cmd = config.dropseq_cmd(
        "TagBamWithReadSequenceExtended", "/dev/stdin", "/dev/stdout", manifest.tmp_dir
    )
    cmd.extend(
        [
            f"SUMMARY={sample.molecular_tagged_summary}",
            f"BASE_RANGE={xm_range}",
            f"BASE_QUALITY={sample.base_quality}",
            "BARCODED_READ=1",
            "DISCARD_READ=true",
            "TAG_NAME=XM",
            "NUM_BASES_BELOW_QUALITY=1",
        ]
    )

    procs.append(
        start_popen(
            cmd,
            "TagBamWithReadSequenceExtended (Molecular)",
            sample,
            lane,
            procs[-1],
        )
    )

    # filter low-quality reads
    cmd = config.dropseq_cmd("FilterBam", "/dev/stdin", "/dev/stdout", manifest.tmp_dir)
    cmd.extend(["PASSING_READ_THRESHOLD=0.1", "TAG_REJECT=XQ"])

    procs.append(start_popen(cmd, "FilterBam", sample, lane, procs[-1]))

    if sample.start_sequence != constants.NO_START_SEQUENCE:
        # trim reads with the starting sequence
        cmd = config.dropseq_cmd(
            "TrimStartingSequence", "/dev/stdin", "/dev/stdout", manifest.tmp_dir
        )
        cmd.extend(
            [
                f"OUTPUT_SUMMARY={sample.trimming_summary}",
                f"SEQUENCE={sample.start_sequence}",
                "MISMATCHES=0",
                "NUM_BASES=5",
            ]
        )

        procs.append(start_popen(cmd, "TrimStartingSequence", sample, lane, procs[-1]))

    # adapter-aware poly(A) trimming
    cmd = config.dropseq_cmd(
        "PolyATrimmer", "/dev/stdin", sample.polya_filtered_ubam, manifest.tmp_dir
    )
    cmd.extend(
        [
            f"OUTPUT_SUMMARY={sample.polya_filtering_summary}",
            "MISMATCHES=0",
            "NUM_BASES=6",
            "USE_NEW_TRIMMER=true",
        ]
    )

    procs.append(start_popen(cmd, "PolyATrimmer", sample, lane, procs[-1]))

    # close intermediate streams
    for p in procs[:-1]:
        p.stdout.close()

    # wait for the final process to finish
    procs[-1].communicate()
    log.debug("Finished with pre-alignment processing")

    # convert to fastq for STAR
    cmd = config.picard_cmd("SamToFastq", manifest.tmp_dir)
    cmd.extend(
        [
            "-I",
            sample.polya_filtered_ubam,
            "-F",
            sample.polya_filtered_fastq,
        ]
    )
    run_command(cmd, "SamToFastq", sample, lane)

    # map reads to the genome sequence using STAR
    cmd = [
        "STAR",
        "--genomeDir",
        sample.reference.genome_dir,
        "--readFilesIn",
        sample.polya_filtered_fastq,
        "--readFilesCommand",
        "zcat",
        "--outFileNamePrefix",
        sample.star_prefix,
        "--outStd",
        "Log",
        "--outSAMtype",
        "BAM",
        "Unsorted",
        "--outBAMcompression",
        "0",
        "--limitOutSJcollapsed",
        "5000000",
        "--runThreadN",
        "8",
    ]
    run_command(cmd, "STAR", sample, lane)

    # check alignment quality
    log.debug(
        f"Writing alignment statistics for {sample.aligned_bam} to"
        f" {sample.alignment_pickle}"
    )
    write_alignment_stats(sample.aligned_bam, sample.alignment_pickle)

    procs = []

    # sort the aligned bam by queryname
    cmd = config.picard_cmd("SortSam", manifest.tmp_dir, mem="24g")
    cmd.extend(
        [
            "-I",
            sample.aligned_bam,
            "-O",
            "/dev/stdout",
            "--SORT_ORDER",
            "queryname",
        ]
    )
    procs.append(start_popen(cmd, "SortSam", sample, lane))

    # merge the unmapped bam and the aligned bam
    cmd = config.picard_cmd("MergeBamAlignment", manifest.tmp_dir, mem="24g")
    cmd.extend(
        [
            "-R",
            sample.reference.fasta,
            "--UNMAPPED",
            sample.polya_filtered_ubam,
            "--ALIGNED",
            "/dev/stdin",
            "-O",
            "/dev/stdout",
            "--COMPRESSION_LEVEL",
            "0",
            "--INCLUDE_SECONDARY_ALIGNMENTS",
            "false",
            "--CLIP_ADAPTERS",
            "false",
        ]
    )

    procs.append(start_popen(cmd, "MergeBamAlignment", sample, lane, procs[-1]))

    # tag reads with the genomic interval they overlap
    cmd = config.dropseq_cmd(
        "TagReadWithInterval", "/dev/stdin", "/dev/stdout", manifest.tmp_dir
    )
    cmd.extend([f"INTERVALS={sample.reference.intervals}", "TAG=XG"])

    procs.append(start_popen(cmd, "TagReadWithInterval", sample, lane, procs[-1]))

    # tag reads with gene function
    cmd = config.dropseq_cmd(
        "TagReadWithGeneFunction",
        "/dev/stdin",
        sample.processed_bam,
        manifest.tmp_dir,
        compression=5,
    )
    cmd.extend(
        [f"ANNOTATIONS_FILE={sample.reference.annotations}", "CREATE_INDEX=false"]
    )

    procs.append(start_popen(cmd, "TagReadWithGeneFunction", sample, lane, procs[-1]))

    # close intermediate streams
    for p in procs[:-1]:
        p.stdout.close()

    # wait for the final process to finish
    procs[-1].communicate()
    log.debug("Finished with post-alignment processing")

    # remove files that are no longer needed
    os.remove(sample.polya_filtered_ubam)
    os.remove(sample.polya_filtered_fastq)
    os.remove(sample.aligned_bam)

    log.debug("Setting group permissions")
    give_group_access(sample.dir)
    if config.gs_path is not None:
        log.debug("Copying data to google storage")
        rsync_to_google(
            sample.lane_dir,
            config.gs_path / sample.lane_dir.relative_to(config.library_dir),
        )

    log.info(f"Alignment for {sample} completed")
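

# A minimal sketch (using only the standard library) of the piping pattern the
# steps above rely on: each stage's stdin is the previous stage's stdout, the
# intermediate stdouts are closed in the parent, and we wait on the last stage.
# `start_popen` in this package presumably wraps this pattern and adds per-sample
# logging; the commands below are placeholders, not the actual pipeline tools.
import subprocess


def _pipe_chain_example():
    procs = []
    procs.append(
        subprocess.Popen(["printf", "three\ntwo\none\n"], stdout=subprocess.PIPE)
    )
    procs.append(
        subprocess.Popen(["sort"], stdin=procs[-1].stdout, stdout=subprocess.PIPE)
    )
    procs.append(subprocess.Popen(["uniq", "-c"], stdin=procs[-1].stdout))

    # closing the parent's copy of each intermediate pipe ensures the producer
    # sees a broken pipe if a downstream stage exits early
    for p in procs[:-1]:
        p.stdout.close()

    # wait for the final process to finish
    procs[-1].communicate()
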
def main(
    fastq_r1,
    fastq_r2,
    puck_dir,
    output_dir,
    tag_sequence,
    constant_sequence,
    extra_pdf=False,
    debug=False,
    percentile=95.0,
    tag_mismatch=1,
    min_dist=1000,
):
    """
    This script generates some plots for mapping barcoded reads.

    Reads sequences from FASTQ_R1 and FASTQ_R2. Assumes that the first read
    contains a 15bp barcode split across two locations, along with an 8bp UMI.
    The second read is assumed to have TAG_SEQUENCE in bases 20-40.
    """
    create_logger(debug, dryrun=False)
    output_dir = Path(output_dir)
    output_pdf = output_dir / "plots.pdf"
    log.info(f"Saving output to {output_dir}")

    log.debug(f"Reading from {fastq_r1}")
    with gzip.open(fastq_r1, "rt") as fh:
        r1_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading from {fastq_r2}")
    with gzip.open(fastq_r2, "rt") as fh:
        r2_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    assert len(r1_reads) == len(r2_reads), "read different number of reads"
    log.info(f"Total of {len(r1_reads)} reads")

    bead_barcodes, xy = get_barcodes(Path(puck_dir))

    constant_sequence_hset = slideseq.bead_matching.hamming_set(
        slideseq.bead_matching.initial_h_set(constant_sequence), include_N=False
    )

    barcode_codes = [DEGENERATE_BASE_DICT[b] for b in tag_sequence]

    # get unique barcodes
    seq_barcodes = sorted({r1[:8] + r1[26:32] for r1 in r1_reads})
    # remove poly-T sequence if present
    seq_barcodes = [seq for seq in seq_barcodes if set(seq) != {"T"}]

    log.info(f"{len(seq_barcodes)} unique seq barcodes")

    log.debug("calculating bead network")
    degen_bead_barcodes, bead_xy, barcode_mapping = bead_network(
        bead_barcodes, seq_barcodes, xy
    )

    log.debug("calculating tag network")
    tag_sequences, tag_to_degen_tag = tag_network(
        r2_reads, barcode_codes, len(tag_sequence) - tag_mismatch
    )

    with gzip.open(output_dir / "tag_mapping.txt.gz", "wt") as out:
        for tag, dtag in tag_to_degen_tag.items():
            print(f"{tag}\t{dtag}", file=out)

    with gzip.open(output_dir / "raw_sequences.txt.gz", "wt") as out:
        print("\n".join(tag_sequences), file=out)

    umis_per_bead, raw_umis_per_bead, reads_per_bead = match_tags(
        r1_reads, r2_reads, barcode_mapping, constant_sequence_hset, tag_to_degen_tag
    )

    slideseq.bead_matching.write_barcode_mapping(
        barcode_mapping, bead_xy, output_dir / "barcode_matching.txt.gz"
    )
    slideseq.bead_matching.write_barcode_xy(
        degen_bead_barcodes, bead_xy, output_dir / "barcode_coordinates.txt.gz"
    )

    filtered_barcodes = [bc for bc in degen_bead_barcodes if bc in umis_per_bead]
    bead_xy_a = np.vstack([bead_xy[dbc] for dbc in filtered_barcodes])

    log.info("Writing output files")
    beads = sorted(umis_per_bead)
    tags = sorted({t for b in beads for t in umis_per_bead[b]})
    raw_tags = sorted({t for b in beads for t in raw_umis_per_bead[b]})

    with gzip.open(output_dir / "beads.txt.gz", "wt") as out:
        print("\n".join(beads), file=out)

    with gzip.open(output_dir / "tags.txt.gz", "wt") as out:
        print("\n".join(tags), file=out)

    with gzip.open(output_dir / "raw_tags.txt.gz", "wt") as out:
        print("\n".join(raw_tags), file=out)

    log.debug("Writing umi matrix")
    m = write_matrix(umis_per_bead, beads, tags, output_dir / "umi_matrix.mtx.gz")

    log.debug("Writing raw umi matrix")
    write_matrix(
        raw_umis_per_bead, beads, raw_tags, output_dir / "raw_umi_matrix.mtx.gz"
    )

    log.debug("Writing read matrix")
    write_matrix(reads_per_bead, beads, tags, output_dir / "read_matrix.mtx.gz")

    bead_dist = []
    bead_pairs = []

    # tags that appear in >1 bead
    two_beads = np.asarray((m > 0).sum(0) > 1).squeeze().nonzero()[0]
    for i in two_beads:
        # beads with this tag
        nz_beads = np.asarray(m[:, i].todense()).squeeze().nonzero()[0]
        for bi, bj in itertools.combinations(nz_beads, 2):
            bead_pairs.append((bi, bj))
            bead_dist.append(
                np.sqrt(((bead_xy_a[bi, :] - bead_xy_a[bj, :]) ** 2).sum())
            )

    long_pairs = [b_p for b_p, b_d in zip(bead_pairs, bead_dist) if b_d > min_dist]
    log.debug(f"Identified {len(long_pairs)} pairs with distance > {min_dist}")

    if extra_pdf:
        extra_pdf = output_dir / "extra_plots.pdf"
        log.info(f"Saving extra plots to {extra_pdf}")
        extra_pdf = PdfPages(extra_pdf)

        umi_counts = Counter(r[32:41] for r in r1_reads)
        log.debug(
            f"Found {len(umi_counts)} UMIs with {sum(umi_counts.values())} total counts"
        )

        plot_log_hist(umi_counts.values(), "Reads per UMI", extra_pdf)
        plot_log_hist(
            [sum(umis_per_bead[bc].values()) for bc in umis_per_bead],
            "UMIs per bead",
            extra_pdf,
        )
        plot_hist(bead_dist, "Distance distribution for paired tags", extra_pdf)

        extra_pdf.close()

    pdf_pages = PdfPages(output_pdf)

    log.info("Making plots")
    umi_dist = [sum(umis_per_bead[bc].values()) for bc in filtered_barcodes]
    read_dist = [sum(reads_per_bead[bc].values()) for bc in filtered_barcodes]

    spatial_plot_plus_links(
        bead_xy_a,
        long_pairs,
        umi_dist,
        f"UMIs per bead and pairs > {min_dist} pixels",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        umi_dist,
        "UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        np.log10(umi_dist),
        "log10 UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        read_dist,
        "Reads per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        np.log10(read_dist),
        "log10 reads per bead",
        pdf_pages,
        pct=percentile,
    )

    pdf_pages.close()
    log.info("Done!")
def main(
    library_index: int, manifest_file: str, debug: bool = False, log_file: str = None
):
    create_logger(debug=debug, log_file=log_file)
    config = get_config()

    log.debug(f"Reading manifest from {manifest_file}")
    manifest = Manifest.from_file(Path(manifest_file))

    # task array is 1-indexed
    library = manifest.get_library(library_index - 1)

    if not library.gen_downsampling:
        log.debug("Downsampling not requested, nothing to do")
        return

    log.info(f"Downsampling alignments for library {library.name}")
    library.downsample_dir.mkdir(exist_ok=True, parents=True)

    if library.run_barcodematching:
        # this will be much faster on matched bams
        downsample_input = library.matched
        downsample_tag = "XB"
    else:
        downsample_input = library.merged
        downsample_tag = "XC"

    downsample_output = []

    # progressively downsample the BAM from largest to smallest
    input_bam = downsample_input.bam
    for n in range(9, 0, -1):
        ratio = n / 10

        downsampled_bam = downsample_input.downsampled_bam(ratio)
        downsample_output.append(
            downsample_dge(
                config=config,
                bam_file=input_bam,
                downsampled_bam=downsampled_bam,
                cell_tag=downsample_tag,
                library=library,
                ratio=ratio,
                tmp_dir=manifest.tmp_dir,
            )
        )

        if input_bam != downsample_input.bam:
            os.remove(input_bam)

        input_bam = downsampled_bam

    # remove the final downsampled bam
    if input_bam != downsample_input.bam:
        os.remove(input_bam)

    plot_downsampling(
        downsample_output,
        downsample_input.digital_expression_summary,
        downsample_input.downsampling_pdf,
    )

    log.debug("Setting group permissions")
    give_group_access(library.dir)
    if config.gs_path is not None:
        log.debug("Copying data to google storage")
        rsync_to_google(library.dir, config.gs_path / library.date_name)

    log.info(f"Downsampling for {library} complete")
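

# A worked sketch of the arithmetic behind the progressive loop above: each
# iteration feeds the previous downsampled BAM into the next step, so to reach a
# target fraction of the ORIGINAL bam the per-step retention must be the ratio
# of consecutive targets (0.8/0.9, 0.7/0.8, ...), not the target fraction
# itself. The probability actually passed to the downsampling tool is computed
# inside `downsample_dge` (not shown here); this example only illustrates the
# intended math.
def _progressive_ratio_example():
    kept = 1.0
    for n in range(9, 0, -1):
        target = n / 10
        step_probability = target / (target + 0.1)  # e.g. 0.9/1.0, then 0.8/0.9, ...
        kept *= step_probability
        assert abs(kept - target) < 1e-9
        print(f"target={target:.1f}  per-step p={step_probability:.3f}")
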
def main(
    library_index: int, manifest_file: str, debug: bool = False, log_file: str = None
):
    create_logger(debug=debug, log_file=log_file)
    config = get_config()

    log.debug(f"Reading manifest from {manifest_file}")
    manifest = Manifest.from_file(Path(manifest_file))

    # task array is 1-indexed
    library = manifest.get_library(library_index - 1)

    log.debug(f"Processing alignments for library {library.name}")

    # define barcode matching files, if needed
    barcode_matching_file = (
        library.barcode_matching_dir / f"{library}_barcode_matching.txt.gz"
    )
    barcode_coordinate_file = (
        library.barcode_matching_dir / f"{library}_barcode_xy.txt.gz"
    )
    matched_barcodes_file = (
        library.barcode_matching_dir / f"{library}_matched_barcodes.txt.gz"
    )

    # combine check_alignments_quality files, and plot histograms
    alignment_quality.combine_alignment_stats(library)

    # merge bam files
    cmd = config.picard_cmd("MergeSamFiles", manifest.tmp_dir)
    cmd.extend(
        [
            "--CREATE_INDEX",
            "true",
            "--CREATE_MD5_FILE",
            "false",
            "--OUTPUT",
            library.merged.bam,
            "--SORT_ORDER",
            "coordinate",
            "--ASSUME_SORTED",
            "true",
        ]
    )

    for bam_file in library.processed_bams:
        cmd.extend(["--INPUT", bam_file])

    run_command(cmd, "MergeBamFiles", library)

    # generate various metrics files, including the digital expression matrix
    calc_alignment_metrics(config, library.merged, library, manifest.tmp_dir)

    if library.run_barcodematching:
        library.barcode_matching_dir.mkdir(exist_ok=True, parents=True)
        shutil.copy(library.bead_barcodes, library.barcode_matching_dir)
        shutil.copy(library.bead_locations, library.barcode_matching_dir)

        barcode_list, barcode_mapping, bead_xy, _ = bead_matching.match_barcodes(
            library.merged.selected_cells, library.bead_barcodes, library.bead_locations
        )

        bead_matching.write_barcode_mapping(
            barcode_mapping, bead_xy, barcode_matching_file
        )
        bead_matching.write_barcode_xy(barcode_list, bead_xy, barcode_coordinate_file)

        with gzip.open(matched_barcodes_file, "wt") as out:
            for bead_bc in sorted(set(barcode_mapping.values())):
                print(bead_bc, file=out)

        # subset to the matched beads and add the combined barcode as XB tag
        write_retagged_bam(library.merged.bam, library.matched.bam, barcode_mapping)

        # do it all again, but it should be faster on the smaller file
        calc_alignment_metrics(
            config,
            library.matched,
            library,
            manifest.tmp_dir,
            matched_barcodes_file,
            "XB",
        )

        write_sparse_matrix(library)

        make_library_plots(library, bead_xy)
    else:
        make_library_plots(library)

    # remove unneeded files now that we're done
    for bam_file in library.processed_bams:
        log.debug(f"Removing {bam_file}")
        os.remove(bam_file)

    if matched_barcodes_file.exists():
        log.debug(f"Removing {matched_barcodes_file}")
        os.remove(matched_barcodes_file)

    log.debug("Setting group permissions")
    give_group_access(library.dir)
    if config.gs_path is not None:
        log.debug("Copying data to google storage")
        rsync_to_google(library.dir, config.gs_path / library.date_name)

    log.info(f"Processing for {library} complete")
def main(
    genome_name: str,
    reference_fasta: str,
    reference_gtf: str,
    mt_sequence: str,
    filter_biotypes: list[str],
    overwrite: bool = False,
    debug: bool = False,
    dryrun: bool = False,
    log_file: str = None,
):
    create_logger(debug=debug, dryrun=dryrun, log_file=log_file)
    env_name = slideseq.util.get_env_name()
    log.debug(f"Running in Conda env {env_name}")
    config = get_config()

    reference_fasta = Path(reference_fasta)
    reference_gtf = Path(reference_gtf)
    output_dir = config.reference_dir / genome_name
    star_dir = output_dir / "STAR"

    log.info(f"Building reference for genome {genome_name}")

    if output_dir.exists():
        if overwrite:
            log.warning(f"{output_dir} exists, overwriting existing reference")
            if not dryrun:
                if star_dir.exists():
                    log.debug(f"Removing and remaking {star_dir}")
                    shutil.rmtree(star_dir)
                    star_dir.mkdir()
                if (output_dir / f"{genome_name}.dict").exists():
                    log.debug(f"Removing {output_dir}/{genome_name}.dict")
                    os.remove(output_dir / f"{genome_name}.dict")
        else:
            log.error(f"{star_dir} already exists and overwrite=False, aborting")
            sys.exit(1)
    else:
        log.info(f"Creating output directory {star_dir}")
        if not dryrun:
            star_dir.mkdir(parents=True)

    if check_gtf(reference_gtf, mt_sequence):
        log.info("GTF has all required fields")
    else:
        log.error("Need to fix GTF errors")
        return

    log.info(f"Creating genome reference for {reference_fasta}")

    # this script will create a reference for slideseq
    with importlib.resources.path(
        slideseq.scripts, "build_reference.sh"
    ) as qsub_script:
        mkref_args = qsub_args(
            log_file=output_dir / "build_reference.log",
            CONDA_ENV=env_name,
            PICARD_JAR=config.picard,
            DROPSEQ_DIR=config.dropseq_dir,
            GENOME_NAME=genome_name,
            REFERENCE_FASTA=reference_fasta,
            REFERENCE_GTF=reference_gtf,
            OUTPUT_DIR=output_dir,
            MT_SEQUENCE=mt_sequence,
            FILTERED_BIOTYPES=" ".join(f"G={biotype}" for biotype in filter_biotypes),
        )
        mkref_args.append(f"{qsub_script.absolute()}")

        log.debug(f"Build-reference command:\n\t{' '.join(mkref_args)}")
        if dryrun:
            return

        # qsub may sporadically fail due to network issues, so retry a few times
        for _ in range(constants.MAX_QSUB):
            proc = run(mkref_args, capture_output=True, text=True)
            if int(proc.returncode) != 0:
                log.warning("qsub failed, retrying")
                log.debug(f"Error: {proc.stderr}")
            else:
                break
        # the for-else clause runs only if the loop never hit `break`
        else:
            log.error(f"Unable to launch build_ref job for {reference_fasta}")
def main(
    fastq_r1,
    fastq_r2,
    barcodes,
    locations,
    tag_sequence,
    output_pdf,
    extra_pdf=None,
    debug=False,
    percentile=95.0,
):
    """
    This script generates some plots for mapping barcoded reads.

    Reads sequences from FASTQ_R1 and FASTQ_R2. Assumes that the first read
    contains a 15bp barcode split across two locations, along with an 8bp UMI.
    The second read is assumed to have TAG_SEQUENCE in bases 20-40.
    """
    create_logger(debug, dryrun=False)
    output_pdf = Path(output_pdf)

    log.debug(f"Reading from {fastq_r1}")
    with gzip.open(fastq_r1, "rt") as fh:
        r1_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading from {fastq_r2}")
    with gzip.open(fastq_r2, "rt") as fh:
        r2_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading {barcodes}")
    with open(barcodes) as fh:
        raw_bcs = ["".join(line.strip().split(",")) for line in fh]

    log.debug(f"Reading {locations}")
    with open(locations) as fh:
        x = np.array([float(v) for v in fh.readline().strip().split(",")])
        y = np.array([float(v) for v in fh.readline().strip().split(",")])

    xy = np.vstack((x, y)).T

    if extra_pdf is not None:
        extra_pdf_pages = PdfPages(extra_pdf)

        umi_counts = Counter(r[32:41] for r in r1_reads)
        log.debug(
            f"Found {len(umi_counts)} UMIs with {sum(umi_counts.values())} total counts"
        )

        plot_log_hist(umi_counts.values(), "Reads per UMI", extra_pdf_pages)
    else:
        extra_pdf_pages = None

    # pre-emptively remove poly-T/N sequences
    ok_barcodes = [not set(bc).issubset({"T", "N"}) for bc in raw_bcs]
    xy = xy[ok_barcodes, :]
    bead_barcodes = [bc for ok, bc in zip(ok_barcodes, raw_bcs) if ok]

    log.info(f"Read {len(raw_bcs)} barcodes and filtered to {len(bead_barcodes)}")

    seq_barcodes = sorted(r1[:8] + r1[26:32] for r1 in r1_reads)
    # remove poly-T sequence if present
    seq_barcodes = [seq for seq in seq_barcodes if set(seq) != {"T"}]

    log.info(f"Found {len(set(seq_barcodes))} unique barcodes in sequencing data")

    log.info("Computing barcode matching")
    log.debug("Computing radius neighbor graph")
    # adjacency matrix for all beads within radius of each other
    radius_matrix = radius_neighbors_graph(xy, radius=10.0)

    log.debug("Computing hamming neighbor graph")
    # adjacency matrix for all barcodes within hamming distance 1
    hamming_matrix = hamming1_adjacency(bead_barcodes)

    # just multiply together to get the combined adjacency matrix!
    combined_graph = nx.from_scipy_sparse_matrix(radius_matrix.multiply(hamming_matrix))

    # add xy coordinates to the graph so we can analyze them later
    for n, (x, y) in zip(combined_graph.nodes, xy):
        combined_graph.nodes[n]["x"] = x
        combined_graph.nodes[n]["y"] = y

    # get connected components to find groups of similar/close barcodes
    bead_groups = list(nx.connected_components(combined_graph))

    # calculate degenerate (ambiguous bases -> N) barcodes
    degen_bead_barcodes = [
        degen_barcode({bead_barcodes[j] for j in bg}) for bg in bead_groups
    ]

    log.debug(
        f"Collapsed {len(bead_groups)} bead groups into"
        f" {len(set(degen_bead_barcodes))} barcodes"
    )

    # average xy for grouped beads to get centroids
    bead_xy = dict()
    for bg, degen_bc in zip(bead_groups, degen_bead_barcodes):
        bg_graph = combined_graph.subgraph(bg)
        mean_x, mean_y = np.array(
            [[nd["x"], nd["y"]] for _, nd in bg_graph.nodes(data=True)]
        ).mean(0)
        bead_xy[degen_bc] = (mean_x, mean_y)

    barcode_matching = bipartite_matching(
        bead_barcodes, degen_bead_barcodes, bead_groups, seq_barcodes
    )

    if extra_pdf is not None:
        tag_barcodes = [r2[20:40] for r2 in r2_reads]
        tag_counts = Counter(tag_barcodes)

        umis_per_tag = defaultdict(set)
        for r1, r2 in zip(r1_reads, r2_reads):
            umis_per_tag[r2[20:40]].add(r1[32:41])

        plot_log_hist(tag_counts.values(), "Reads per tag", extra_pdf_pages)
        plot_log_hist(
            list(map(len, umis_per_tag.values())), "UMIs per tag", extra_pdf_pages
        )

    log.debug(f"Counting UMIs and reads per bead for sequence {tag_sequence}")
    reads_per_umi_per_bead = defaultdict(Counter)
    umis_per_bead = defaultdict(set)
    reads_per_bead = Counter()

    for r1, r2 in zip(r1_reads, r2_reads):
        seq_bc = r1[:8] + r1[26:32]
        if seq_bc not in barcode_matching:
            continue
        if r2[20:40] != tag_sequence:
            continue

        bead_bc = barcode_matching[seq_bc]
        umi = r1[32:41]

        reads_per_umi_per_bead[bead_bc][umi] += 1
        umis_per_bead[bead_bc].add(umi)
        reads_per_bead[bead_bc] += 1

    filtered_barcodes = [bc for bc in degen_bead_barcodes if umis_per_bead[bc]]
    bead_xy_a = np.vstack([bead_xy[dbc] for dbc in filtered_barcodes])

    with gzip.open(output_pdf.with_suffix(".reads_per_umi.txt.gz"), "wt") as out:
        print("bead_barcodes\tumi\treads", file=out)
        for bc in filtered_barcodes:
            for umi in reads_per_umi_per_bead[bc]:
                print(f"{bc}\t{umi}\t{reads_per_umi_per_bead[bc][umi]}", file=out)

    with output_pdf.with_suffix(".txt").open("w") as out:
        print("bead_barcode\tumis\treads", file=out)
        for bc in filtered_barcodes:
            print(f"{bc}\t{len(umis_per_bead[bc])}\t{reads_per_bead[bc]}", file=out)

    if extra_pdf is not None:
        plot_log_hist(
            [len(umis_per_bead[bc]) for bc in filtered_barcodes],
            "UMIs per bead",
            extra_pdf_pages,
        )
        extra_pdf_pages.close()

    pdf_pages = PdfPages(output_pdf)

    log.info("Making plots")
    spatial_plot(
        bead_xy_a,
        [len(umis_per_bead[bc]) for bc in filtered_barcodes],
        "UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [np.log10(len(umis_per_bead[bc])) for bc in filtered_barcodes],
        "log10 UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [reads_per_bead[bc] for bc in filtered_barcodes],
        "Reads per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [np.log10(1 + reads_per_bead[bc]) for bc in filtered_barcodes],
        "log10 reads per bead",
        pdf_pages,
        pct=percentile,
    )

    pdf_pages.close()
    log.info("Done!")
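

# A toy illustration (with hypothetical data) of the "just multiply together"
# step above: the element-wise product of the two adjacency matrices keeps an
# edge only when two beads are BOTH within the spatial radius AND within
# Hamming distance 1, and connected components are then taken over that
# intersection graph.
import numpy as np
from scipy.sparse import csr_matrix


def _combined_adjacency_example():
    # four toy beads; a 1 marks an edge in each graph
    radius = csr_matrix(
        np.array(
            [
                [0, 1, 1, 0],
                [1, 0, 0, 0],
                [1, 0, 0, 1],
                [0, 0, 1, 0],
            ]
        )
    )
    hamming = csr_matrix(
        np.array(
            [
                [0, 1, 0, 0],
                [1, 0, 0, 1],
                [0, 0, 0, 0],
                [0, 1, 0, 0],
            ]
        )
    )

    combined = radius.multiply(hamming)
    print(combined.toarray())
    # only the (0, 1) edge survives: beads 0 and 2 are close but not similar,
    # and beads 1 and 3 are similar but not close
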
def main(
    runs: list[str],
    demux: bool = True,
    align: bool = True,
    processing: bool = True,
    dryrun: bool = False,
    debug: bool = False,
    log_file: str = None,
):
    """
    Submit each RUN to the Slide-seq alignment pipeline.

    See README.md for instructions and requirements:
    github.com/MacoskoLab/slideseq-tools
    """
    create_logger(debug=debug, dryrun=dryrun, log_file=log_file)
    env_name = get_env_name()
    log.debug(f"Running in conda env {env_name}")
    config = get_config()

    # you shouldn't demux without aligning, that's weird
    if demux and not align:
        log.debug("Assuming --no-align should also set --no-demux")
        demux = False

    log.debug("Fetching Google credentials")
    google_creds = gutil.get_secrets_manager_credentials(config.gsecret_name)

    log.debug("Setting up Google Drive service")
    drive_service = gutil.get_service(google_creds)

    # get a pandas DataFrame of the worksheet
    log.debug(
        f"Downloading Google Sheet, id={config.gsheet_id} worksheet={config.worksheet}"
    )
    worksheet_df = gutil.GoogleSheet(drive_service, config.gsheet_id)[config.worksheet]
    worksheet_df = worksheet_df.dropna(axis=0, how="all")

    log.debug(f"Retrieved worksheet {config.worksheet} with {len(worksheet_df)} rows")

    log.info(f"Beginning submission for {len(runs)} runs")
    submitted = set()
    run_errors = set()

    for run_name in runs:
        run_df = worksheet_df.loc[worksheet_df["run_name"] == run_name]
        if not len(run_df):
            log.warning(f"{run_name} not found in worksheet; please add to sheet.")
            run_errors.add(run_name)
            continue

        log.debug(f"Found {len(run_df)} libraries in worksheet")

        # subset to metadata columns
        run_df = run_df[constants.METADATA_COLS]
        run_df.columns = [c.lower() for c in constants.METADATA_COLS]

        if not validate_run_df(run_name, run_df):
            run_errors.add(run_name)
            continue

        # convert columns to desired types
        run_df = run_df.astype(constants.METADATA_TYPES)

        # data locations
        output_dir = config.workflow_dir / run_name
        flowcell_dirs = sorted(Path(fd) for fd in set(run_df.bclpath))

        manifest_file = output_dir / "manifest.yaml"
        metadata_file = output_dir / "metadata.csv"

        run_info_list = []
        for flowcell_dir in flowcell_dirs:
            run_info_list.append(get_run_info(flowcell_dir))

        manifest = Manifest(
            run_name=run_name,
            flowcell_dirs=flowcell_dirs,
            workflow_dir=output_dir,
            library_dir=config.library_dir,
            metadata_file=metadata_file,
            metadata=split_sample_lanes(run_df, run_info_list),
            email_addresses=sorted(
                set(e.strip() for v in run_df.email for e in v.split(","))
            ),
        )

        n_libraries = len(list(manifest.libraries))

        if not dryrun:
            log.debug("Creating output directories")
            output_dir.mkdir(exist_ok=True)
            manifest.log_dir.mkdir(exist_ok=True)
            if list(manifest.log_dir.glob("*.log")):
                log.warning(
                    "Log files already exist for this job, new output will be appended"
                )

            manifest.library_dir.mkdir(exist_ok=True)
            manifest.tmp_dir.mkdir(exist_ok=True)

            log.debug(f"Writing manifest to {manifest_file}")
            if metadata_file.exists():
                log.info(f"Overwriting metadata file {metadata_file}")

            manifest.to_file(manifest_file)

        if demux:
            # make various directories
            prepare_demux(run_info_list, manifest)
        elif not validate_demux(manifest):
            # it appears that demux was not run previously
            run_errors.add(run_name)
            continue
        elif not (align or validate_alignment(manifest, n_libraries)):
            # it appears that alignment was not run and wasn't requested
            run_errors.add(run_name)
            continue

        demux_jids = dict()

        # this script will check the sequencing directory, extract barcodes,
        # and demultiplex to BAM files
        with importlib.resources.path(
            slideseq.scripts, "demultiplex.sh"
        ) as qsub_script:
            for run_info in run_info_list:
                demux_args = qsub_args(
                    log_file=manifest.log_dir / run_info.demux_log,
                    email=",".join(manifest.email_addresses),
                    PICARD_JAR=config.picard,
                    TMP_DIR=manifest.tmp_dir,
                    FLOWCELL=run_info.flowcell,
                    BASECALLS_DIR=run_info.basecall_dir,
                    READ_STRUCTURE=run_info.read_structure,
                    OUTPUT_DIR=output_dir,
                )
                demux_args.extend(
                    [
                        "-t",
                        f"{min(run_info.lanes)}-{max(run_info.lanes)}",
                        f"{qsub_script.absolute()}",
                    ]
                )

                if demux:
                    demux_jids[run_info.flowcell] = attempt_qsub(
                        demux_args, run_name, "demultiplex", dryrun
                    )
                    if demux_jids[run_info.flowcell] is None:
                        run_errors.add(run_name)
                        continue
                else:
                    demux_jids[run_info.flowcell] = None
                    log.debug("Skipping demux step")

        if run_name in run_errors:
            continue

        alignment_jids = dict()

        # this script processes/filters the extracted uBAMs and aligns them to the
        # specified reference. it depends on previous jobs per lane, so we use
        # -hold_jid on the lane-specific demux job
        with importlib.resources.path(
            slideseq.scripts, "alignment.sh"
        ) as qsub_script:
            for run_info in run_info_list:
                for lane in run_info.lanes:
                    if demux and demux_jids[run_info.flowcell] is None:
                        log.debug(
                            f"Not aligning {lane} because demux was not submitted"
                        )
                        continue

                    alignment_args = qsub_args(
                        log_file=manifest.log_dir / run_info.alignment_log(lane),
                        email=",".join(manifest.email_addresses),
                        debug=debug,
                        CONDA_ENV=env_name,
                        FLOWCELL=run_info.flowcell,
                        LANE=lane,
                        MANIFEST=manifest_file,
                    )

                    if demux:
                        alignment_args.extend(
                            ["-hold_jid", f"{demux_jids[run_info.flowcell]}[{lane}]"]
                        )

                    alignment_args.extend(
                        ["-t", f"1-{n_libraries}", f"{qsub_script.absolute()}"]
                    )

                    if align:
                        alignment_jids[run_info.flowcell, lane] = attempt_qsub(
                            alignment_args, run_name, "alignment", dryrun
                        )
                        if alignment_jids[run_info.flowcell, lane] is None:
                            run_errors.add(run_name)
                    else:
                        alignment_jids[run_info.flowcell, lane] = None
                        log.info("Skipping alignment step")

        if run_name in run_errors:
            continue

        # this script analyzes the alignment output, generates plots, matches to the
        # puck, etc. it is per-library, which means each library needs to wait on the
        # alignment jobs for the relevant lane(s) that contain that library. we wait
        # on all the lanes, but use -hold_jid_ad to wait on only that library's
        # alignments
        with importlib.resources.path(
            slideseq.scripts, "processing.sh"
        ) as qsub_script:
            if align and any(
                alignment_jids[run_info.flowcell, lane] is None
                for run_info in run_info_list
                for lane in run_info.lanes
            ):
                log.debug("Not processing because some alignments were not submitted")
                continue

            processing_args = qsub_args(
                log_file=manifest.log_dir / "processing.$TASK_ID.log",
                email=",".join(manifest.email_addresses),
                debug=debug,
                CONDA_ENV=env_name,
                MANIFEST=manifest_file,
            )

            if align:
                for run_info in run_info_list:
                    for lane in run_info.lanes:
                        processing_args.extend(
                            [
                                "-hold_jid_ad",
                                f"{alignment_jids[run_info.flowcell, lane]}",
                            ]
                        )

            processing_args.extend(
                ["-t", f"1-{n_libraries}", f"{qsub_script.absolute()}"]
            )

            if processing:
                processing_jid = attempt_qsub(
                    processing_args, run_name, "processing", dryrun
                )
                if processing_jid is None:
                    run_errors.add(run_name)
            else:
                processing_jid = None
                log.info("Skipping processing step")

        if run_name in run_errors:
            continue

        # this (optional) script will downsample the alignment output and plot the
        # results. it is per-library, which means each library needs to wait on the
        # processing job for that library
        with importlib.resources.path(
            slideseq.scripts, "downsampling.sh"
        ) as qsub_script:
            if processing and processing_jid is None:
                log.debug("Not downsampling because processing job was not submitted")
                continue

            downsample_args = qsub_args(
                log_file=manifest.log_dir / "downsampling.$TASK_ID.log",
                email=",".join(manifest.email_addresses),
                debug=debug,
                CONDA_ENV=env_name,
                MANIFEST=manifest_file,
            )

            if processing:
                downsample_args.extend(["-hold_jid_ad", f"{processing_jid}"])

            downsample_args.extend(
                ["-t", f"1-{n_libraries}", f"{qsub_script.absolute()}"]
            )

            downsample_jid = attempt_qsub(
                downsample_args, run_name, "downsampling", dryrun
            )
            if downsample_jid is None:
                run_errors.add(run_name)
            else:
                submitted.add(run_name)

    if submitted and not dryrun:
        log.info(f"Flowcells {', '.join(submitted)} submitted for processing")
    else:
        log.info("No flowcells submitted for processing")

    if run_errors:
        log.info(
            f"Flowcells {', '.join(run_errors)} had errors -- see warnings above."
        )