Example #1
def main(run: str, debug: bool = False):
    """
    For a completed RUN, check for a barcode swap by counting the overlap of exact sequences
    """
    create_logger(debug=debug)
    config = get_config()

    run_dir = config.workflow_dir / run
    manifest_file = run_dir / "manifest.yaml"

    if not run_dir.exists():
        log.error(f"Could not find {run_dir}, has this run completed?")
        return
    elif not manifest_file.exists():
        log.error(f"Could not find {manifest_file}, has this run completed?")
        return

    manifest = Manifest.from_file(manifest_file)

    seq_barcodes = dict()
    bead_barcodes = dict()

    log.info("Reading sequenced barcodes and bead barcodes")
    for library in manifest.libraries:
        if library.merged.selected_cells.exists():
            with gzip.open(library.merged.selected_cells, "rt") as fh:
                seq_barcodes[library.name] = {line.strip() for line in fh}
        else:
            log.debug(f"No file {library.merged.selected_cells}, skipping")

        if library.bead_barcodes.exists():
            with library.bead_barcodes.open("r") as fh:
                bead_barcodes[library.name] = {
                    line.strip().replace(",", "")
                    for line in fh
                }
        else:
            log.debug(f"No file {library.bead_barcodes}, skipping")

    log.debug(f"Read data for {len(seq_barcodes)} libraries")

    max_len = max(map(len, bead_barcodes))

    print(
        f"{'library':{max_len}s}",
        *(f"{n:{max_len}s}" for n in sorted(bead_barcodes)),
        sep="\t",
    )
    for lib_a in sorted(seq_barcodes):
        print(f"{lib_a:{max_len}s}", end="\t")
        for lib_b in sorted(bead_barcodes):
            print(
                f"{len(seq_barcodes[lib_a] & bead_barcodes[lib_b]):{max_len}d}",
                end="\t",
            )
        print()
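
The printed table is meant to be scanned by eye; the same counts can also flag a likely swap automatically. A minimal sketch that reuses the seq_barcodes and bead_barcodes dicts built above (the report format here is illustrative, not part of the pipeline):

def report_likely_swaps(seq_barcodes: dict, bead_barcodes: dict) -> None:
    # A library looks swapped if its sequenced barcodes overlap another
    # library's bead barcodes more than its own bead barcodes.
    for lib_a, seqs in seq_barcodes.items():
        if lib_a not in bead_barcodes:
            continue
        own = len(seqs & bead_barcodes[lib_a])
        for lib_b, beads in bead_barcodes.items():
            if lib_b == lib_a:
                continue
            other = len(seqs & beads)
            if other > own:
                print(f"possible swap: {lib_a} matches {lib_b} ({other} > {own})")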
Example #2
def main(
    flowcell: str,
    lane: int,
    library_index: int,
    manifest_file: str,
    debug: bool = False,
    log_file: str = None,
):
    create_logger(debug=debug, log_file=log_file)
    config = get_config()

    log.debug(f"Reading manifest from {manifest_file}")
    manifest = Manifest.from_file(Path(manifest_file))

    # task array is 1-indexed
    sample = manifest.get_sample(library_index - 1, flowcell, lane)
    if sample is None:
        return
    else:
        for barcode_ubam in sample.barcode_ubams:
            if not barcode_ubam.exists():
                raise ValueError(f"{barcode_ubam} does not exist")

    # create a subdirectory for alignment
    (sample.lane_dir / "alignment").mkdir(exist_ok=True)

    bead_structure = sample.get_bead_structure()
    xc_range = ":".join(f"{i}-{j}" for c, i, j in bead_structure if c == "C")
    xm_range = ":".join(f"{i}-{j}" for c, i, j in bead_structure if c == "M")

    # if multiple barcodes correspond to one sample, merge them
    if len(sample.barcode_ubams) > 1:
        cmd = config.picard_cmd("MergeSamFiles", manifest.tmp_dir)
        cmd.extend([
            "--OUTPUT",
            sample.raw_ubam,
            "--SORT_ORDER",
            "unsorted",
            "--ASSUME_SORTED",
            "true",
        ])

        for ubam_file in sample.barcode_ubams:
            cmd.extend(["--INPUT", ubam_file])

        run_command(cmd, "MergeBamFiles", sample)
    else:
        # otherwise, just rename the file
        os.rename(sample.barcode_ubams[0], sample.raw_ubam)

    procs = []

    cmd = config.dropseq_cmd(
        "TagBamWithReadSequenceExtended",
        sample.raw_ubam,
        "/dev/stdout",
        manifest.tmp_dir,
    )
    cmd.extend([
        f"SUMMARY={sample.cellular_tagged_summary}",
        f"BASE_RANGE={xc_range}",
        f"BASE_QUALITY={sample.base_quality}",
        "BARCODED_READ=1",
        "DISCARD_READ=false",
        "TAG_NAME=XC",
        "NUM_BASES_BELOW_QUALITY=1",
    ])

    procs.append(
        start_popen(
            cmd,
            "TagBamWithReadSequenceExtended (Cellular)",
            sample,
            lane,
        ))

    # Tag bam with read sequence extended molecular
    cmd = config.dropseq_cmd("TagBamWithReadSequenceExtended", "/dev/stdin",
                             "/dev/stdout", manifest.tmp_dir)
    cmd.extend([
        f"SUMMARY={sample.molecular_tagged_summary}",
        f"BASE_RANGE={xm_range}",
        f"BASE_QUALITY={sample.base_quality}",
        "BARCODED_READ=1",
        "DISCARD_READ=true",
        "TAG_NAME=XM",
        "NUM_BASES_BELOW_QUALITY=1",
    ])
    procs.append(
        start_popen(
            cmd,
            "TagBamWithReadSequenceExtended (Molecular)",
            sample,
            lane,
            procs[-1],
        ))

    # Filter low-quality reads
    cmd = config.dropseq_cmd("FilterBam", "/dev/stdin", "/dev/stdout",
                             manifest.tmp_dir)
    cmd.extend(["PASSING_READ_THRESHOLD=0.1", "TAG_REJECT=XQ"])

    procs.append(start_popen(cmd, "FilterBam", sample, lane, procs[-1]))

    if sample.start_sequence != constants.NO_START_SEQUENCE:
        # Trim reads with starting sequence
        cmd = config.dropseq_cmd("TrimStartingSequence", "/dev/stdin",
                                 "/dev/stdout", manifest.tmp_dir)
        cmd.extend([
            f"OUTPUT_SUMMARY={sample.trimming_summary}",
            f"SEQUENCE={sample.start_sequence}",
            "MISMATCHES=0",
            "NUM_BASES=5",
        ])

        procs.append(
            start_popen(cmd, "TrimStartingSequence", sample, lane, procs[-1]))

    # Adapter-aware poly A trimming
    cmd = config.dropseq_cmd("PolyATrimmer", "/dev/stdin",
                             sample.polya_filtered_ubam, manifest.tmp_dir)
    cmd.extend([
        f"OUTPUT_SUMMARY={sample.polya_filtering_summary}",
        "MISMATCHES=0",
        "NUM_BASES=6",
        "USE_NEW_TRIMMER=true",
    ])

    procs.append(start_popen(cmd, "PolyATrimmer", sample, lane, procs[-1]))

    # close intermediate streams
    for p in procs[:-1]:
        p.stdout.close()

    # wait for final process to finish
    procs[-1].communicate()
    log.debug("Finished with pre-alignment processing")

    # convert the filtered uBAM to fastq for STAR
    cmd = config.picard_cmd("SamToFastq", manifest.tmp_dir)
    cmd.extend([
        "-I",
        sample.polya_filtered_ubam,
        "-F",
        sample.polya_filtered_fastq,
    ])
    run_command(cmd, "SamToFastq", sample, lane)

    # Map reads to genome sequence using STAR
    cmd = [
        "STAR",
        "--genomeDir",
        sample.reference.genome_dir,
        "--readFilesIn",
        sample.polya_filtered_fastq,
        "--readFilesCommand",
        "zcat",
        "--outFileNamePrefix",
        sample.star_prefix,
        "--outStd",
        "Log",
        "--outSAMtype",
        "BAM",
        "Unsorted",
        "--outBAMcompression",
        "0",
        "--limitOutSJcollapsed",
        "5000000",
        "--runThreadN",
        "8",
    ]

    run_command(cmd, "STAR", sample, lane)

    # Check alignments quality
    log.debug(f"Writing alignment statistics for {sample.aligned_bam} to"
              f" {sample.alignment_pickle}")
    write_alignment_stats(sample.aligned_bam, sample.alignment_pickle)

    procs = []

    # Sort aligned bam
    cmd = config.picard_cmd("SortSam", manifest.tmp_dir, mem="24g")
    cmd.extend([
        "-I",
        sample.aligned_bam,
        "-O",
        "/dev/stdout",
        "--SORT_ORDER",
        "queryname",
    ])
    procs.append(start_popen(cmd, "SortSam", sample, lane))

    # Merge unmapped bam and aligned bam
    cmd = config.picard_cmd("MergeBamAlignment", manifest.tmp_dir, mem="24g")
    cmd.extend([
        "-R",
        sample.reference.fasta,
        "--UNMAPPED",
        sample.polya_filtered_ubam,
        "--ALIGNED",
        "/dev/stdin",
        "-O",
        "/dev/stdout",
        "--COMPRESSION_LEVEL",
        "0",
        "--INCLUDE_SECONDARY_ALIGNMENTS",
        "false",
        "--CLIP_ADAPTERS",
        "false",
    ])

    procs.append(start_popen(cmd, "MergeBamAlignment", sample, lane,
                             procs[-1]))

    # Tag read with interval
    cmd = config.dropseq_cmd("TagReadWithInterval", "/dev/stdin",
                             "/dev/stdout", manifest.tmp_dir)
    cmd.extend([f"INTERVALS={sample.reference.intervals}", "TAG=XG"])

    procs.append(
        start_popen(cmd, "TagReadWithInterval", sample, lane, procs[-1]))

    # Tag read with gene function
    cmd = config.dropseq_cmd(
        "TagReadWithGeneFunction",
        "/dev/stdin",
        sample.processed_bam,
        manifest.tmp_dir,
        compression=5,
    )
    cmd.extend([
        f"ANNOTATIONS_FILE={sample.reference.annotations}",
        "CREATE_INDEX=false"
    ])

    procs.append(
        start_popen(cmd, "TagReadWithGeneFunction", sample, lane, procs[-1]))

    # close intermediate streams
    for p in procs[:-1]:
        p.stdout.close()

    # wait for final process to finish
    procs[-1].communicate()
    log.debug("Finished with post-alignment processing")

    # remove unneeded files
    os.remove(sample.polya_filtered_ubam)
    os.remove(sample.polya_filtered_fastq)
    os.remove(sample.aligned_bam)

    log.debug("Setting group permissions")
    give_group_access(sample.dir)
    if config.gs_path is not None:
        log.debug("Copying data to google storage")
        rsync_to_google(
            sample.lane_dir,
            config.gs_path / sample.lane_dir.relative_to(config.library_dir),
        )

    log.info(f"Alignment for {sample} completed")
Example #3
def main(
    fastq_r1,
    fastq_r2,
    puck_dir,
    output_dir,
    tag_sequence,
    constant_sequence,
    extra_pdf=False,
    debug=False,
    percentile=95.0,
    tag_mismatch=1,
    min_dist=1000,
):
    """
    This script generates some plots for mapping barcoded reads.

    Reads sequences from FASTQ_R1 and FASTQ_R2. Assumes that the first read
    contains a 14bp barcode split across two locations, along with a 9bp UMI.
    The second read is assumed to have TAG_SEQUENCE in bases 20-40.
    """
    create_logger(debug, dryrun=False)

    output_dir = Path(output_dir)
    output_pdf = output_dir / "plots.pdf"

    log.info(f"Saving output to {output_dir}")

    log.debug(f"Reading from {fastq_r1}")
    with gzip.open(fastq_r1, "rt") as fh:
        r1_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading from {fastq_r2}")
    with gzip.open(fastq_r2, "rt") as fh:
        r2_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    assert len(r1_reads) == len(r2_reads), "read different number of reads"
    log.info(f"Total of {len(r1_reads)} reads")

    bead_barcodes, xy = get_barcodes(Path(puck_dir))

    constant_sequence_hset = slideseq.bead_matching.hamming_set(
        slideseq.bead_matching.initial_h_set(constant_sequence),
        include_N=False)

    barcode_codes = [DEGENERATE_BASE_DICT[b] for b in tag_sequence]

    # get unique barcodes
    seq_barcodes = sorted({r1[:8] + r1[26:32] for r1 in r1_reads})
    # remove poly-T sequence if present
    seq_barcodes = [seq for seq in seq_barcodes if set(seq) != {"T"}]

    log.info(f"{len(seq_barcodes)} unique seq barcodes")

    log.debug("calculating bead network")
    degen_bead_barcodes, bead_xy, barcode_mapping = bead_network(
        bead_barcodes, seq_barcodes, xy)

    log.debug("calculating tag network")
    tag_sequences, tag_to_degen_tag = tag_network(
        r2_reads, barcode_codes,
        len(tag_sequence) - tag_mismatch)

    with gzip.open(output_dir / "tag_mapping.txt.gz", "wt") as out:
        for tag, dtag in tag_to_degen_tag.items():
            print(f"{tag}\t{dtag}", file=out)

    with gzip.open(output_dir / "raw_sequences.txt.gz", "wt") as out:
        print("\n".join(tag_sequences), file=out)

    umis_per_bead, raw_umis_per_bead, reads_per_bead = match_tags(
        r1_reads, r2_reads, barcode_mapping, constant_sequence_hset,
        tag_to_degen_tag)

    slideseq.bead_matching.write_barcode_mapping(
        barcode_mapping, bead_xy, output_dir / "barcode_matching.txt.gz")

    slideseq.bead_matching.write_barcode_xy(
        degen_bead_barcodes, bead_xy,
        output_dir / "barcode_coordinates.txt.gz")

    filtered_barcodes = [
        bc for bc in degen_bead_barcodes if bc in umis_per_bead
    ]
    bead_xy_a = np.vstack([bead_xy[dbc] for dbc in filtered_barcodes])

    log.info("Writing output files")
    beads = sorted(umis_per_bead)
    tags = sorted({t for b in beads for t in umis_per_bead[b]})
    raw_tags = sorted({t for b in beads for t in raw_umis_per_bead[b]})

    with gzip.open(output_dir / "beads.txt.gz", "wt") as out:
        print("\n".join(beads), file=out)

    with gzip.open(output_dir / "tags.txt.gz", "wt") as out:
        print("\n".join(tags), file=out)

    with gzip.open(output_dir / "raw_tags.txt.gz", "wt") as out:
        print("\n".join(raw_tags), file=out)

    log.debug("Writing umi matrix")
    m = write_matrix(umis_per_bead, beads, tags,
                     output_dir / "umi_matrix.mtx.gz")

    log.debug("Writing raw umi matrix")
    write_matrix(raw_umis_per_bead, beads, raw_tags,
                 output_dir / "raw_umi_matrix.mtx.gz")

    log.debug("Writing read matrix")
    write_matrix(reads_per_bead, beads, tags,
                 output_dir / "read_matrix.mtx.gz")

    bead_dist = []
    bead_pairs = []

    # tags that appear in >1 bead
    two_beads = np.asarray((m > 0).sum(0) > 1).squeeze().nonzero()[0]
    for i in two_beads:
        # beads with this tag
        nz_beads = np.asarray(m[:, i].todense()).squeeze().nonzero()[0]
        for bi, bj in itertools.combinations(nz_beads, 2):
            bead_pairs.append((bi, bj))
            bead_dist.append(
                np.sqrt(((bead_xy_a[bi, :] - bead_xy_a[bj, :])**2).sum()))

    long_pairs = [
        b_p for b_p, b_d in zip(bead_pairs, bead_dist) if b_d > min_dist
    ]

    log.debug(f"Identified {len(long_pairs)} pairs with distance > {min_dist}")

    if extra_pdf:
        extra_pdf = output_dir / "extra_plots.pdf"
        log.info(f"Saving extra plots to {extra_pdf}")
        extra_pdf = PdfPages(extra_pdf)

        umi_counts = Counter(r[32:41] for r in r1_reads)
        log.debug(
            f"Found {len(umi_counts)} UMIs with {sum(umi_counts.values())} total counts"
        )

        plot_log_hist(umi_counts.values(), "Reads per UMI", extra_pdf)

        plot_log_hist(
            [sum(umis_per_bead[bc].values()) for bc in umis_per_bead],
            "UMIs per bead",
            extra_pdf,
        )

        plot_hist(bead_dist, "Distance distribution for paired tags",
                  extra_pdf)

        extra_pdf.close()

    pdf_pages = PdfPages(output_pdf)

    log.info("Making plots")

    umi_dist = [sum(umis_per_bead[bc].values()) for bc in filtered_barcodes]
    read_dist = [sum(reads_per_bead[bc].values()) for bc in filtered_barcodes]

    spatial_plot_plus_links(
        bead_xy_a,
        long_pairs,
        umi_dist,
        f"UMIs per bead and pairs > {min_dist} pixels",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        umi_dist,
        "UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        np.log10(umi_dist),
        "log10 UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        read_dist,
        "Reads per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        np.log10(read_dist),
        "log10 reads per bead",
        pdf_pages,
        pct=percentile,
    )

    pdf_pages.close()
    log.info("Done!")
Example #4
def main(library_index: int,
         manifest_file: str,
         debug: bool = False,
         log_file: str = None):
    create_logger(debug=debug, log_file=log_file)
    config = get_config()

    log.debug(f"Reading manifest from {manifest_file}")
    manifest = Manifest.from_file(Path(manifest_file))

    # task array is 1-indexed
    library = manifest.get_library(library_index - 1)

    if not library.gen_downsampling:
        log.debug("Downsampling not requested, nothing to do")
        return

    log.info(f"Downsampling alignments for library {library.name}")
    library.downsample_dir.mkdir(exist_ok=True, parents=True)

    if library.run_barcodematching:
        # this will be much faster on matched bams
        downsample_input = library.matched
        downsample_tag = "XB"
    else:
        downsample_input = library.merged
        downsample_tag = "XC"

    downsample_output = []

    # Progressively downsample the BAM from largest to smallest
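    # each step reuses the previous (already reduced) output as its input,
    # so only the first iteration has to scan the full-size BAM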
    input_bam = downsample_input.bam
    for n in range(9, 0, -1):
        ratio = n / 10

        downsampled_bam = downsample_input.downsampled_bam(ratio)
        downsample_output.append(
            downsample_dge(
                config=config,
                bam_file=input_bam,
                downsampled_bam=downsampled_bam,
                cell_tag=downsample_tag,
                library=library,
                ratio=ratio,
                tmp_dir=manifest.tmp_dir,
            ))

        if input_bam != downsample_input.bam:
            os.remove(input_bam)
        input_bam = downsampled_bam

    # remove final downsampled_bam
    if input_bam != downsample_input.bam:
        os.remove(input_bam)

    plot_downsampling(
        downsample_output,
        downsample_input.digital_expression_summary,
        downsample_input.downsampling_pdf,
    )

    log.debug("Setting group permissions")
    give_group_access(library.dir)
    if config.gs_path is not None:
        log.debug("Copying data to google storage")
        rsync_to_google(library.dir, config.gs_path / library.date_name)

    log.info(f"Downsampling for {library} complete")
Example #5
def main(library_index: int,
         manifest_file: str,
         debug: bool = False,
         log_file: str = None):
    create_logger(debug=debug, log_file=log_file)
    config = get_config()

    log.debug(f"Reading manifest from {manifest_file}")
    manifest = Manifest.from_file(Path(manifest_file))

    # task array is 1-indexed
    library = manifest.get_library(library_index - 1)

    log.debug(f"Processing alignments for library {library.name}")

    # define barcode matching files, if needed
    barcode_matching_file = (library.barcode_matching_dir /
                             f"{library}_barcode_matching.txt.gz")
    barcode_coordinate_file = (library.barcode_matching_dir /
                               f"{library}_barcode_xy.txt.gz")
    matched_barcodes_file = (library.barcode_matching_dir /
                             f"{library}_matched_barcodes.txt.gz")

    # Combine check_alignments_quality files, and plot histograms
    alignment_quality.combine_alignment_stats(library)

    # Merge bam files
    cmd = config.picard_cmd("MergeSamFiles", manifest.tmp_dir)
    cmd.extend([
        "--CREATE_INDEX",
        "true",
        "--CREATE_MD5_FILE",
        "false",
        "--OUTPUT",
        library.merged.bam,
        "--SORT_ORDER",
        "coordinate",
        "--ASSUME_SORTED",
        "true",
    ])

    for bam_file in library.processed_bams:
        cmd.extend(["--INPUT", bam_file])

    run_command(cmd, "MergeBamFiles", library)

    # generate various metrics files, including digital expression matrix
    calc_alignment_metrics(config, library.merged, library, manifest.tmp_dir)

    if library.run_barcodematching:
        library.barcode_matching_dir.mkdir(exist_ok=True, parents=True)
        shutil.copy(library.bead_barcodes, library.barcode_matching_dir)
        shutil.copy(library.bead_locations, library.barcode_matching_dir)

        barcode_list, barcode_mapping, bead_xy, _ = bead_matching.match_barcodes(
            library.merged.selected_cells, library.bead_barcodes,
            library.bead_locations)

        bead_matching.write_barcode_mapping(barcode_mapping, bead_xy,
                                            barcode_matching_file)
        bead_matching.write_barcode_xy(barcode_list, bead_xy,
                                       barcode_coordinate_file)

        with gzip.open(matched_barcodes_file, "wt") as out:
            for bead_bc in sorted(set(barcode_mapping.values())):
                print(bead_bc, file=out)

        # subset to the matched beads and add combined barcode as XB tag
        write_retagged_bam(library.merged.bam, library.matched.bam,
                           barcode_mapping)

        # do it all again, but should be faster on the smaller file
        calc_alignment_metrics(
            config,
            library.matched,
            library,
            manifest.tmp_dir,
            matched_barcodes_file,
            "XB",
        )

        write_sparse_matrix(library)

        make_library_plots(library, bead_xy)
    else:
        make_library_plots(library)

    # remove unneeded files now that we're done
    for bam_file in library.processed_bams:
        log.debug(f"Removing {bam_file}")
        os.remove(bam_file)

    if matched_barcodes_file.exists():
        log.debug(f"Removing {matched_barcodes_file}")
        os.remove(matched_barcodes_file)

    log.debug("Setting group permissions")
    give_group_access(library.dir)
    if config.gs_path is not None:
        log.debug("Copying data to google storage")
        rsync_to_google(library.dir, config.gs_path / library.date_name)

    log.info(f"Processing for {library} complete")
Example #6
def main(
    genome_name: str,
    reference_fasta: str,
    reference_gtf: str,
    mt_sequence: str,
    filter_biotypes: list[str],
    overwrite: bool = False,
    debug: bool = False,
    dryrun: bool = False,
    log_file: str = None,
):
    create_logger(debug=debug, dryrun=dryrun, log_file=log_file)
    env_name = slideseq.util.get_env_name()
    log.debug(f"Running in Conda env {env_name}")
    config = get_config()

    reference_fasta = Path(reference_fasta)
    reference_gtf = Path(reference_gtf)
    output_dir = config.reference_dir / genome_name
    star_dir = output_dir / "STAR"

    log.info(f"Building reference for genome {genome_name}")

    if output_dir.exists():
        if overwrite:
            log.warning(f"{output_dir} exists, overwriting existing reference")
            if not dryrun:
                if star_dir.exists():
                    log.debug(f"Removing and remaking {star_dir}")
                    shutil.rmtree(star_dir)
                    star_dir.mkdir()
                if (output_dir / f"{genome_name}.dict").exists():
                    log.debug(f"Removing {output_dir}/{genome_name}.dict")
                    os.remove(output_dir / f"{genome_name}.dict")
        else:
            log.error(
                f"{star_dir} already exists and overwrite=False, aborting")
            sys.exit(1)
    else:
        log.info(f"Creating output directory {star_dir}")
        if not dryrun:
            star_dir.mkdir(parents=True)

    if check_gtf(reference_gtf, mt_sequence):
        log.info("GTF has all required fields")
    else:
        log.error("Need to fix GTF errors")
        return

    log.info(f"Creating genome reference for {reference_fasta}")

    # this script will create a reference for slideseq
    with importlib.resources.path(slideseq.scripts,
                                  "build_reference.sh") as qsub_script:
        mkref_args = qsub_args(
            log_file=output_dir / "build_reference.log",
            CONDA_ENV=env_name,
            PICARD_JAR=config.picard,
            DROPSEQ_DIR=config.dropseq_dir,
            GENOME_NAME=genome_name,
            REFERENCE_FASTA=reference_fasta,
            REFERENCE_GTF=reference_gtf,
            OUTPUT_DIR=output_dir,
            MT_SEQUENCE=mt_sequence,
            FILTERED_BIOTYPES=" ".join(f"G={biotype}"
                                       for biotype in filter_biotypes),
        )
        mkref_args.append(f"{qsub_script.absolute()}")

        log.debug(f"Build-reference command:\n\t{' '.join(mkref_args)}")
        if dryrun:
            return

        # qsub may sporadically fail due to network issues
        for _ in range(constants.MAX_QSUB):
            proc = run(mkref_args, capture_output=True, text=True)
            if proc.returncode != 0:
                log.warning("qsub failed, retrying")
                log.debug(f"Error: {proc.stderr}")
            else:
                break
        else:
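            # for-else: this branch runs only if the loop finished without a
            # break, i.e. every qsub attempt failed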
            log.error(f"Unable to launch build_ref job for {reference_fasta}")
Example #7
def main(
    fastq_r1,
    fastq_r2,
    barcodes,
    locations,
    tag_sequence,
    output_pdf,
    extra_pdf=None,
    debug=False,
    percentile=95.0,
):
    """
    This script generates some plots for mapping barcoded reads.

    Reads sequences from FASTQ_R1 and FASTQ_R2. Assumes that the first read
    contains a 14bp barcode split across two locations, along with a 9bp UMI.
    The second read is assumed to have TAG_SEQUENCE in bases 20-40.
    """
    create_logger(debug, dryrun=False)

    output_pdf = Path(output_pdf)

    log.debug(f"Reading from {fastq_r1}")
    with gzip.open(fastq_r1, "rt") as fh:
        r1_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading from {fastq_r2}")
    with gzip.open(fastq_r2, "rt") as fh:
        r2_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading {barcodes}")
    with open(barcodes) as fh:
        raw_bcs = ["".join(line.strip().split(",")) for line in fh]

    log.debug(f"Reading {locations}")
    with open(locations) as fh:
        x = np.array([float(v) for v in fh.readline().strip().split(",")])
        y = np.array([float(v) for v in fh.readline().strip().split(",")])
        xy = np.vstack((x, y)).T

    if extra_pdf is not None:
        extra_pdf_pages = PdfPages(extra_pdf)

        umi_counts = Counter(r[32:41] for r in r1_reads)
        log.debug(
            f"Found {len(umi_counts)} UMIs with {sum(umi_counts.values())} total counts"
        )

        plot_log_hist(umi_counts.values(), "Reads per UMI", extra_pdf_pages)
    else:
        extra_pdf_pages = None

    # pre-emptively remove poly-T/N sequences
    ok_barcodes = [not set(bc).issubset({"T", "N"}) for bc in raw_bcs]
    xy = xy[ok_barcodes, :]
    bead_barcodes = [bc for ok, bc in zip(ok_barcodes, raw_bcs) if ok]

    log.info(f"Read {len(raw_bcs)} barcodes and filtered to {len(bead_barcodes)}")

    seq_barcodes = sorted(r1[:8] + r1[26:32] for r1 in r1_reads)
    # remove poly-T sequence if present
    seq_barcodes = [seq for seq in seq_barcodes if set(seq) != {"T"}]

    log.info(f"Found {len(set(seq_barcodes))} unique barcodes in sequencing data")

    log.info("Computing barcode matching")

    log.debug("Computing radius neighbor graph")
    # adjacency matrix for all beads within radius of each other
    radius_matrix = radius_neighbors_graph(xy, radius=10.0)

    log.debug("Computing hamming neighbor graph")
    # adjacency matrix for all barcodes within hamming distance 1
    hamming_matrix = hamming1_adjacency(bead_barcodes)

    # just multiply together to get the combined adjacency matrix!
    combined_graph = nx.from_scipy_sparse_matrix(radius_matrix.multiply(hamming_matrix))

    # add xy coordinates to graph so we can analyze later
    for n, (x, y) in zip(combined_graph.nodes, xy):
        combined_graph.nodes[n]["x"] = x
        combined_graph.nodes[n]["y"] = y

    # get connected components to find groups of similar/close barcodes
    bead_groups = list(nx.connected_components(combined_graph))

    # calculate degenerate (ambiguous bases -> N) barcodes
    degen_bead_barcodes = [
        degen_barcode({bead_barcodes[j] for j in bg}) for bg in bead_groups
    ]

    log.debug(
        f"Collapsed {len(bead_groups)} bead groups into"
        f" {len(set(degen_bead_barcodes))} barcodes"
    )

    # average xy for grouped beads to get centroids
    bead_xy = dict()
    for bg, degen_bc in zip(bead_groups, degen_bead_barcodes):
        bg_graph = combined_graph.subgraph(bg)
        mean_x, mean_y = np.array(
            [[nd["x"], nd["y"]] for _, nd in bg_graph.nodes(data=True)]
        ).mean(0)
        bead_xy[degen_bc] = (mean_x, mean_y)

    barcode_matching = bipartite_matching(
        bead_barcodes, degen_bead_barcodes, bead_groups, seq_barcodes
    )

    if extra_pdf is not None:
        tag_barcodes = [r2[20:40] for r2 in r2_reads]
        tag_counts = Counter(tag_barcodes)

        n_matched = sum(
            1 for r1 in r1_reads if (r1[:8] + r1[26:32]) in barcode_matching
        )
        log.debug(f"{n_matched} reads matched to a bead barcode")

        umis_per_tag = defaultdict(set)
        for r1, r2 in zip(r1_reads, r2_reads):
            umis_per_tag[r2[20:40]].add(r1[32:41])

        plot_log_hist(tag_counts.values(), "Reads per tag", extra_pdf_pages)
        plot_log_hist(
            list(map(len, umis_per_tag.values())), "UMIs per tag", extra_pdf_pages
        )

    log.debug(f"Counting UMIs and reads per bead for sequence {tag_sequence}")
    reads_per_umi_per_bead = defaultdict(Counter)
    umis_per_bead = defaultdict(set)
    reads_per_bead = Counter()

    for r1, r2 in zip(r1_reads, r2_reads):
        seq_bc = r1[:8] + r1[26:32]

        if seq_bc not in barcode_matching:
            continue
        if r2[20:40] != tag_sequence:
            continue

        bead_bc = barcode_matching[seq_bc]
        umi = r1[32:41]

        reads_per_umi_per_bead[bead_bc][umi] += 1
        umis_per_bead[bead_bc].add(umi)
        reads_per_bead[bead_bc] += 1

    filtered_barcodes = [bc for bc in degen_bead_barcodes if umis_per_bead[bc]]
    bead_xy_a = np.vstack([bead_xy[dbc] for dbc in filtered_barcodes])

    with gzip.open(output_pdf.with_suffix(".reads_per_umi.txt.gz"), "wt") as out:
        print("bead_barcodes\tumi\treads", file=out)
        for bc in filtered_barcodes:
            for umi in reads_per_umi_per_bead[bc][umi]:
                print(f"{bc}\t{umi}\t{reads_per_umi_per_bead[bc][umi]}", file=out)

    with output_pdf.with_suffix(".txt").open("w") as out:
        print("bead_barcode\tumis\treads", file=out)
        for bc in filtered_barcodes:
            print(f"{bc}\t{len(umis_per_bead[bc])}\t{reads_per_bead[bc]}", file=out)

    if extra_pdf is not None:
        plot_log_hist(
            [len(umis_per_bead[bc]) for bc in filtered_barcodes],
            "UMIs per bead",
            extra_pdf_pages,
        )

        extra_pdf_pages.close()

    pdf_pages = PdfPages(output_pdf)

    log.info("Making plots")
    spatial_plot(
        bead_xy_a,
        [len(umis_per_bead[bc]) for bc in filtered_barcodes],
        "UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [np.log10(len(umis_per_bead[bc])) for bc in filtered_barcodes],
        "log10 UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [reads_per_bead[bc] for bc in filtered_barcodes],
        "Reads per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [np.log10(1 + reads_per_bead[bc]) for bc in filtered_barcodes],
        "log10 reads per bead",
        pdf_pages,
        pct=percentile,
    )

    pdf_pages.close()
    log.info("Done!")
Example #8
def main(
    runs: list[str],
    demux: bool = True,
    align: bool = True,
    processing: bool = True,
    dryrun: bool = False,
    debug: bool = False,
    log_file: str = None,
):
    """
    Submit each RUN to the Slide-seq alignment pipeline.

    See README.md for instructions and requirements: github.com/MacoskoLab/slideseq-tools
    """

    create_logger(debug=debug, dryrun=dryrun, log_file=log_file)
    env_name = get_env_name()
    log.debug(f"Running in conda env {env_name}")
    config = get_config()

    # demultiplexing without aligning is not useful, so --no-align implies --no-demux
    if demux and not align:
        log.debug("Assuming --no-align should also set --no-demux")
        demux = False

    log.debug("Fetching Google credentials")
    google_creds = gutil.get_secrets_manager_credentials(config.gsecret_name)

    log.debug("Setting up Google Drive service")
    drive_service = gutil.get_service(google_creds)

    # get a pandas DataFrame of the worksheet
    log.debug(
        f"Downloading Google Sheet, id={config.gsheet_id} worksheet={config.worksheet}"
    )
    worksheet_df = gutil.GoogleSheet(drive_service,
                                     config.gsheet_id)[config.worksheet]
    worksheet_df = worksheet_df.dropna(axis=0, how="all")

    log.debug(
        f"Retreived worksheet {config.worksheet} with {len(worksheet_df)} rows"
    )

    log.info(f"Beginning submission for {len(runs)} runs")
    submitted = set()
    run_errors = set()

    for run_name in runs:
        run_df = worksheet_df.loc[worksheet_df["run_name"] == run_name]

        if not len(run_df):
            log.warning(
                f"{run_name} not found in worksheet; please add to sheet.")
            run_errors.add(run_name)
            continue

        log.debug(f"Found {len(run_df)} libraries in worksheet")

        # subset to metadata columns
        run_df = run_df[constants.METADATA_COLS]
        run_df.columns = [c.lower() for c in constants.METADATA_COLS]

        if not validate_run_df(run_name, run_df):
            run_errors.add(run_name)
            continue

        # convert columns to desired types
        run_df = run_df.astype(constants.METADATA_TYPES)

        # data locations
        output_dir = config.workflow_dir / run_name
        flowcell_dirs = sorted(Path(fd) for fd in set(run_df.bclpath))

        manifest_file = output_dir / "manifest.yaml"
        metadata_file = output_dir / "metadata.csv"

        run_info_list = []

        for flowcell_dir in flowcell_dirs:
            run_info_list.append(get_run_info(flowcell_dir))

        manifest = Manifest(
            run_name=run_name,
            flowcell_dirs=flowcell_dirs,
            workflow_dir=output_dir,
            library_dir=config.library_dir,
            metadata_file=metadata_file,
            metadata=split_sample_lanes(run_df, run_info_list),
            email_addresses=sorted(
                set(e.strip() for v in run_df.email for e in v.split(","))),
        )

        n_libraries = len(list(manifest.libraries))

        if not dryrun:
            log.debug("Creating output directories")
            output_dir.mkdir(exist_ok=True)

            manifest.log_dir.mkdir(exist_ok=True)
            if list(manifest.log_dir.glob("*.log")):
                log.warning(
                    "Log files already exist for this job, new output will be appended"
                )

            manifest.library_dir.mkdir(exist_ok=True)
            manifest.tmp_dir.mkdir(exist_ok=True)

            log.debug(f"Writing manifest to {manifest_file}")
            if metadata_file.exists():
                log.info(f"Overwriting metadata file {metadata_file}")
            manifest.to_file(manifest_file)

            if demux:
                # make various directories
                prepare_demux(run_info_list, manifest)
            elif not validate_demux(manifest):
                # appears that demux was not run previously
                run_errors.add(run_name)
                continue
            elif not (align or validate_alignment(manifest, n_libraries)):
                # appears that alignment was not run and wasn't requested
                run_errors.add(run_name)
                continue

        demux_jids = dict()

        # this script will check the sequencing directory, extract barcodes,
        # and demultiplex to BAM files
        with importlib.resources.path(slideseq.scripts,
                                      "demultiplex.sh") as qsub_script:
            for run_info in run_info_list:
                demux_args = qsub_args(
                    log_file=manifest.log_dir / run_info.demux_log,
                    email=",".join(manifest.email_addresses),
                    PICARD_JAR=config.picard,
                    TMP_DIR=manifest.tmp_dir,
                    FLOWCELL=run_info.flowcell,
                    BASECALLS_DIR=run_info.basecall_dir,
                    READ_STRUCTURE=run_info.read_structure,
                    OUTPUT_DIR=output_dir,
                )
                demux_args.extend([
                    "-t",
                    f"{min(run_info.lanes)}-{max(run_info.lanes)}",
                    f"{qsub_script.absolute()}",
                ])

                if demux:
                    demux_jids[run_info.flowcell] = attempt_qsub(
                        demux_args, run_name, "demultiplex", dryrun)
                    if demux_jids[run_info.flowcell] is None:
                        run_errors.add(run_name)
                        continue
                else:
                    demux_jids[run_info.flowcell] = None
                    log.debug("Skipping demux step")

        if run_name in run_errors:
            continue

        alignment_jids = dict()

        # this script processes/filters the extracted uBAMs and aligns them to the
        # specified reference. this depends on previous jobs per lane, so we use
        # -hold_jid on the lane-specific demux job
        with importlib.resources.path(slideseq.scripts,
                                      "alignment.sh") as qsub_script:
            for run_info in run_info_list:
                for lane in run_info.lanes:
                    if demux and demux_jids[run_info.flowcell] is None:
                        log.debug(
                            f"Not aligning {lane} because demux was not submitted"
                        )
                        continue

                    alignment_args = qsub_args(
                        log_file=manifest.log_dir /
                        run_info.alignment_log(lane),
                        email=",".join(manifest.email_addresses),
                        debug=debug,
                        CONDA_ENV=env_name,
                        FLOWCELL=run_info.flowcell,
                        LANE=lane,
                        MANIFEST=manifest_file,
                    )

                    if demux:
                        alignment_args.extend([
                            "-hold_jid",
                            f"{demux_jids[run_info.flowcell]}[{lane}]"
                        ])

                    alignment_args.extend([
                        "-t", f"1-{n_libraries}", f"{qsub_script.absolute()}"
                    ])

                    if align:
                        alignment_jids[run_info.flowcell, lane] = attempt_qsub(
                            alignment_args, run_name, "alignment", dryrun)
                        if alignment_jids[run_info.flowcell, lane] is None:
                            run_errors.add(run_name)
                    else:
                        alignment_jids[run_info.flowcell, lane] = None
                        log.info("Skipping alignment step")

        if run_name in run_errors:
            continue

        # this script analyzes the alignment output, generates plots, matches to puck, etc
        # this is per-library, which means each library needs to wait on the alignment jobs
        # for the relevant lane(s) that contain that library. We're going to wait on all the
        # lanes, but use hold_jid_ad to wait on only that library's alignments
        with importlib.resources.path(slideseq.scripts,
                                      "processing.sh") as qsub_script:
            if align and any(alignment_jids[run_info.flowcell, lane] is None
                             for run_info in run_info_list
                             for lane in run_info.lanes):
                log.debug(
                    "Not processing because some alignments were not submitted"
                )
                continue

            processing_args = qsub_args(
                log_file=manifest.log_dir / "processing.$TASK_ID.log",
                email=",".join(manifest.email_addresses),
                debug=debug,
                CONDA_ENV=env_name,
                MANIFEST=manifest_file,
            )

            if align:
                for run_info in run_info_list:
                    for lane in run_info.lanes:
                        processing_args.extend([
                            "-hold_jid_ad",
                            f"{alignment_jids[run_info.flowcell, lane]}",
                        ])

            processing_args.extend(
                ["-t", f"1-{n_libraries}", f"{qsub_script.absolute()}"])

            if processing:
                processing_jid = attempt_qsub(processing_args, run_name,
                                              "processing", dryrun)
                if processing_jid is None:
                    run_errors.add(run_name)
            else:
                processing_jid = None
                log.info("Skipping processing step")

        if run_name in run_errors:
            continue

        # this (optional) script will downsample the alignment output and plot the results
        # this is per-library, which means each library needs to wait on the processing jobs
        # for the library
        with importlib.resources.path(slideseq.scripts,
                                      "downsampling.sh") as qsub_script:
            if processing and processing_jid is None:
                log.debug(
                    "Not downsampling because processing job was not submitted"
                )
                continue

            downsample_args = qsub_args(
                log_file=manifest.log_dir / "downsampling.$TASK_ID.log",
                email=",".join(manifest.email_addresses),
                debug=debug,
                CONDA_ENV=env_name,
                MANIFEST=manifest_file,
            )

            if processing:
                downsample_args.extend(["-hold_jid_ad", f"{processing_jid}"])

            downsample_args.extend(
                ["-t", f"1-{n_libraries}", f"{qsub_script.absolute()}"])

            downsample_jid = attempt_qsub(downsample_args, run_name,
                                          "downsampling", dryrun)
            if downsample_jid is None:
                run_errors.add(run_name)
            else:
                submitted.add(run_name)

    if submitted and not dryrun:
        log.info(f"Flowcells {', '.join(submitted)} submitted for processing")
    else:
        log.info("No flowcells submitted for processing")

    if run_errors:
        log.info(
            f"Flowcells {', '.join(run_errors)} had errors -- see warnings above."
        )
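
attempt_qsub, used throughout this example, wraps submission with the same retry loop as the reference-building script in Example #6. A minimal sketch, assuming qsub is invoked with -terse so stdout is just the job id, and reusing the module-level log and constants objects seen above (the real helper may instead parse qsub's full "Your job ..." message):

from subprocess import run


def attempt_qsub(qsub_arg_list, run_name, job_name, dryrun):
    log.debug(f"{job_name} command:\n\t{' '.join(map(str, qsub_arg_list))}")
    if dryrun:
        return "DRYRUN"

    # qsub may sporadically fail due to network issues, so retry a few times
    for _ in range(constants.MAX_QSUB):
        proc = run(list(map(str, qsub_arg_list)), capture_output=True, text=True)
        if proc.returncode == 0:
            # with -terse, qsub prints the bare job id (array jobs add a suffix)
            return proc.stdout.strip().split(".")[0]
        log.warning(f"qsub failed for {run_name} {job_name}, retrying")
        log.debug(f"Error: {proc.stderr}")

    log.error(f"Unable to submit {job_name} job for {run_name}")
    return None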