예제 #1
0
def main(assay: Assay, orig_fastq_dirs: Sequence[Path], adj_fastq_dir: Path,
         threads: int):
    """Run ``trim_reads`` on every FASTQ pair using a pool of worker processes.

    The FASTQ pairs come from ``adj_fastq_dir`` when the assay reports that
    barcode adjustment was performed, and from every directory in
    ``orig_fastq_dirs`` otherwise. Each pair gets its own numbered output
    subdirectory (``OUTPUT_PATH / "1"``, ``OUTPUT_PATH / "2"``, ...).

    :param assay: assay descriptor; ``barcode_adj_performed`` and
        ``barcode_adj_r1_r2`` select where the FASTQ pairs are looked up
    :param orig_fastq_dirs: directories holding the original FASTQ files
    :param adj_fastq_dir: directory holding barcode-adjusted FASTQ files
    :param threads: maximum number of worker processes
    :raises Exception: whatever ``trim_reads`` raised in a worker process
    """
    fastq_pairs: Iterable[Sequence[Path]]
    if assay.barcode_adj_performed:
        if assay.barcode_adj_r1_r2:
            fastq_pairs = find_grouped_fastq_files(adj_fastq_dir, 2)
        else:
            # The result of find_adj_fastq_files is used as one single pair
            fastq_pairs = [find_adj_fastq_files(adj_fastq_dir)]
    else:
        fastq_pairs = chain.from_iterable(
            find_grouped_fastq_files(fastq_dir, 2)
            for fastq_dir in orig_fastq_dirs)

    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = []
        for i, (r1_fastq_file, r2_fastq_file) in enumerate(fastq_pairs, 1):
            subdir = OUTPUT_PATH / str(i)
            subdir.mkdir(exist_ok=True, parents=True)
            future = executor.submit(
                trim_reads,
                r1_fastq_file,
                r2_fastq_file,
                subdir,
            )
            futures.append(future)
        wait(futures)
        # wait() only blocks until completion; it never re-raises worker
        # errors. Ask each future for its result so that a failed trimming
        # job aborts this step instead of being silently ignored.
        for future in futures:
            future.result()
def main(
    assay: Assay,
    orig_fastq_dirs: Sequence[Path],
    trimmed_fastq_dir: Path,
    expected_cell_count: Optional[int],
    threads: Optional[int],
):
    """Build one Salmon command line covering all FASTQ pairs and run it.

    The base command comes from ``SALMON_COMMAND``, formatted with the
    assay's ``salmon_option`` and the thread count. Assay-specific options
    and one ``-1 <R1> -2 <R2>`` group per FASTQ pair are appended before
    the command is executed.

    :param assay: assay descriptor controlling the Salmon options
    :param orig_fastq_dirs: original input directories; used to look up the
        Slide-seq barcode file and an expected cell count
    :param trimmed_fastq_dir: directory searched for the FASTQ pairs
    :param expected_cell_count: fallback cell count, used when
        ``read_expected_cell_count`` returns a falsy value
    :param threads: thread count for Salmon; falsy values are treated as 1
    :raises ValueError: if no FASTQ files are found, or if the assay is
        Slide-seq and ``orig_fastq_dirs`` does not contain exactly one entry
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    threads = threads or 1
    command = [
        piece.format(
            salmon_option=assay.salmon_option,
            threads=threads,
        ) for piece in SALMON_COMMAND
    ]

    fastq_pairs: Iterable[Sequence[Path]]
    if assay.barcode_adj_performed:
        if assay.barcode_adj_r1_r2:
            fastq_pairs = list(find_grouped_fastq_files(trimmed_fastq_dir, 2))
        else:
            # NOTE(review): assumes find_adj_fastq_files yields (R1, R2)
            # pairs rather than returning a single pair -- confirm
            fastq_pairs = list(find_adj_fastq_files(trimmed_fastq_dir))
    else:
        fastq_pairs = list(find_grouped_fastq_files(trimmed_fastq_dir, 2))

    if not fastq_pairs:
        raise ValueError("No FASTQ files found")

    if assay.keep_all_barcodes:
        command.extend(["--keepCBFraction", "1"])
    # hack
    if assay == Assay.SLIDESEQ:
        # Don't support multiple input directories for Slide-seq; this will
        # likely cause significantly incorrect results due to barcode overlap
        # between multiple input data sets
        if len(orig_fastq_dirs) != 1:
            raise ValueError("Need exactly 1 input directory for Slide-seq")
        barcode_file = find_slideseq_barcode_file(orig_fastq_dirs[0])
        command.extend(["--whitelist", fspath(barcode_file)])

    # A count read from the input directories takes precedence over the
    # explicitly supplied one
    maybe_cell_count = read_expected_cell_count(
        orig_fastq_dirs) or expected_cell_count
    if maybe_cell_count is not None:
        command.extend(["--forceCells", str(maybe_cell_count)])

    for r1_fastq_file, r2_fastq_file in fastq_pairs:
        fastq_extension = [
            "-1",
            r1_fastq_file,
            "-2",
            r2_fastq_file,
        ]
        command.extend(fastq_extension)

    print("Running:", " ".join(str(x) for x in command))
    env = environ.copy()
    # Necessary for Singularity; this environment variable isn't
    # set by that container runtime but is required to run Salmon
    env["LD_LIBRARY_PATH"] = "/usr/local/lib"
    # The adjusted environment must be handed to the child process
    # explicitly; otherwise the override above has no effect.
    check_call(command, env=env)
예제 #3
0
def main(threads: int, directory: Path):
    """Run the Salmon command once per FASTQ pair and tag its outputs.

    For every (R1, R2) pair found in ``directory``, this formats
    ``SALMON_COMMAND`` with the thread count, appends the two FASTQ paths,
    runs it, runs ``TAR_AND_ZIP_COMMAND``, and then renames the files in
    ``out/`` so that they carry the sample ID derived from the R1 file.

    :param threads: thread count substituted into ``SALMON_COMMAND``
    :param directory: directory searched for FASTQ pairs
    """
    fastq_groups = find_grouped_fastq_files(directory, 2)
    for r1_fastq_file, r2_fastq_file in fastq_groups:
        command = [piece.format(threads=threads) for piece in SALMON_COMMAND]
        command += [
            '-1',
            fspath(r1_fastq_file),
            '-2',
            fspath(r2_fastq_file),
        ]
        print('Running:', command)
        check_call(command)

        # Tar and zip auxiliary files
        check_call(TAR_AND_ZIP_COMMAND)

        sample_id = get_sample_id_from_r1(r1_fastq_file)

        # Tag output files with sample_id
        renames = [
            ("out/quant.sf", f"out/{sample_id}-quant.sf"),
            ("out/cmd_info.json", f"out/{sample_id}-cmd_info.json"),
            ("out/aux_files.tar.gz", f"out/{sample_id}-aux_files.tar.gz"),
        ]
        for source, destination in renames:
            rename_file(source, destination)
def main(directories: Iterable[Path], assay: Assay):
    """Decompress every FASTQ group and append it to the merged outputs.

    Groups of ``assay.fastq_count`` files are looked up in each directory.
    Each file of a group is passed to ``decompress_concat_fastq`` together
    with the merged destination that matches its position in the group.

    :param directories: directories searched for FASTQ groups
    :param assay: assay descriptor; ``fastq_count`` is the group size
    :raises ValueError: if a group holds neither 2 nor 3 files
    """
    CONCAT_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
    for directory in directories:
        groups = find_grouped_fastq_files(directory, assay.fastq_count)
        for fastqs_list in groups:
            num_fastqs = len(fastqs_list)
            if num_fastqs == 3:
                # For assays that use three files, flipping R2 and R3 here
                # is deliberate. For the input data, barcodes are in R2,
                # and R1 and R3 are the two reads which should be aligned
                # to the genome. Internally in this pipeline, it's more
                # straightforward to rename these:
                # R1 -> R1, R2 -> barcode, R3 -> R2
                destinations = (
                    MERGED_FASTQ_R1,
                    MERGED_FASTQ_BARCODE,
                    MERGED_FASTQ_R2,
                )
            elif num_fastqs == 2:
                destinations = (MERGED_FASTQ_R1, MERGED_FASTQ_R2)
            else:
                raise ValueError(
                    f"Could not unzip and concatenate fastqs becuase there are {num_fastqs} of them"
                )

            # Files are processed in group order, paired with destinations
            for fastq, destination in zip(fastqs_list, destinations):
                decompress_concat_fastq(fastq, destination)
def main(assay: Assay, fastq_dirs: Iterable[Path], output_filename_prefix: str,
         output_dir: Path):
    """Pass every FASTQ pair through ``convert_fastq``.

    Pairs are looked up in each directory of ``fastq_dirs`` in turn, in
    groups of ``assay.fastq_count`` files, and each group is unpacked into
    exactly two files (R1 and R2).

    :param assay: assay descriptor; ``fastq_count`` is the group size
    :param fastq_dirs: directories searched for FASTQ groups
    :param output_filename_prefix: prefix of both output file names
    :param output_dir: directory that receives the output files
    """
    for fastq_dir in fastq_dirs:
        pairs = find_grouped_fastq_files(fastq_dir, assay.fastq_count)
        for fastq_r1, fastq_r2 in pairs:
            # NOTE(review): every pair targets the same two output paths;
            # presumably convert_fastq appends rather than overwrites --
            # confirm against its implementation
            r1_output = output_dir / f"{output_filename_prefix}_R1.fastq"
            convert_fastq(fastq_r1, r1_output)
            r2_output = output_dir / f"{output_filename_prefix}_R2.fastq"
            convert_fastq(fastq_r2, r2_output)
예제 #6
0
def main(assay: Assay, orig_fastq_dir: Iterable[Path], adj_fastq_dir: Path,
         threads: int):
    """Build one Salmon command line covering all FASTQ pairs and run it.

    The base command comes from ``SALMON_COMMAND``, formatted with the
    assay's ``salmon_option`` and the thread count. FASTQ pairs are taken
    from ``adj_fastq_dir`` when the assay reports that barcode adjustment
    was performed, and from every directory in ``orig_fastq_dir`` otherwise.
    One ``-1 <R1> -2 <R2>`` group per pair is appended to the command.

    :param assay: assay descriptor controlling options and FASTQ lookup
    :param orig_fastq_dir: directories holding the original FASTQ files
    :param adj_fastq_dir: directory holding barcode-adjusted FASTQ files
    :param threads: thread count substituted into ``SALMON_COMMAND``
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    command = [
        piece.format(
            salmon_option=assay.salmon_option,
            threads=threads,
        ) for piece in SALMON_COMMAND
    ]

    fastq_pairs: List[Sequence[Path]]
    if assay.barcode_adj_performed:
        if assay.barcode_adj_r1_r2:
            fastq_pairs = list(find_grouped_fastq_files(adj_fastq_dir, 2))
        else:
            # The result of find_adj_fastq_files is used as one single pair
            fastq_pairs = [find_adj_fastq_files(adj_fastq_dir)]
    else:
        fastq_pairs = list(
            chain.from_iterable(
                find_grouped_fastq_files(fastq_dir, 2)
                for fastq_dir in orig_fastq_dir))

    if assay.keep_all_barcodes:
        command.extend(['--keepCBFraction', '1'])

    for r1_fastq_file, r2_fastq_file in fastq_pairs:
        fastq_extension = [
            '-1',
            r1_fastq_file,
            '-2',
            r2_fastq_file,
        ]
        command.extend(fastq_extension)

    print('Running:', ' '.join(str(x) for x in command))
    env = environ.copy()
    # Necessary for Singularity; this environment variable isn't
    # set by that container runtime but is required to run Salmon
    env['LD_LIBRARY_PATH'] = '/usr/local/lib'
    # The adjusted environment must be handed to the child process
    # explicitly; otherwise the override above has no effect.
    check_call(command, env=env)
def main(
    fastq_dirs: Iterable[Path],
    output_dir: Path = Path(),
    barcode_filename: Path = BARCODE_ALLOWLIST_FILE,
    n6_dt_mapping_file: Path = N6_DT_MAPPING_FILE,
):
    """Correct cell barcodes and write the usable reads to two FASTQ files.

    Every (transcript, barcode + UMI) FASTQ pair found in ``fastq_dirs`` is
    read in lockstep. The barcode read is cut into ``BARCODE_SEGMENTS``, and
    each piece is passed through ``revcomp`` and then corrected against the
    allowlist (edit distance 1). A read pair is written only when every
    piece could be corrected. Counts are printed per input pair.

    :param fastq_dirs: directories searched for FASTQ pairs
    :param output_dir: directory that receives both output FASTQ files
    :param barcode_filename: file the barcode allowlist is read from
    :param n6_dt_mapping_file: file the mapping applied to the third
        barcode piece is read from
    """
    barcode_allowlist = read_barcode_allowlist(barcode_filename)
    correcter = bu.BarcodeCorrecter(barcode_allowlist, edit_distance=1)

    n6_dt_mapping = read_n6_dt_mapping(n6_dt_mapping_file)

    buf = output_dir / BARCODE_UMI_FASTQ_PATH
    trf = output_dir / TRANSCRIPT_FASTQ_PATH

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, 2) for fastq_dir in fastq_dirs)

    with open(buf, "w") as cbo, open(trf, "w") as tro:
        for transcript_fastq, barcode_umi_fastq in all_fastqs:
            usable_count = 0
            # Stays 0 when the input files contain no reads at all
            i = 0
            print("Correcting barcodes in", transcript_fastq, "and",
                  barcode_umi_fastq)
            transcript_reader = fastq_reader(transcript_fastq)
            barcode_umi_reader = fastq_reader(barcode_umi_fastq)
            for i, (tr, br) in enumerate(
                    zip(transcript_reader, barcode_umi_reader), 1):
                barcode_pieces = [revcomp(br.seq[s]) for s in BARCODE_SEGMENTS]
                rc_corrected = [
                    correcter.correct(barcode) for barcode in barcode_pieces
                ]
                # A falsy entry means that piece could not be corrected
                if all(rc_corrected):
                    # The third piece is additionally translated through
                    # the mapping read from n6_dt_mapping_file
                    rc_corrected[2] = n6_dt_mapping[rc_corrected[2]]
                    corrected = [revcomp(rc_bc) for rc_bc in rc_corrected]
                    usable_count += 1
                    umi_seq = br.seq[UMI_SEGMENT]
                    umi_qual = br.qual[UMI_SEGMENT]
                    new_seq = "".join(corrected + [umi_seq])
                    # Corrected barcodes get placeholder quality values
                    new_qual = BARCODE_QUAL_DUMMY + umi_qual
                    new_br = Read(
                        read_id=br.read_id,
                        seq=new_seq,
                        unused=br.unused,
                        qual=new_qual,
                    )
                    print(tr.serialize(), file=tro)
                    print(new_br.serialize(), file=cbo)

            print("Total count:", i)
            print("Usable count:", usable_count)
            # An empty FASTQ pair leaves i at 0; report the proportion as
            # undefined instead of failing with ZeroDivisionError.
            proportion = usable_count / i if i else float("nan")
            print("Proportion:", proportion)
예제 #8
0
def main(assay: Assay, fastq_dirs: Iterable[Path],
         output_filename_prefix: str, output_dir: Path):
    """Write barcode-adjusted R1/R2 FASTQ files for all input FASTQ groups.

    Groups of ``assay.fastq_count`` files are looked up in every directory
    of ``fastq_dirs`` and unpacked as (FASTQ 1, FASTQ 2, barcode FASTQ).
    The three files are read in lockstep; for each read the barcode pieces
    and UMI are extracted according to the assay, and the result is written
    via ``create_and_output_barcode_adjusted_reads``.

    :param assay: assay descriptor; selects the barcode extraction scheme
    :param fastq_dirs: directories searched for FASTQ groups
    :param output_filename_prefix: prefix of both output file names
    :param output_dir: directory that receives the output files
    :raises ValueError: if a read is encountered for an assay with no
        barcode extraction scheme
    """
    baraddedf1 = output_dir / f"{output_filename_prefix}_R1.fastq"
    baraddedf2 = output_dir / f"{output_filename_prefix}_R2.fastq"

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, assay.fastq_count)
        for fastq_dir in fastq_dirs)

    with open(baraddedf1, "w") as barf1addedout, open(baraddedf2,
                                                      "w") as barf2addedout:
        for fastq1_file, fastq2_file, barcode_file in all_fastqs:
            print("Adding barcodes to", fastq1_file, "and", fastq2_file,
                  "using", barcode_file)
            fastq1_reader = fastq_reader(fastq1_file)
            fastq2_reader = fastq_reader(fastq2_file)
            barcode_reader = fastq_reader(barcode_file)

            for f1r, f2r, bar in zip(fastq1_reader, fastq2_reader,
                                     barcode_reader):
                if assay == Assay.SNARESEQ:
                    # This code assumes the barcode fastq is the third fastq; e.g. '...R3...'
                    barcode_pieces = [
                        revcomp(bar.seq[s]) for s in SNARESEQ_BARCODE_SEGMENTS
                    ]
                    # For some reason the barcode pieces are reversed in the list
                    # so get them back in the right order
                    barcode_pieces.reverse()
                    umi_seq = bar.seq[UMI_SEGMENT]
                elif assay == Assay.SNSEQ:
                    # The barcode is taken from the second read; no UMI
                    barcode_pieces = [f2r.seq[SNSEQ_BARCODE_SEGMENT]]
                    umi_seq = ""
                else:
                    # Without a scheme, barcode_pieces and umi_seq would be
                    # unbound below; fail with a clear error instead of an
                    # UnboundLocalError.
                    raise ValueError(
                        "Could not adjust barcodes for assay {}".format(assay))

                create_and_output_barcode_adjusted_reads(
                    f1r, f2r, umi_seq, barcode_pieces, barf1addedout,
                    barf2addedout)
예제 #9
0
def main(directories: Iterable[Path]):
    """Describe every FASTQ pair as a file bundle and dump them as JSON.

    Each (R1, R2) pair found in ``directories`` becomes a dictionary with a
    ``fastq_r1`` and a ``fastq_r2`` entry. The list of bundles is
    pretty-printed to standard output and written to ``input.json`` in the
    current working directory.

    :param directories: directories searched for FASTQ pairs
    """

    def file_entry(fastq_file):
        # One file description: a fixed "class" marker plus the path string
        return {
            "class": "File",
            "path": fspath(fastq_file),
        }

    sequence_file_bundles = [
        {
            "fastq_r1": file_entry(r1_fastq_file),
            "fastq_r2": file_entry(r2_fastq_file),
        }
        for directory in directories
        for r1_fastq_file, r2_fastq_file in find_grouped_fastq_files(
            directory, 2)
    ]

    print("Sequence file bundles:")
    pprint(sequence_file_bundles)

    with open("input.json", "w") as text_file:
        json.dump(sequence_file_bundles, text_file)