def main(
    fastq_dirs: Iterable[Path],
    output_dir: Path = Path(),
    barcode_filename: Path = BARCODE_ALLOWLIST_FILE,
    n6_dt_mapping_file: Path = N6_DT_MAPPING_FILE,
):
    """Correct barcodes in paired FASTQ files and write the usable read pairs.

    For every (transcript, barcode/UMI) FASTQ pair found under ``fastq_dirs``,
    each slice of the barcode/UMI read listed in ``BARCODE_SEGMENTS`` is
    reverse-complemented and passed through a ``bu.BarcodeCorrecter`` built
    from the allowlist with ``edit_distance=1``. A read pair is kept only if
    every corrected segment is truthy. For kept pairs, the third corrected
    segment is translated through the N6/dT mapping, all segments are
    reverse-complemented back, and a new barcode/UMI read is written whose
    sequence is the corrected segments followed by the original UMI slice.

    Output goes to ``output_dir / BARCODE_UMI_FASTQ_PATH`` and
    ``output_dir / TRANSCRIPT_FASTQ_PATH``; both files are shared by all
    input pairs. Per-pair totals are printed to stdout.

    Args:
        fastq_dirs: Directories searched with ``find_grouped_fastq_files``
            for groups of two FASTQ files.
        output_dir: Directory in which the two output FASTQ files are created.
        barcode_filename: File read by ``read_barcode_allowlist``.
        n6_dt_mapping_file: File read by ``read_n6_dt_mapping``.

    Note:
        The two readers are combined with ``zip``, so iteration stops at the
        end of the shorter file. The mapping lookup is a plain subscript; it
        presumably raises ``KeyError`` for a corrected barcode that is absent
        from the mapping (verify against ``read_n6_dt_mapping``).
    """
    barcode_allowlist = read_barcode_allowlist(barcode_filename)
    correcter = bu.BarcodeCorrecter(barcode_allowlist, edit_distance=1)

    n6_dt_mapping = read_n6_dt_mapping(n6_dt_mapping_file)

    buf = output_dir / BARCODE_UMI_FASTQ_PATH
    trf = output_dir / TRANSCRIPT_FASTQ_PATH

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, 2) for fastq_dir in fastq_dirs)

    with open(buf, "w") as cbo, open(trf, "w") as tro:
        for transcript_fastq, barcode_umi_fastq in all_fastqs:
            usable_count = 0
            # Stays 0 when the pair contains no reads at all.
            total_count = 0
            print("Correcting barcodes in", transcript_fastq, "and",
                  barcode_umi_fastq)
            transcript_reader = fastq_reader(transcript_fastq)
            barcode_umi_reader = fastq_reader(barcode_umi_fastq)
            for total_count, (tr, br) in enumerate(
                    zip(transcript_reader, barcode_umi_reader), 1):
                barcode_pieces = [revcomp(br.seq[s]) for s in BARCODE_SEGMENTS]
                rc_corrected = [
                    correcter.correct(barcode) for barcode in barcode_pieces
                ]
                # Skip the pair unless every segment was corrected to
                # something truthy.
                if not all(rc_corrected):
                    continue

                rc_corrected[2] = n6_dt_mapping[rc_corrected[2]]
                corrected = [revcomp(rc_bc) for rc_bc in rc_corrected]
                usable_count += 1
                umi_seq = br.seq[UMI_SEGMENT]
                umi_qual = br.qual[UMI_SEGMENT]
                new_seq = "".join(corrected + [umi_seq])
                # Corrected barcode bases have no real quality scores, so a
                # fixed placeholder string precedes the UMI qualities.
                new_qual = BARCODE_QUAL_DUMMY + umi_qual
                new_br = Read(
                    read_id=br.read_id,
                    seq=new_seq,
                    unused=br.unused,
                    qual=new_qual,
                )
                print(tr.serialize(), file=tro)
                print(new_br.serialize(), file=cbo)

            # An empty pair must not abort the run with ZeroDivisionError:
            # report an undefined proportion and move on to the next pair.
            proportion = (usable_count / total_count
                          if total_count else float("nan"))
            print("Total count:", total_count)
            print("Usable count:", usable_count)
            print("Proportion:", proportion)
예제 #2
0
def _barcode_umi_read_with_ligation(mapper, transcript_read):
    """Build the barcode/UMI read encoded in one transcript read's ID."""
    fields = transcript_read.read_id.lstrip("@").split("|")
    # Lookups are performed in field order: P7, P5, then the combined
    # "<rt2>_<ligation>" field, and finally the UMI.
    p7_seq = mapper.p7_mapping[fields[2]]
    p5_seq = mapper.p5_mapping[fields[3]]
    rt2_key, ligation_key = fields[4].split("_")
    rt2_seq = mapper.rt2_mapping[rt2_key]
    ligation_seq = mapper.ligation_mapping[ligation_key]

    sequence = "".join((fields[5], p7_seq, p5_seq, rt2_seq, ligation_seq))
    return Read(
        read_id=transcript_read.read_id,
        seq=sequence,
        unused=transcript_read.unused,
        qual=get_base_qual_str(len(sequence)),
    )


def convert(mapper: BarcodeMapper, input_fastq: Path, output_dir: Path,
            basename: str):
    """Write ``<basename>_R2.fastq`` and a synthesized ``<basename>_R1.fastq``.

    The input FASTQ is passed to ``decompress_fastq`` to produce the R2
    (transcript) file. It is then read again, and for every read a
    barcode/UMI read is derived from the ``|``-separated read ID: fields 2
    and 3 are looked up in the mapper's P7 and P5 tables, field 4 is split on
    ``_`` into RT2 and ligation keys, and field 5 is the UMI. The R1 sequence
    is UMI + P7 + P5 + RT2 + ligation, with a quality string from
    ``get_base_qual_str`` of matching length.

    ``output_dir`` is created (including parents) if it does not exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    print("Converting", input_fastq)
    r1_path = output_dir / f"{basename}_R1.fastq"
    r2_path = output_dir / f"{basename}_R2.fastq"
    decompress_fastq(input_fastq, r2_path)

    with smart_open(r1_path, "wt") as r1_out:
        for transcript_read in fastq_reader(input_fastq):
            r1_read = _barcode_umi_read_with_ligation(mapper, transcript_read)
            print(r1_read.serialize(), file=r1_out)
예제 #3
0
def main(assay: Assay, fastq_dirs: Iterable[Path], output_filename_prefix,
         output_dir: Path):
    """Extract barcodes (and UMIs) per read and emit barcode-adjusted reads.

    Every group of FASTQ files found under ``fastq_dirs`` is unpacked into
    three paths (read 1, read 2, barcode) and iterated in lockstep. For each
    read triple the barcode pieces and UMI are taken from an assay-specific
    location and handed, together with both reads and the two output
    handles, to ``create_and_output_barcode_adjusted_reads``.

    * ``Assay.SNARESEQ``: barcode pieces are the reverse complements of the
      ``SNARESEQ_BARCODE_SEGMENTS`` slices of the barcode read, in reversed
      order; the UMI is the ``UMI_SEGMENT`` slice of the barcode read.
    * ``Assay.SNSEQ``: the single barcode piece is the
      ``SNSEQ_BARCODE_SEGMENT`` slice of read 2; the UMI is empty.

    Args:
        assay: Assay type; also supplies ``fastq_count`` for file grouping.
        fastq_dirs: Directories searched with ``find_grouped_fastq_files``.
        output_filename_prefix: Prefix of the ``_R1.fastq`` / ``_R2.fastq``
            output file names.
        output_dir: Directory in which the two output files are created.

    Raises:
        ValueError: If a read is encountered for an assay other than the two
            handled above. Previously this case printed a message and then
            failed with ``UnboundLocalError`` on the unset barcode variables.

    Note:
        The readers are combined with ``zip``, so iteration stops at the end
        of the shortest file of each group.
    """
    baraddedf1 = output_dir / f"{output_filename_prefix}_R1.fastq"
    baraddedf2 = output_dir / f"{output_filename_prefix}_R2.fastq"

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, assay.fastq_count)
        for fastq_dir in fastq_dirs)

    with open(baraddedf1, "w") as barf1addedout, open(baraddedf2,
                                                      "w") as barf2addedout:
        for fastq1_file, fastq2_file, barcode_file in all_fastqs:
            print("Adding barcodes to", fastq1_file, "and", fastq2_file,
                  "using", barcode_file)
            fastq1_reader = fastq_reader(fastq1_file)
            fastq2_reader = fastq_reader(fastq2_file)
            barcode_reader = fastq_reader(barcode_file)

            for f1r, f2r, bar in zip(fastq1_reader, fastq2_reader,
                                     barcode_reader):
                if assay == Assay.SNARESEQ:
                    # This code assumes the barcode fastq is the third fastq; e.g. '...R3...'
                    barcode_pieces = [
                        revcomp(bar.seq[s]) for s in SNARESEQ_BARCODE_SEGMENTS
                    ]
                    # For some reason the barcode pieces are reversed in the list
                    # so get them back in the right order
                    barcode_pieces.reverse()
                    umi_seq = bar.seq[UMI_SEGMENT]
                elif assay == Assay.SNSEQ:
                    barcode_pieces = [f2r.seq[SNSEQ_BARCODE_SEGMENT]]
                    umi_seq = ""
                else:
                    # Fail explicitly instead of falling through with
                    # barcode_pieces / umi_seq unbound.
                    raise ValueError(
                        "Could not adjust barcodes for assay {}".format(assay))

                create_and_output_barcode_adjusted_reads(
                    f1r, f2r, umi_seq, barcode_pieces, barf1addedout,
                    barf2addedout)
def convert_fastq(input_path: Path, output_path: Union[str, Path]):
    """Rewrite every read ID of a FASTQ file into ``READ_ID_FORMAT``.

    The original ID (without leading ``@`` characters) is split at colons:
    the text before the first colon becomes the barcode, and everything
    after it, with the remaining colons removed, becomes the previous read
    ID. The UMI field is left empty. Sequence, separator line and qualities
    are copied unchanged into the file at ``output_path``.
    """
    print("Converting", input_path, "to", output_path)
    with open(output_path, "wt") as out_handle:
        for original in fastq_reader(input_path):
            # The ID layout of this one data set is known, but parsing stays
            # lenient: an ID without any colon simply yields an empty
            # remainder.
            stripped_id = original.read_id.lstrip("@")
            barcode, _, remainder = stripped_id.partition(":")
            rewritten = Read(
                read_id=READ_ID_FORMAT.format(
                    barcode=barcode,
                    umi="",
                    previous_read_id=remainder.replace(":", ""),
                ),
                seq=original.seq,
                unused=original.unused,
                qual=original.qual,
            )
            print(rewritten.serialize(), file=out_handle)
예제 #5
0
def _barcode_umi_read_from_id(mapper, transcript_read):
    """Build the barcode/UMI read encoded in one transcript read's ID."""
    fields = transcript_read.read_id.lstrip('@').split('|')
    # Lookups are performed in field order: P7, P5, RT2, and finally the UMI.
    p7_seq = mapper.p7_mapping[fields[2]]
    p5_seq = mapper.p5_mapping[fields[3]]
    rt2_seq = mapper.rt2_mapping[fields[4]]

    sequence = ''.join((p7_seq, p5_seq, rt2_seq, fields[5]))
    return Read(
        read_id=transcript_read.read_id,
        seq=sequence,
        unused=transcript_read.unused,
        qual=get_base_qual_str(len(sequence)),
    )


def convert(mapper: BarcodeMapper, input_fastq: Path, output_dir: Path,
            basename: str):
    """Write ``<basename>_R2.fastq`` and a synthesized ``<basename>_R1.fastq``.

    The input FASTQ is passed to ``decompress_fastq`` to produce the R2
    (transcript) file. It is then read again, and for every read a
    barcode/UMI read is derived from the ``|``-separated read ID: fields 2,
    3 and 4 are looked up in the mapper's P7, P5 and RT2 tables, and field 5
    is the UMI. The R1 sequence is P7 + P5 + RT2 + UMI, with a quality string
    from ``get_base_qual_str`` of matching length.

    ``output_dir`` is created (including parents) if it does not exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    print('Converting', input_fastq)
    r1_path = output_dir / f'{basename}_R1.fastq'
    r2_path = output_dir / f'{basename}_R2.fastq'
    decompress_fastq(input_fastq, r2_path)

    with smart_open(r1_path, 'wt') as r1_out:
        for transcript_read in fastq_reader(input_fastq):
            r1_read = _barcode_umi_read_from_id(mapper, transcript_read)
            print(r1_read.serialize(), file=r1_out)