def main(
    fastq_dirs: Iterable[Path],
    output_dir: Path = Path(),
    barcode_filename: Path = BARCODE_ALLOWLIST_FILE,
    n6_dt_mapping_file: Path = N6_DT_MAPPING_FILE,
):
    """Correct barcodes in paired FASTQ files and write the usable reads.

    For every (transcript, barcode/UMI) FASTQ pair found under ``fastq_dirs``,
    each barcode segment of the barcode/UMI read is reverse-complemented and
    passed through a ``BarcodeCorrecter`` built with ``edit_distance=1``. A
    read pair is kept only if every segment was corrected to a truthy value;
    the third corrected segment is additionally translated through the N6/dT
    mapping. Kept reads are written to two files under ``output_dir``: the
    transcript read unchanged, and a rebuilt barcode/UMI read whose sequence
    is the corrected barcode pieces followed by the UMI.

    Per-pair statistics (total reads, usable reads, usable proportion) are
    printed to standard output.

    :param fastq_dirs: directories searched for grouped FASTQ files (groups
        of 2 files each).
    :param output_dir: directory in which both output FASTQ files are created.
    :param barcode_filename: file read by ``read_barcode_allowlist``.
    :param n6_dt_mapping_file: file read by ``read_n6_dt_mapping``.
    """
    barcode_allowlist = read_barcode_allowlist(barcode_filename)
    correcter = bu.BarcodeCorrecter(barcode_allowlist, edit_distance=1)
    n6_dt_mapping = read_n6_dt_mapping(n6_dt_mapping_file)

    buf = output_dir / BARCODE_UMI_FASTQ_PATH
    trf = output_dir / TRANSCRIPT_FASTQ_PATH

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, 2) for fastq_dir in fastq_dirs
    )

    with open(buf, "w") as cbo, open(trf, "w") as tro:
        for transcript_fastq, barcode_umi_fastq in all_fastqs:
            usable_count = 0
            # Stays 0 if the pair contains no reads at all.
            i = 0
            print("Correcting barcodes in", transcript_fastq, "and", barcode_umi_fastq)
            transcript_reader = fastq_reader(transcript_fastq)
            barcode_umi_reader = fastq_reader(barcode_umi_fastq)
            for i, (tr, br) in enumerate(zip(transcript_reader, barcode_umi_reader), 1):
                barcode_pieces = [revcomp(br.seq[s]) for s in BARCODE_SEGMENTS]
                rc_corrected = [correcter.correct(barcode) for barcode in barcode_pieces]
                # Skip the read pair unless every segment could be corrected.
                if not all(rc_corrected):
                    continue

                rc_corrected[2] = n6_dt_mapping[rc_corrected[2]]
                # Undo the reverse complement applied before correction.
                corrected = [revcomp(rc_bc) for rc_bc in rc_corrected]
                usable_count += 1

                umi_seq = br.seq[UMI_SEGMENT]
                umi_qual = br.qual[UMI_SEGMENT]
                new_br = Read(
                    read_id=br.read_id,
                    seq="".join(corrected + [umi_seq]),
                    unused=br.unused,
                    # Corrected barcode bases get a placeholder quality string;
                    # only the UMI keeps its original qualities.
                    qual=BARCODE_QUAL_DUMMY + umi_qual,
                )
                print(tr.serialize(), file=tro)
                print(new_br.serialize(), file=cbo)

            # An empty FASTQ pair leaves i == 0; report an undefined proportion
            # instead of crashing with ZeroDivisionError.
            proportion = usable_count / i if i else float("nan")
            print("Total count:", i)
            print("Usable count:", usable_count)
            print("Proportion:", proportion)
def convert(mapper: BarcodeMapper, input_fastq: Path, output_dir: Path, basename: str):
    """Build ``<basename>_R1.fastq`` (barcode + UMI) and ``<basename>_R2.fastq``.

    The R2 file is produced by ``decompress_fastq`` from ``input_fastq``. The
    R1 file gets one synthetic read per input read, with the sequence derived
    from the input read ID. After stripping leading ``@`` characters the ID is
    split on ``|``; the fields are used as follows:

    * field 2: key into ``mapper.p7_mapping``
    * field 3: key into ``mapper.p5_mapping``
    * field 4: ``<rt2 key>_<ligation key>``, looked up in
      ``mapper.rt2_mapping`` and ``mapper.ligation_mapping``
    * field 5: the UMI, used verbatim

    The emitted sequence is ``UMI + p7 + p5 + rt2 + ligation``; its quality
    string comes from ``get_base_qual_str`` for the sequence length.

    :param mapper: provides the p7/p5/rt2/ligation lookup tables.
    :param input_fastq: FASTQ file whose read IDs carry the barcode fields.
    :param output_dir: destination directory, created if missing.
    :param basename: filename prefix for both output files.
    """
    output_dir.mkdir(exist_ok=True, parents=True)
    print("Converting", input_fastq)

    r1_path = output_dir / f"{basename}_R1.fastq"
    r2_path = output_dir / f"{basename}_R2.fastq"
    decompress_fastq(input_fastq, r2_path)

    with smart_open(r1_path, "wt") as r1_out:
        for read in fastq_reader(input_fastq):
            fields = read.read_id.lstrip("@").split("|")

            # Lookups stay in this order so that a malformed ID fails on the
            # same field as before.
            p7_seq = mapper.p7_mapping[fields[2]]
            p5_seq = mapper.p5_mapping[fields[3]]
            rt2_key, ligation_key = fields[4].split("_")
            rt2_seq = mapper.rt2_mapping[rt2_key]
            ligation_seq = mapper.ligation_mapping[ligation_key]
            umi_seq = fields[5]

            sequence = "".join((umi_seq, p7_seq, p5_seq, rt2_seq, ligation_seq))
            synthetic_read = Read(
                read_id=read.read_id,
                seq=sequence,
                unused=read.unused,
                qual=get_base_qual_str(len(sequence)),
            )
            print(synthetic_read.serialize(), file=r1_out)
def main(assay: Assay, fastq_dirs: Iterable[Path], output_filename_prefix, output_dir: Path):
    """Attach barcodes to paired reads and write them to two FASTQ files.

    FASTQ files are discovered under each directory in ``fastq_dirs`` in
    groups of ``assay.fastq_count`` and consumed as
    ``(fastq1, fastq2, barcode)`` triples. For every read triple the barcode
    pieces and UMI are extracted according to ``assay`` and handed, together
    with the two reads and both output handles, to
    ``create_and_output_barcode_adjusted_reads``.

    :param assay: selects how barcodes are extracted; ``Assay.SNARESEQ`` and
        ``Assay.SNSEQ`` are supported.
    :param fastq_dirs: directories searched for grouped FASTQ files.
    :param output_filename_prefix: prefix of the two output files,
        ``<prefix>_R1.fastq`` and ``<prefix>_R2.fastq``.
    :param output_dir: directory in which the output files are created.
    :raises ValueError: if a read is encountered and ``assay`` is not one of
        the supported assays.
    """
    baraddedf1 = output_dir / f"{output_filename_prefix}_R1.fastq"
    baraddedf2 = output_dir / f"{output_filename_prefix}_R2.fastq"

    all_fastqs = chain.from_iterable(
        find_grouped_fastq_files(fastq_dir, assay.fastq_count) for fastq_dir in fastq_dirs
    )

    with open(baraddedf1, "w") as barf1addedout, open(baraddedf2, "w") as barf2addedout:
        for fastq1_file, fastq2_file, barcode_file in all_fastqs:
            print("Adding barcodes to", fastq1_file, "and", fastq2_file, "using", barcode_file)
            fastq1_reader = fastq_reader(fastq1_file)
            fastq2_reader = fastq_reader(fastq2_file)
            barcode_reader = fastq_reader(barcode_file)

            for f1r, f2r, bar in zip(fastq1_reader, fastq2_reader, barcode_reader):
                if assay == Assay.SNARESEQ:
                    # This code assumes the barcode fastq is the third fastq;
                    # e.g. '...R3...'
                    barcode_pieces = [revcomp(bar.seq[s]) for s in SNARESEQ_BARCODE_SEGMENTS]
                    # The barcode pieces come out in reverse order, so put
                    # them back in the right order.
                    barcode_pieces.reverse()
                    umi_seq = bar.seq[UMI_SEGMENT]
                elif assay == Assay.SNSEQ:
                    # The barcode is taken from the second read; no UMI is used.
                    barcode_pieces = [f2r.seq[SNSEQ_BARCODE_SEGMENT]]
                    umi_seq = ""
                else:
                    # Previously this branch only printed a message and then
                    # fell through to use the unassigned `umi_seq` and
                    # `barcode_pieces`, crashing with NameError. Fail with an
                    # explicit, descriptive error instead.
                    raise ValueError(f"Could not adjust barcodes for assay {assay}")

                create_and_output_barcode_adjusted_reads(
                    f1r, f2r, umi_seq, barcode_pieces, barf1addedout, barf2addedout
                )
def convert_fastq(input_path: Path, output_path: Union[str, Path]):
    """Rewrite a FASTQ file so each read ID follows ``READ_ID_FORMAT``.

    Each input read ID is stripped of leading ``@`` characters and split at
    its first ``:``. The part before the colon is used as the barcode and the
    part after it as the previous read ID; the UMI field is always empty.
    Sequence, the ``unused`` field and qualities are copied unchanged.

    :param input_path: FASTQ file to read.
    :param output_path: destination file, opened in text mode and overwritten.
    """
    print("Converting", input_path, "to", output_path)
    with open(output_path, "wt") as f:
        for read in fastq_reader(input_path):
            # We know the read ID format of this single data set, but
            # be lenient in what we accept and how we parse this.
            # Split only at the first colon so that any further colons in the
            # remainder are preserved; gluing the pieces together without a
            # separator could make distinct read IDs collide.
            barcode, _, previous_read_id = read.read_id.lstrip("@").partition(":")
            new_read_id = READ_ID_FORMAT.format(
                barcode=barcode,
                umi="",
                previous_read_id=previous_read_id,
            )
            new_read = Read(
                read_id=new_read_id,
                seq=read.seq,
                unused=read.unused,
                qual=read.qual,
            )
            print(new_read.serialize(), file=f)
def convert(mapper: BarcodeMapper, input_fastq: Path, output_dir: Path, basename: str):
    """Build ``<basename>_R1.fastq`` (barcode + UMI) and ``<basename>_R2.fastq``.

    The R2 file is produced by ``decompress_fastq`` from ``input_fastq``. The
    R1 file gets one synthetic read per input read, with the sequence derived
    from the input read ID. After stripping leading ``@`` characters the ID is
    split on ``|``; the fields are used as follows:

    * field 2: key into ``mapper.p7_mapping``
    * field 3: key into ``mapper.p5_mapping``
    * field 4: key into ``mapper.rt2_mapping``
    * field 5: the UMI, used verbatim

    The emitted sequence is ``p7 + p5 + rt2 + UMI``; its quality string comes
    from ``get_base_qual_str`` for the sequence length.

    :param mapper: provides the p7/p5/rt2 lookup tables.
    :param input_fastq: FASTQ file whose read IDs carry the barcode fields.
    :param output_dir: destination directory, created if missing.
    :param basename: filename prefix for both output files.
    """
    output_dir.mkdir(exist_ok=True, parents=True)
    print("Converting", input_fastq)

    r1_path = output_dir / f"{basename}_R1.fastq"
    r2_path = output_dir / f"{basename}_R2.fastq"
    decompress_fastq(input_fastq, r2_path)

    with smart_open(r1_path, "wt") as r1_out:
        for read in fastq_reader(input_fastq):
            fields = read.read_id.lstrip("@").split("|")

            # Lookups stay in this order so that a malformed ID fails on the
            # same field as before.
            p7_seq = mapper.p7_mapping[fields[2]]
            p5_seq = mapper.p5_mapping[fields[3]]
            rt2_seq = mapper.rt2_mapping[fields[4]]
            umi_seq = fields[5]

            sequence = "".join((p7_seq, p5_seq, rt2_seq, umi_seq))
            synthetic_read = Read(
                read_id=read.read_id,
                seq=sequence,
                unused=read.unused,
                qual=get_base_qual_str(len(sequence)),
            )
            print(synthetic_read.serialize(), file=r1_out)