def primer_len_filter(path, sample):
    sequence = dinopy.FastqReader(path)
    assembled = dinopy.FastqWriter(
        path.rsplit("_", 1)[0] + "_assembled.fastq")
    filt_out = dinopy.FastqWriter(
        path.rsplit("_", 1)[0] + "_filtered_out.fastq")
    assembled_counter = 0
    filt_out_counter = 0
    assembled.open()
    filt_out.open()
    for read in sequence.reads(quality_values=True):
        name = read.name.decode()
        seq = check_for_match(read.sequence.decode(), sample)
        if seq[0] and snakemake.params.maxlen >= len(seq[1]) >= snakemake.params.minlen:
            assembled.write(seq[1].encode(), name.split(" ")[0].encode(),
                            read.quality)
            assembled_counter += 1
        else:
            filt_out.write(read.sequence, name.split(" ")[0].encode(),
                           read.quality)
            filt_out_counter += 1
    logging.info("{}: {} sequences were kept, {} sequences were filtered out".format(
        sample, assembled_counter, filt_out_counter))
    assembled.close()
    filt_out.close()
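# `check_for_match` is defined elsewhere in the pipeline; from its use above it
# returns a (matched, trimmed_sequence) pair of a bool and a str. A minimal,
# hypothetical stand-in with that shape (placeholder primer, not the pipeline's
# real matching logic) for testing the filter in isolation could look like:
def check_for_match_stub(sequence, sample):
    primer = "ACGTACGT"  # placeholder primer sequence
    if sequence.startswith(primer):
        return True, sequence[len(primer):]
    return False, sequence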
def get_dedup_coverages(fq_file):
    """Count per-locus read coverage after deduplication, based on the locus
    annotations in the read names."""
    singleton_pattern = r"at_locus:'(\d+)'"
    consensus_pattern = r"Locus_(\d+)"
    locus_coverage_counter = Counter()
    fqr = dp.FastqReader(fq_file)
    for r in fqr.reads():
        name = r.name.decode()
        singleton = re.search(singleton_pattern, name)
        consensus = re.findall(consensus_pattern, name)
        if singleton is not None:
            locus = singleton.groups()[0]
            locus_coverage_counter[int(locus)] += 1
        if consensus:
            merged_loci = [int(s) for s in consensus]
            if len(set(merged_loci)) > 1:
                raise ValueError("Overmerge!")
            else:
                locus_coverage_counter[list(set(merged_loci))[0]] += 1
        if singleton is None and not consensus:
            raise ValueError("BAD NAME. No matches")
        if singleton is not None and consensus:
            raise ValueError("BAD NAME. Two matches")
    loci, coverage = zip(*locus_coverage_counter.items())
    df = pd.DataFrame({"locus": loci, "after_dedup": coverage})
    return df
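# Minimal sketch of what the two name patterns above match (hypothetical,
# simplified read names; real read names carry more fields). A singleton read
# carries an at_locus annotation, a consensus read carries one or more
# Locus_<n> tags that must all agree.
import re

singleton_name = "read_0042 at_locus:'7'"        # a single, un-merged read
consensus_name = "consensus_13 Locus_7_Locus_7"  # a consensus of merged reads
assert re.search(r"at_locus:'(\d+)'", singleton_name).groups()[0] == "7"
assert re.findall(r"Locus_(\d+)", consensus_name) == ["7", "7"]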
def split_files(p5_file, p7_file, force):
    fqr_fw = dp.FastqReader(p5_file)
    fqr_rev = dp.FastqReader(p7_file)
    output_files = {}
    for (fw, rev) in zip(fqr_fw.reads(), fqr_rev.reads()):
        # get the name line of the read (forward or reverse doesn't matter)
        nl = rev.name
        items = nl.split()
        # This uses the perfect p7 barcode from the annotation.
        # To use the simulated barcode, which can contain sequencing errors,
        # use items[1].split(b":")[-1]
        if items[5].startswith(b"p7_bc"):
            # extract the barcode sequence
            p7_bc = items[5].split(b":")[1].strip(b"'")
            # check if a file writer for the barcode is already available
            if p7_bc not in output_files:
                filename_fw = f"reads_{p7_bc.decode()}_1.fq.gz"
                filename_rev = f"reads_{p7_bc.decode()}_2.fq.gz"
                fqw_fw = dp.FastqWriter(filename_fw, force_overwrite=force)
                fqw_rev = dp.FastqWriter(filename_rev, force_overwrite=force)
                fqw_fw.open()
                fqw_rev.open()
                print(f"\nFound new barcode: {p7_bc.decode()}")
                print("Writing to:")
                print(f"  -> {filename_fw}")
                print(f"  -> {filename_rev}")
                output_files[p7_bc] = (fqw_fw, fqw_rev)
            else:
                fqw_fw, fqw_rev = output_files[p7_bc]
            # write the read pair to the writers for the chosen barcode
            fqw_fw.write(*fw)
            fqw_rev.write(*rev)
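# Illustration of the name-line layout the code above assumes (hypothetical,
# abbreviated name line; the real simulated reads carry more fields). Only the
# positions of the simulated barcode (items[1]) and the annotated perfect
# barcode (items[5]) matter here.
nameline = b"read_12:1:1 sim_p7_bc:ATCACG spacer:'' p5_bc:'GGTT' meta:'' p7_bc:'ATCACG'"
items = nameline.split()
assert items[5].startswith(b"p7_bc")
assert items[5].split(b":")[1].strip(b"'") == b"ATCACG"   # perfect barcode
assert items[1].split(b":")[-1] == b"ATCACG"              # simulated barcode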
def fasta2dazzdb(args: argparse.Namespace):
    """Fix the FASTA/FASTQ headers/IDs to a DAZZ_DB compatible format so that
    these reads can be imported."""
    file_format = args.format
    if not file_format:
        if args.input != sys.stdin:
            filename = args.input.name
            file_ext = filename[filename.rfind('.')+1:]
            file_format = 'fastq' if file_ext in ('fq', 'fastq') else 'fasta'
    if not file_format:
        logger.error("Could not determine file format. Please specify using "
                     "the -f option.")
        return
    if file_format == 'fastq':
        seq_iter = iter(dinopy.FastqReader(args.input).reads(
            quality_values=False))
    else:
        seq_iter = iter(dinopy.FastaReader(args.input).reads(read_names=True))
    if args.input == sys.stdin:
        name = args.name if args.name else random_string(10)
    else:
        name = os.path.basename(args.input.name)
    moviename = daligner.generate_moviename_hash(name)
    name_mapping = {}
    seq_iter = iter(daligner.fix_header(seq_iter, moviename, name_mapping))
    logger.info("Converting FASTA/FASTQ entries...")
    with dinopy.FastaWriter(args.output, force_overwrite=True) as fw:
        fw.write_entries(seq_iter)
    if args.translations:
        logger.info("Writing name mappings to file...")
        json.dump(name_mapping, args.translations)
    logger.info("Done.")
def parse_fq_file(fq_file):
    pcr_counts = defaultdict(PCRRecord)
    fqr = dp.FastqReader(fq_file)
    for read in fqr.reads():
        line_info = parse_info_line(read.name.decode())
        try:
            locus = line_info["at_locus"]
        except KeyError:
            print(read, line_info)
            raise
        # count the number of real and PCR reads for this locus
        # tested against a solution with grep + wc -l
        # grep ^@ data/ddRAGEdataset_ATCACG_1.fastq | grep "at_locus:'1'" | grep -v PCR | wc -l
        if line_info["pcr_copy"]:
            pcr_counts[locus].pcr += 1
        else:
            pcr_counts[locus].real += 1
    return pcr_counts
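# `PCRRecord` is not defined in this excerpt; a minimal stand-in that is
# compatible with the counting above (two integer counters defaulting to 0,
# constructible with no arguments for defaultdict) could look like this:
from dataclasses import dataclass

@dataclass
class PCRRecord:
    real: int = 0
    pcr: int = 0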
def demultiplexer(file_path_list):
    samples = []
    output_filepaths = []
    for sample in primertable.keys():
        samples.append(sample + '_R1')
        samples.append(sample + '_R2')
        output_filepaths.append('demultiplexed/' + sample + '_R1.fastq.gz')
        output_filepaths.append('demultiplexed/' + sample + '_R2.fastq.gz')
    # Create a dict of writers.
    writers = {name: dinopy.FastqWriter(path)
               for name, path in zip(samples, output_filepaths)}
    # Open all writers.
    for writer in writers.values():
        writer.open()
    # Start writing.
    for file_path in file_path_list:
        sequence = dinopy.FastqReader(file_path)
        for read in sequence.reads(quality_values=True):
            for sample in primertable.keys():
                if check_for_match_fwd_demulti(read.sequence.decode(), sample):
                    writers[sample + '_R1'].write(read.sequence, read.name,
                                                  read.quality)
                elif check_for_match_rev_demulti(read.sequence.decode(), sample):
                    writers[sample + '_R2'].write(read.sequence, read.name,
                                                  read.quality)
                else:
                    pass
    # Close all writers.
    for writer in writers.values():
        writer.close()
"""Merge the p5 and p7 reads from the input files into one read, joined by join bases. """ import sys import dinopy # redirect stderr to logfile sys.stderr = open(snakemake.log[0], "w") # get and open input files from the calling snakemake rules input directive p5_file, p7_file, p5_length, p7_length = snakemake.input print(f"Opening files:\n {p5_file}\n {p7_file}", file=sys.stderr) print(f"Writing to:\n {snakemake.output.merged}", file=sys.stderr) p5_reader = dinopy.FastqReader(p5_file) p7_reader = dinopy.FastqReader(p7_file) with open(p5_length, "r") as p5_len_file: p5_len = int(p5_len_file.readline().strip()) with open(p7_length, "r") as p7_len_file: p7_len = int(p7_len_file.readline().strip()) # check if the quality value for the join sequence is valid join_quality = snakemake.params.join_quality if len(join_quality) != 1: print( "Please specify a single Sanger Phred+33 quality value for " "join_quality.", file=sys.stderr) sys.exit(1) else:
""" """ import sys import dinopy from shutil import copyfile # redirect stderr to logfile sys.stderr = open(snakemake.log[0], "w") p7_reader = dinopy.FastqReader(snakemake.input.fq2) umi_len = snakemake.params.umi["len"] # copy p5 file without touching it copyfile(snakemake.input.fq1, snakemake.output.fq1) # trim the first UMI-len bases from the p7 read with dinopy.FastqWriter(snakemake.output.fq2, force_overwrite=True) as writer: for seq, name, qual in p7_reader.reads(read_names=True, quality_values=True): writer.write(seq[umi_len:], name, qual[umi_len:])
import sys

import dinopy

# stream FASTQ from stdin to stdout, truncating each read name at the first space
in_fastq = dinopy.FastqReader(sys.stdin)
with dinopy.FastqWriter(sys.stdout) as out_fastq:
    out_fastq.write_reads(
        (read.sequence, read.name.split(b" ")[0], read.quality)
        for read in in_fastq.reads())
filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s") logging.info(run_id) WORKING_DIR = os.path.abspath(pathname) logging.info("WORKING_DIR") logging.info(WORKING_DIR) logging.info("input: SRA fastq file") logging.info(input_id) logging.info("output: trimmed file") logging.info(output_file) input_path = input_id + ".fastq" out_path_cbsu = os.path.join(WORKING_DIR, output_file) fqr = dinopy.FastqReader(input_path) good_reads = OrderedDict() reads_length = [] pass_quality = 0 pass_length = 0 has_adapter = 0 sample2inadapter = { "SRR2078285": "GATCAGCAG", "SRR2078286": "ACACAGCAG", "SRR2078287": "ACTCAGCAG", "SRR2078288": "ACGCAGCAG", "SRR2078289": "AGACAGCAG", "SRR2078290": "ATCCAGCAG", "SRR2078291": "ATGCAGCAG",
def read_sorter(primertable):
    if not os.path.exists('demultiplexed/not_sorted'):
        os.mkdir('demultiplexed/not_sorted')
    samples = []
    output_filepaths = []
    for sample in primertable.keys():
        samples.append(sample + snakemake.params.name_ext[:-1] + '1')
        samples.append(sample + snakemake.params.name_ext[:-1] + '2')
        samples.append(sample + '_not_sorted')
        output_filepaths.append('demultiplexed/' + sample + '_R1.fastq.gz')
        output_filepaths.append('demultiplexed/' + sample + '_R2.fastq.gz')
        output_filepaths.append('demultiplexed/not_sorted/' + sample
                                + '_not_sorted.fastq.gz')
    # Create a dict of writers.
    writers = {name: dinopy.FastqWriter(path)
               for name, path in zip(samples, output_filepaths)}
    # Open all writers.
    for writer in writers.values():
        writer.open()
    # Start writing.
    for sample in primertable.keys():
        fwd = dinopy.FastqReader('../' + data_folder + '/' + sample
                                 + str(snakemake.params.name_ext)[:-1] + '1.fastq.gz')
        rev = dinopy.FastqReader('../' + data_folder + '/' + sample
                                 + str(snakemake.params.name_ext)[:-1] + '2.fastq.gz')
        for read_f, read_r in zip(fwd.reads(quality_values=True),
                                  rev.reads(quality_values=True)):
            # Both reads match in the expected orientation.
            if check_for_match_sort_fwd(
                    read_f.sequence.decode(),
                    sample.split('/')[-1]) and check_for_match_sort_rev(
                    read_r.sequence.decode(), sample.split('/')[-1]):
                writers[sample + '_R1'].write(read_f.sequence, read_f.name,
                                              read_f.quality)
                writers[sample + '_R2'].write(read_r.sequence, read_r.name,
                                              read_r.quality)
            # The reads are swapped, so write them to the opposite output files.
            elif check_for_match_sort_rev(
                    read_f.sequence.decode(),
                    sample.split('/')[-1]) and check_for_match_sort_fwd(
                    read_r.sequence.decode(), sample.split('/')[-1]):
                writers[sample + '_R2'].write(read_f.sequence, read_f.name,
                                              read_f.quality)
                writers[sample + '_R1'].write(read_r.sequence, read_r.name,
                                              read_r.quality)
            else:
                writers[sample + '_not_sorted'].write(read_f.sequence,
                                                      read_f.name, read_f.quality)
                writers[sample + '_not_sorted'].write(read_r.sequence,
                                                      read_r.name, read_r.quality)
    # Close all writers.
    for writer in writers.values():
        writer.close()
import csv
import subprocess

import pysam
import dinopy
import numpy as np

log = open(snakemake.log[0], "w")


def parse_clusters(stdout):
    for consensus, size, seqids in csv.reader(stdout, delimiter="\t"):
        # parse seqids and subtract 1 because starcode provides 1-based indices
        yield np.fromiter(map(int, seqids.split(",")), dtype=int) - 1


# load DBR sequences
dbrs = np.array([seq[:snakemake.params.dbr_len].decode()
                 for seq in dinopy.FastqReader(snakemake.input.fq2)
                 .reads(read_names=False, quality_values=False)])

clusters = dict()

# cluster by read sequences
with subprocess.Popen(f"starcode --dist {snakemake.params.seq_dist} --seq-id "
                      f"-1 <(gzip -d -c {snakemake.input.fq1}) -2 <(seqtk trimfq "
                      f"-b {snakemake.params.dbr_len} {snakemake.input.fq2})",
                      shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                      executable="bash", universal_newlines=True) as seqclust:
    cluster_id = 0
    # iterate over clusters
    for seqids in parse_clusters(seqclust.stdout):
        # get DBRs of clustered sequences
        cluster_dbrs = dbrs[seqids]
        # cluster by DBRs
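# Standalone illustration (separate from the truncated script above) of the
# tab-separated format that parse_clusters expects from starcode's --seq-id
# output: cluster consensus, cluster size, and comma-separated 1-based
# sequence ids. The example lines are hand-written; the ids come back as
# 0-based numpy index arrays.
example_lines = ["ACGTACGT\t3\t1,4,7\n", "TTGACCAA\t2\t2,3\n"]
for ids in parse_clusters(example_lines):
    print(ids)  # -> [0 3 6], then [1 2]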