        plot_data_dict[data_pair[6]][5].append(data_pair[4])
        plot_data_dict[data_pair[6]][6].append(data_pair[5])
        plot_data_dict[data_pair[6]][7].append(color_dict[data_pair[6]])

        count = len(plot_data_dict[data_pair[6]][0])
        if count > 1:
            previous = plot_data_dict[data_pair[6]][0][count - 2]
            # Stack each new bar above the previous one: prior offset plus a
            # small gap plus half of the previous and current bar values.
            plot_data_dict[data_pair[6]][8]\
                .append(plot_data_dict[data_pair[6]][8][count - 2] + 0.0007 + (0.5 * previous)
                        + data_pair[0] * 0.5)

    return plot_data_dict


# This is here to run the module as a stand-alone.
if __name__ == '__main__':
    Tool_Box.debug_messenger("Standing Alone")

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    scarmapperplot(args)
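# Illustrative sketch, not part of the pipeline: how the running offset stored at
# plot_data_dict[...][8] accumulates. Each new bar sits above the previous one by a
# small gap (0.0007) plus half of each bar's value, assuming data_pair[0] holds the
# current bar value. The helper name below is hypothetical.
def _example_bar_offsets(values, gap=0.0007):
    """Return stacked center offsets for a list of bar values."""
    offsets = [0.0]
    for previous, current in zip(values, values[1:]):
        offsets.append(offsets[-1] + gap + 0.5 * previous + 0.5 * current)
    return offsets

# Example: _example_bar_offsets([0.2, 0.1, 0.4]) -> [0.0, 0.1507, 0.4014]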
def temp_file_writer(self, limit):
    """
    Write the temporary FASTQ files and build a list of temporary BAM file
    names for use later.
    :return: (fastq_file_list, bam_file_list)
    """

    self.log.info("Begin writing temporary FASTQ files.")

    i = 0
    temp_file1 = None
    temp_file2 = None
    fastq_file_list = []
    bam_file_list = []
    read_count = 0
    limit_counter = 0

    while read_count <= self.read_count:
        try:
            # These generators return whole reads, not lines.
            fastq1_read = next(self.fastq1_file.seq_read())
            fastq2_read = next(self.fastq2_file.seq_read())

            if self.index1_file is not None:
                fastq3_read = next(self.index1_file.seq_read())

        except StopIteration:
            # Input exhausted; keep counting so the loop still terminates.
            read_count += 1
            continue

        read_count += 1

        try:
            fastq1_n_frac = fastq1_read.seq.count("N") / len(fastq1_read.seq)
            fastq2_n_frac = fastq2_read.seq.count("N") / len(fastq2_read.seq)
        except ZeroDivisionError:
            continue

        # Apply the length and N-fraction filters.
        if (len(fastq1_read.seq) < int(self.args.Minimum_Length)
                or len(fastq2_read.seq) < int(self.args.Minimum_Length)
                or fastq1_n_frac >= float(self.args.N_Limit)
                or fastq2_n_frac >= float(self.args.N_Limit)):
            continue

        # Open a new pair of temporary files every `limit` reads.
        if limit_counter % limit == 0:
            if temp_file1:
                temp_file1.close()
                limit_counter = 0
            if temp_file2:
                temp_file2.close()

            file1 = "{0}{1}_R1_tmp_{2}.fastq.gz".format(self.args.WorkingFolder, self.args.Job_Name, i)
            file2 = "{0}{1}_R2_tmp_{2}.fastq.gz".format(self.args.WorkingFolder, self.args.Job_Name, i)
            bam_file_list.append("{0}{1}_R1_tmp_{2}.bam".format(self.args.WorkingFolder, self.args.Job_Name, i))
            fastq_file_list.append((file1, file2))

            temp_file1 = Writer(self.log, file1)
            temp_file2 = Writer(self.log, file2)
            self.log.info("Writing {0} and {1}".format(file1, file2))
            i += 1

        limit_counter += 1

        # BAM files are missing the barcodes because of a space in some of the header lines.
        # fastq1_read.name = fastq1_read.name.replace(" ", ":")
        # fastq2_read.name = fastq2_read.name.replace(" ", ":")

        # Add the UMTs to the headers.
        if self.args.HaloPLEX:
            # HaloPLEX UMTs come from the index read, so an index FASTQ is required here.
            umi = fastq3_read.seq
            header1 = "{0}|{1}:{2}".format(fastq1_read.name.split(":")[-1], umi, fastq1_read.name)
            header2 = "{0}|{1}:{2}".format(fastq2_read.name.split(":")[-1], umi, fastq2_read.name)

        elif self.args.ThruPLEX:
            # header1 = "{0}|{1}".format(fastq1_read.name.split(":")[-1], fastq1_read.name)
            umt1 = fastq1_read.seq[:6]
            umt2 = fastq2_read.seq[:6]
            header1 = "{0}|{1}:{2}".format(umt1, umt2, fastq1_read.name)
            header2 = "{0}|{1}:{2}".format(umt1, umt2, fastq2_read.name)

        else:
            Tool_Box.debug_messenger("Only HaloPLEX or ThruPLEX currently enabled.")
            self.log.error("Only HaloPLEX or ThruPLEX currently enabled.")
            raise SystemExit(1)

        # Trim adapter sequences from the 5' end if needed.
        if int(self.args.trim) > 0:
            fastq1_read.seq = fastq1_read.seq[int(self.args.trim):]
            fastq1_read.qual = fastq1_read.qual[int(self.args.trim):]
            fastq2_read.seq = fastq2_read.seq[int(self.args.trim):]
            fastq2_read.qual = fastq2_read.qual[int(self.args.trim):]

        fastq1_read.name = header1
        fastq2_read.name = header2

        # Write the relabeled reads, not the input file objects.
        temp_file1.write(fastq1_read)
        temp_file2.write(fastq2_read)

    if temp_file1:
        temp_file1.close()
    if temp_file2:
        temp_file2.close()

    self.log.info("All temporary FASTQ files written")

    return fastq_file_list, bam_file_list
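# Minimal self-contained sketch, not part of the original pipeline, of the chunking
# pattern temp_file_writer uses: a new pair of temporary files opens whenever
# limit_counter % limit == 0, splitting reads into fixed-size chunks. The helper
# name and "job" prefix are hypothetical.
def _example_chunk_names(read_total, limit, prefix="job"):
    """Return the R1/R2 temp-file name pairs the loop above would create."""
    names = []
    for i in range((read_total + limit - 1) // limit):
        names.append(("{0}_R1_tmp_{1}.fastq.gz".format(prefix, i),
                      "{0}_R2_tmp_{1}.fastq.gz".format(prefix, i)))
    return names

# Example: _example_chunk_names(5, 2) yields pairs for chunk indices 0, 1, and 2;
# each pair lines up positionally with one entry in bam_file_list.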
def consensus_demultiplex(self):
    """
    Take a FASTQ file of consensus reads and identify each read by index.
    Write demultiplexed FASTQ files if the user requested them.
    """
    self.log.info("Consensus Index Search")
    eof = False
    start_time = time.time()
    split_time = time.time()
    fastq_file_name_list = []
    fastq_data_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    indexed_read_count = 0
    key_counts = []

    while not eof:
        # Debugging code block; caps processing at read_limit reads.
        if self.args.Verbose == "DEBUG":
            read_limit = 1000000
            if self.read_count > read_limit:
                if self.args.Demultiplex:
                    for index_name in fastq_data_dict:
                        r1_data = fastq_data_dict[index_name]["R1"]
                        r1, r2 = self.fastq_outfile_dict[index_name]
                        r1.write(r1_data)
                        r1.close()

                        if not self.args.PEAR:
                            r2_data = fastq_data_dict[index_name]["R2"]
                            r2.write(r2_data)
                            r2.close()

                Tool_Box.debug_messenger("Limiting Reads Here to {}".format(read_limit))
                eof = True

        fastq2_read = None
        try:
            fastq1_read = next(self.fastq1.seq_read())
            if not self.args.PEAR:
                fastq2_read = next(self.fastq2.seq_read())

        except StopIteration:
            # Input exhausted; flush any buffered demultiplexed reads.
            if self.args.Demultiplex:
                for index_name in fastq_data_dict:
                    r1_data = fastq_data_dict[index_name]["R1"]
                    r1, r2 = self.fastq_outfile_dict[index_name]
                    r1.write(r1_data)
                    r1.close()

                    if not self.args.PEAR:
                        r2_data = fastq_data_dict[index_name]["R2"]
                        r2.write(r2_data)
                        r2.close()

            eof = True
            continue

        self.read_count += 1
        if self.read_count % 100000 == 0:
            elapsed_time = int(time.time() - start_time)
            block_time = int(time.time() - split_time)
            split_time = time.time()
            self.log.info("Processed {} reads in {} seconds. Total elapsed time: {} seconds."
                          .format(self.read_count, block_time, elapsed_time))

        # Match the read with a library index.
        match_found, left_seq, right_seq, index_name, fastq1_read, fastq2_read = \
            self.index_matching(fastq1_read, fastq2_read)

        if match_found:
            indexed_read_count += 1
            locus = self.index_dict[index_name][7]
            phase_key = "{}+{}".format(index_name, locus)
            r2_found = False
            r1_found = False

            if self.args.Platform == "Illumina":
                # Score the phasing and place the reads in a dictionary.
                for r2_phase, r1_phase in zip(self.phase_dict[locus]["R2"], self.phase_dict[locus]["R1"]):
                    r2_phase_name = r2_phase[1]
                    r1_phase_name = r1_phase[1]

                    # Tag reads that should not have any phasing.
                    if not r1_phase[0]:
                        self.phase_count[phase_key]["Phase " + r1_phase_name] = -1
                        self.phase_count[phase_key]["Phase " + r2_phase_name] = -1
                        continue
                    else:
                        # Touch the keys so they exist in the counter with a zero count.
                        self.phase_count[phase_key]["Phase " + r1_phase_name] += 0
                        self.phase_count[phase_key]["Phase " + r2_phase_name] += 0

                    # The phasing is the last N nucleotides of the consensus.
                    if r2_phase[0] == Sequence_Magic.rcomp(fastq1_read.seq[-len(r2_phase[0]):]) and not r2_found:
                        self.phase_count[phase_key]["Phase " + r2_phase_name] += 1
                        r2_found = True

                    if r1_phase[0] == fastq1_read.seq[:len(r1_phase[0])] and not r1_found:
                        self.phase_count[phase_key]["Phase " + r1_phase_name] += 1
                        r1_found = True

                # If no phasing is found, note that.
                if not r2_found:
                    self.phase_count[phase_key]["No Read 2 Phasing"] += 1
                if not r1_found:
                    self.phase_count[phase_key]["No Read 1 Phasing"] += 1

                # The adapters on the Gupta Lab AAVS1.1 libraries are reversed, which
                # reverses the reads; both cases are currently stored unmodified.
                self.sequence_dict[index_name].append(fastq1_read.seq)

            elif self.args.Platform == "TruSeq":
                self.sequence_dict[index_name].append(right_seq)

            elif self.args.Platform == "Ramsden":
                self.sequence_dict[index_name].append(Sequence_Magic.rcomp(fastq1_read.seq))

            else:
                self.log.error("--Platform {} not correctly defined.  Edit the parameter file and try again."
                               .format(self.args.Platform))
                raise SystemExit(1)
            if self.args.Demultiplex:
                fastq_data_dict[index_name]["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])

                if not self.args.PEAR:
                    fastq_data_dict[index_name]["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual])

                fastq_file_name_list.append("{}{}_{}_Consensus.fastq"
                                            .format(self.args.WorkingFolder, self.args.Job_Name, index_name))

        elif self.args.Demultiplex and not match_found:
            fastq_data_dict['Unknown']["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])

            if not self.args.PEAR:
                fastq_data_dict['Unknown']["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual])

            fastq_file_name_list.append("{}{}_Unknown_Consensus.fastq"
                                        .format(self.args.WorkingFolder, self.args.Job_Name))

    if self.args.Demultiplex:
        self.fastq_compress(list(set(fastq_file_name_list)))

    for key in self.sequence_dict:
        key_counts.append(len(self.sequence_dict[key]))

    # The lower limit is used when plotting the data; generally the lowest values are just noise.
    if len(key_counts) == 0:
        self.log.error("No Scar Patterns Found")
        raise SystemExit(1)

    # Half-width of the 90% confidence interval around the mean reads-per-index count.
    lower, upper_limit = stats.norm.interval(0.9, loc=statistics.mean(key_counts),
                                             scale=stats.sem(key_counts))
    lower_limit = statistics.mean(key_counts) - lower

    return indexed_read_count, lower_limit
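# Hedged, illustrative helper, not part of the pipeline: reproduces the lower-limit
# math above with made-up counts. stats.norm.interval returns the endpoints of the
# 90% confidence interval around the mean index count, and the plotting cutoff is
# that interval's half-width (mean minus the lower endpoint).
def _example_lower_limit(key_counts=(180, 210, 195, 12, 205)):
    import statistics
    from scipy import stats

    mean = statistics.mean(key_counts)
    lower, upper = stats.norm.interval(0.9, loc=mean, scale=stats.sem(key_counts))
    return mean - lower  # half-width of the 90% confidence interval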