def main(command_line_args=None):
    """
    Run the Synthetic Lethal analysis module.
    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(description="A package to process Synthetic Lethal Data.\n {0} v{1}"
                                     .format(__package__, __version__), formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    # Convert universal variables intended as boolean from string to boolean.
    args, options_parser = string_to_boolean(Tool_Box.options_file(parser))

    # Check file names and paths for errors.
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info("{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning"
             .format(__package__, __version__, Synthetic_Lethal.__version__))

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.TargetSearch:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****Völundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
def seg_count_file(self):
    """
    Parse the tab-delimited SegCopy file into a ploidy-tracking dictionary, a bin-tracking dictionary, and an array.
    :return:
    """
    # Tracking dictionary built here because the cell-label keys are available from the header line.
    prior_ploidy = {}
    bin_tracking_dict = Tool_Box.VivifiedDictionary()
    line_num = 0
    seg_copy_array = self.array_builder()
    seg_count = list(csv.reader(open(self.input_file), delimiter='\t'))

    for line in seg_count:
        if line_num > 0:
            bin_tracking_dict[line[0]][line_num] = (line[1], line[2])
        elif line_num == 0:
            # First line is the header.
            label_list = line
            for i in range(len(label_list)):
                if i > 2:
                    prior_ploidy[label_list[i]] = [-1, False, 0, 0, 0]
        line_num += 1

    if not eval(self.chrY):
        with suppress(KeyError):
            bin_tracking_dict.pop("chrY")

    return prior_ploidy, bin_tracking_dict, seg_copy_array
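# A minimal, stand-alone sketch (illustrative only, not part of the module) of the data shapes seg_count_file
# produces, assuming the SegCopy layout is CHR, START, END followed by one column per cell.
# collections.defaultdict stands in for Tool_Box.VivifiedDictionary.
import collections

seg_copy_text = "CHR\tSTART\tEND\tCell_1\tCell_2\nchr1\t1\t500000\t2\t2\nchrY\t1\t500000\t2\t1"
prior_ploidy = {}
bin_tracking_dict = collections.defaultdict(dict)

for line_num, line in enumerate(row.split("\t") for row in seg_copy_text.splitlines()):
    if line_num == 0:
        # Header row: every column after CHR/START/END is a cell label.
        prior_ploidy = {label: [-1, False, 0, 0, 0] for label in line[3:]}
    else:
        # Data rows are keyed by chromosome, then row number, holding the bin coordinates.
        bin_tracking_dict[line[0]][line_num] = (line[1], line[2])

# prior_ploidy      -> {'Cell_1': [-1, False, 0, 0, 0], 'Cell_2': [-1, False, 0, 0, 0]}
# bin_tracking_dict -> {'chr1': {1: ('1', '500000')}, 'chrY': {2: ('1', '500000')}}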
def main(command_line_args=None):
    """
    Run FASTQ preprocessing.
    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")

    parser = argparse.ArgumentParser(description="A little ditty to manipulate FASTQ files.\n {0} v{1}"
                                     .format(__package__, __version__),
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()
    # args, options_parser = string_to_boolean(args, options_parser)

    options_parser.set_defaults(Trim5=0)
    options_parser.set_defaults(Trim3=0)
    options_parser.set_defaults(Minimum_Length=100)
    options_parser.set_defaults(N_Limit=100)
    options_parser.set_defaults(HaloPLEX=False)
    options_parser.set_defaults(ThruPLEX=False)
    options_parser.set_defaults(FASTQ_PreProcess=True)
    args = options_parser.parse_args()

    # Check options file for errors.
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = ""

    # Initialize a generator for each FASTQ file.
    fastq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)
    fastq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
    index1 = FASTQ_Tools.FASTQ_Reader(args.Index1, log)
    index2 = FASTQ_Tools.FASTQ_Reader(args.Index2, log)

    splitter_data = FASTQ_Tools.FastqSplitter(args, log, fastq1, fastq2, index1, index2, paired_end=True)
    new_fastq1, new_fastq2 = splitter_data.file_writer()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****FASTQ Preprocessing {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
def data_processing(self):
    self._log.info("Begin Family Size and UMT Analysis")
    umt_stats_outstring = "UMT\tCount"
    family_size_outstring = "Family\tCount"

    for index_key in self._family_data:
        count_list = []
        # Tally reads per UMT, largest families first.
        for k, v in sorted(Counter(self._family_data[index_key]).items(), key=lambda x: x[1], reverse=True):
            umt_stats_outstring += "\n{0}\t{1}".format(k, v)
            count_list.append(str(v))

        # Tally how many UMT families there are of each size.
        c = dict(Counter(count_list))
        for k in natsort.natsorted(c):
            family_size_outstring += "\n{0}\t{1}".format(k, c[k])

    stats_filename = "{0}{1}_UMT_Stats.txt".format(self._args.Working_Folder, self._data_source)
    size_filename = "{0}{1}_Family_Size.txt".format(self._args.Working_Folder, self._data_source)

    # Deleting the files if they exist prevents a random "text file busy" OSError seen with VBox on Windows.
    Tool_Box.delete([stats_filename, size_filename])

    umt_stats_file = open(stats_filename, 'w')
    family_size_file = open(size_filename, "w")

    umt_stats_file.write(umt_stats_outstring)
    family_size_file.write(family_size_outstring)
    umt_stats_file.close()
    family_size_file.close()

    self._log.info("{0} {1} UMT Family Size and Stats Files Written".format(self._args.Job_Name, self._data_source))
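# Illustrative stand-alone sketch (values invented) of the two Counter passes above: reads are first tallied
# per UMT, then the per-UMT counts are tallied again to get the family-size distribution.
from collections import Counter
import natsort

umts = ["AAAA", "AAAA", "AAAA", "CCCC", "CCCC", "GGGG"]        # hypothetical per-read UMTs
per_umt = Counter(umts)                                         # {'AAAA': 3, 'CCCC': 2, 'GGGG': 1}
family_sizes = Counter(str(v) for v in per_umt.values())        # {'3': 1, '2': 1, '1': 1}

for size in natsort.natsorted(family_sizes):
    print("{}\t{}".format(size, family_sizes[size]))            # prints: 1 -> 1, 2 -> 1, 3 -> 1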
def string_to_boolean(parser):
    """
    Converts strings to boolean.  Done to keep the eval() function out of the code.
    :param parser:
    :return:
    """
    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    options_parser.set_defaults(PairedEnd=bool(strtobool(args.PairedEnd)))
    options_parser.set_defaults(Build_PhiX_DataFrame=bool(strtobool(args.Build_PhiX_DataFrame)))

    args = options_parser.parse_args()

    return args, options_parser
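# The set_defaults/re-parse pattern used above can be shown without the project's Tool_Box.options_file()
# wrapper.  This stand-alone sketch is illustrative only; names are hypothetical.
import argparse
from distutils.util import strtobool

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('--PairedEnd', default="True")              # value arrives as a string
demo_args = demo_parser.parse_args([])                                # no command line; defaults only

# Replace the string default with a real boolean, then re-parse so the namespace reflects it.
demo_parser.set_defaults(PairedEnd=bool(strtobool(demo_args.PairedEnd)))
demo_args = demo_parser.parse_args([])
assert demo_args.PairedEnd is True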
def main():
    """
    Run the Völundr Synthetic Lethal analysis modules.
    """
    VersionDependencies.python_check()

    parser = argparse.ArgumentParser(description="A package to process Synthetic Lethal Data.\n {0} v{1}"
                                     .format(__package__, __version__), formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    # Convert strings to int, float, or boolean and check file names and paths for errors.
    args, log = error_checking(parser)

    start_time = time.time()

    # Initialize the program.
    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.TargetSearch:
        module_name = "Target Search"
        log.info("{} v{}; Module: {} v{} Beginning"
                 .format(__package__, __version__, module_name, Synthetic_Lethal.__version__))
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        module_name = "Statistical Analysis"
        log.info("{} v{}; Module: {} v{} Beginning"
                 .format(__package__, __version__, module_name, Synthetic_Lethal.__version__))
        synthetic_lethal.statistics()
    else:
        module_name = "No module selected"
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****Völundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
def string_to_boolean(parser):
    """
    Converts strings to boolean.  Done to keep the eval() function out of the code.
    :param parser:
    :return:
    """
    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    if args.IndelProcessing == "True":
        # Tool_Box.debug_messenger("Pear set to FALSE.")
        options_parser.set_defaults(PEAR=True)

    options_parser.set_defaults(Demultiplex=bool(strtobool(args.Demultiplex)))
    options_parser.set_defaults(OutputRawData=bool(strtobool(args.OutputRawData)))
    options_parser.set_defaults(DeleteConsensusFASTQ=bool(strtobool(args.DeleteConsensusFASTQ)))
    options_parser.set_defaults(IndelProcessing=bool(strtobool(args.IndelProcessing)))
    options_parser.set_defaults(Verbose=args.Verbose.upper())

    return options_parser.parse_args()
def file_writer(self):
    """
    Process FASTQ file(s) and write new version(s) suitable for aligners.  Return the new file names.
    :return:
    """
    self.log.info("Begin writing temporary FASTQ files.")
    current_read_count = 0

    file1 = "{0}{1}_R1_processed.fastq.gz".format(self.args.WorkingFolder, self.args.Job_Name)
    file2 = "{0}{1}_R2_processed.fastq.gz".format(self.args.WorkingFolder, self.args.Job_Name)
    temp_file1 = Writer(self.log, file1)
    temp_file2 = Writer(self.log, file2)
    self.log.info("Writing {0} and {1}".format(file1, file2))

    fastq1_list = []
    fastq2_list = []
    eof = False
    Read = collections.namedtuple('Read', 'name, seq, index, qual')

    # This generator returns read objects, not lines.
    while not eof:
        try:
            fastq1_read = next(self.fastq1_file.seq_read())
            fastq2_read = next(self.fastq2_file.seq_read())
            if self.index1_file is not None:
                index1_read = next(self.index1_file.seq_read())
            if self.index2_file is not None:
                index2_read = next(self.index2_file.seq_read())
        except StopIteration:
            eof = True
            continue

        current_read_count += 1

        # Apply filters.
        trim_5 = int(self.args.Trim5)
        trim_3 = int(self.args.Trim3)
        min_length = int(self.args.Minimum_Length) + trim_5 + trim_3

        # Filter reads based on length and fraction of N's.
        if (len(fastq1_read.seq) < min_length or len(fastq2_read.seq) < min_length
                or fastq1_read.seq.count("N") / len(fastq1_read.seq) >= float(self.args.N_Limit)
                or fastq2_read.seq.count("N") / len(fastq2_read.seq) >= float(self.args.N_Limit)):
            continue

        # Add the UMT's to the header.
        if self.args.HaloPLEX:
            header1 = "{0}|{0}:{1}".format(index1_read.seq, fastq1_read.name)
            header2 = "{0}|{0}:{1}".format(index1_read.seq, fastq2_read.name)

            # Fixme: This needs to be exposed to the user.
            # Short HaloPLEX reads have issues.  Found that reads <= 100 all show a 3' -1 or -2 error.
            if len(fastq1_read.seq) <= 100:
                read_trim(fastq1_read, trim5=0, trim3=3)
            if len(fastq2_read.seq) <= 100:
                read_trim(fastq2_read, trim5=0, trim3=3)

        elif self.args.ThruPLEX:
            # header1 = "{0}|{1}".format(fastq1_read.name.split(":")[-1], fastq1_read.name)
            umt1 = fastq1_read.seq[:6]
            umt2 = fastq2_read.seq[:6]
            header1 = "{0}|{1}:{2}".format(umt1, umt2, fastq1_read.name)
            header2 = "{0}|{1}:{2}".format(umt1, umt2, fastq2_read.name)
            read_trim(fastq1_read, trim5=len(umt1), trim3=0)
            read_trim(fastq2_read, trim5=len(umt2), trim3=0)

        elif self.args.FASTQ_PreProcess:
            # The indices go after the last ":" in the header.
            header1 = "{}:{}+{}".format(fastq1_read.name, index1_read.seq, index2_read.seq)
            header2 = "{}:{}+{}".format(fastq2_read.name, index1_read.seq, index2_read.seq)

        else:
            self.log.error("Only HaloPLEX, ThruPLEX, or FASTQ_PreProcess currently enabled.")
            raise SystemExit(1)

        # Trim sequences from the ends if needed.
        if trim_5 > 0 or trim_3 > 0:
            read_trim(fastq1_read, trim_5, trim_3)
            read_trim(fastq2_read, trim_5, trim_3)

        fastq1_read.name = header1
        fastq2_read.name = header2

        fastq1_list.append(Read(fastq1_read.name, fastq1_read.seq, fastq1_read.index, fastq1_read.qual))
        fastq2_list.append(Read(fastq2_read.name, fastq2_read.seq, fastq2_read.index, fastq2_read.qual))

        # Block size empirically determined for the UNC Longleaf cluster.  May need to expose this to the user.
        # Writing blocks of data to disk speeds up the entire process.
        if current_read_count % 1000000 == 0:
            temp_file1.write(fastq1_list)
            temp_file2.write(fastq2_list)
            fastq1_list.clear()
            fastq2_list.clear()

    # Flush any remaining reads and close the files.
    if fastq1_list:
        temp_file1.write(fastq1_list)
        fastq1_list.clear()
    if fastq2_list:
        temp_file2.write(fastq2_list)
        fastq2_list.clear()
    if temp_file1:
        temp_file1.close()
    if temp_file2:
        temp_file2.close()

    self.log.info("Modified FASTQ file(s) written")

    if self.args.FASTQ_PreProcess:
        Tool_Box.compress_files(file1, self.log)
        Tool_Box.compress_files(file2, self.log)

    return file1, file2
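# A small worked example (values invented) of the length and N-fraction filters applied above: the minimum
# length is padded by both trim amounts so reads are still long enough after trimming.
seq = "ACGTNNACGTACGT"                                   # 14 nt, two N's
trim_5, trim_3 = 2, 1
minimum_length, n_limit = 8, 0.2

min_length = minimum_length + trim_5 + trim_3            # 11
n_fraction = seq.count("N") / len(seq)                   # ~0.14

keep = len(seq) >= min_length and n_fraction < n_limit   # True: long enough and below the N cap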
def main(command_line_args=None):
    """
    Run the Synthetic Lethal analysis modules.
    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(description="A package to process Synthetic Lethal Data.\n {0} v{1}"
                                     .format(__package__, __version__), formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    # If we are doing statistical analysis the user will not input an Index_Mismatch value.
    if not getattr(args, "Index_Mismatch", False):
        options_parser.add_argument("--Index_Mismatch", dest="Index_Mismatch", default=0)
        options_parser.add_argument("--Analyze_Unknowns", dest="Analyze_Unknowns", default="False")
        args = options_parser.parse_args()

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = "Synthetic_Lethal"
    log.info("{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning"
             .format(__package__, __version__, Synthetic_Lethal.__version__))

    # Convert universal variables intended as boolean from string to boolean.
    # ToDo: Should be a cleaner method to do this.
    if args.Target_Search == "True":
        options_parser.set_defaults(Target_Search=True)

        if args.RevComp == "True":
            options_parser.set_defaults(RevComp=True)
        else:
            options_parser.set_defaults(RevComp=False)

        if args.Delete_Demultiplexed_FASTQ == "True":
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=True)
        else:
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=False)

        if args.compress == "True":
            options_parser.set_defaults(compress=True)
        else:
            options_parser.set_defaults(compress=False)
    else:
        options_parser.set_defaults(Target_Search=False)

    if args.Statistics == "True":
        options_parser.set_defaults(Statistics=True)
    else:
        options_parser.set_defaults(Statistics=False)

    if args.Analyze_Unknowns == "True":
        options_parser.set_defaults(Analyze_Unknowns=True)
    else:
        options_parser.set_defaults(Analyze_Unknowns=False)

    args = options_parser.parse_args()
    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    # Add some parameters to our options parser object.
    args = options_parser.parse_args()

    if args.Target_Search:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****Volundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
def pear_consensus(args, log):
    """
    This will take the input FASTQ files and use PEAR to generate a consensus file.
    :param args:
    :param log:
    :return:
    """
    log.info("Beginning PEAR Consensus")

    fastq_consensus_prefix = "{}{}".format(args.WorkingFolder, args.Job_Name)
    fastq_consensus_file = "{}.assembled.fastq".format(fastq_consensus_prefix)
    discarded_fastq = "{}.discarded.fastq".format(fastq_consensus_prefix)
    r1_unassembled = "{}.unassembled.forward.fastq".format(fastq_consensus_prefix)
    r2_unassembled = "{}.unassembled.reverse.fastq".format(fastq_consensus_prefix)

    # Assemble optional PEAR flags only when the corresponding parameter is set.
    y = "-y {} ".format(args.Memory)
    j = "-j {} ".format(int(args.Spawn) - 1)

    p_value = ''
    if args.PValue:
        p_value = "-p {} ".format(args.PValue)

    min_overlap = ''
    if args.MinOverlap:
        min_overlap = "-v {} ".format(args.MinOverlap)

    quality_threshold = ""
    if args.QualityThreshold:
        quality_threshold = "-q {} ".format(args.QualityThreshold)

    phred_value = ""
    if args.PhredValue:
        phred_value = "-b {} ".format(args.PhredValue)

    test_method = ""
    if args.TestMethod:
        test_method = "-g {}".format(args.TestMethod)

    n = ""
    if args.MinConsensusLength:
        n = "-n {} ".format(args.MinConsensusLength)

    proc = subprocess.run(
        "{}{}Pear{}bin{}./pear -f {} -r {} -o {} {}{}{}{}{}{}{}"
        .format(pathlib.Path(__file__).parent.absolute(), os.sep, os.sep, os.sep, args.FASTQ1, args.FASTQ2,
                fastq_consensus_prefix, y, j, n, p_value, min_overlap, quality_threshold, phred_value, test_method),
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

    if proc.stderr:
        log.error("{}\n{}\n".format(proc.stderr.decode(), proc.stdout.decode()))
        return
    else:
        log.info("Begin PEAR Output\n"
                 "----------------------------------------------------------------------------------------------------------\n{}"
                 "\n----------------------------------------------------------------------------------------------------------\n"
                 .format(proc.stdout.decode()))

    file_list = [fastq_consensus_file, r1_unassembled, r2_unassembled]

    if os.stat(discarded_fastq).st_size > 0:
        file_list.append(discarded_fastq)
    else:
        Tool_Box.delete([discarded_fastq])

    return file_list
def consensus_demultiplex(self):
    """
    Takes a FASTQ file of consensus reads and identifies each by index.  Handles writing demultiplexed FASTQ
    if user desired.
    """
    self.log.info("Consensus Index Search")
    eof = False
    start_time = time.time()
    split_time = time.time()
    fastq_file_name_list = []
    fastq_data_dict = collections.defaultdict(lambda: collections.defaultdict(list))
    indexed_read_count = 0
    key_counts = []

    while not eof:
        # Debugging code block.
        if self.args.Verbose == "DEBUG":
            read_limit = 1000000
            if self.read_count > read_limit:
                if self.args.Demultiplex:
                    for index_name in fastq_data_dict:
                        r1_data = fastq_data_dict[index_name]["R1"]
                        r1, r2 = self.fastq_outfile_dict[index_name]
                        r1.write(r1_data)
                        r1.close()
                        if not self.args.PEAR:
                            r2_data = fastq_data_dict[index_name]["R2"]
                            r2.write(r2_data)
                            r2.close()
                Tool_Box.debug_messenger("Limiting Reads Here to {}".format(read_limit))
                eof = True

        fastq2_read = None
        try:
            fastq1_read = next(self.fastq1.seq_read())
            if not self.args.PEAR:
                fastq2_read = next(self.fastq2.seq_read())
        except StopIteration:
            if self.args.Demultiplex:
                for index_name in fastq_data_dict:
                    r1_data = fastq_data_dict[index_name]["R1"]
                    r1, r2 = self.fastq_outfile_dict[index_name]
                    r1.write(r1_data)
                    r1.close()
                    if not self.args.PEAR:
                        r2_data = fastq_data_dict[index_name]["R2"]
                        r2.write(r2_data)
                        r2.close()
            eof = True
            continue

        self.read_count += 1
        if self.read_count % 100000 == 0:
            elapsed_time = int(time.time() - start_time)
            block_time = int(time.time() - split_time)
            split_time = time.time()
            self.log.info("Processed {} reads in {} seconds.  Total elapsed time: {} seconds."
                          .format(self.read_count, block_time, elapsed_time))

        # Match read with library index.
        match_found, left_seq, right_seq, index_name, fastq1_read, fastq2_read = \
            self.index_matching(fastq1_read, fastq2_read)

        if match_found:
            indexed_read_count += 1
            locus = self.index_dict[index_name][7]
            phase_key = "{}+{}".format(index_name, locus)
            r2_found = False
            r1_found = False

            if self.args.Platform == "Illumina":
                # Score the phasing and place the reads in a dictionary.
                for r2_phase, r1_phase in zip(self.phase_dict[locus]["R2"], self.phase_dict[locus]["R1"]):
                    r2_phase_name = r2_phase[1]
                    r1_phase_name = r1_phase[1]

                    # Tag reads that should not have any phasing.
                    if not r1_phase[0]:
                        self.phase_count[phase_key]["Phase " + r1_phase_name] = -1
                        self.phase_count[phase_key]["Phase " + r2_phase_name] = -1
                        continue
                    else:
                        self.phase_count[phase_key]["Phase " + r1_phase_name] += 0
                        self.phase_count[phase_key]["Phase " + r2_phase_name] += 0

                    # The phasing is the last N nucleotides of the consensus.
                    if r2_phase[0] == Sequence_Magic.rcomp(fastq1_read.seq[-len(r2_phase[0]):]) and not r2_found:
                        self.phase_count[phase_key]["Phase " + r2_phase_name] += 1
                        r2_found = True

                    if r1_phase[0] == fastq1_read.seq[:len(r1_phase[0])] and not r1_found:
                        self.phase_count[phase_key]["Phase " + r1_phase_name] += 1
                        r1_found = True

                # If no phasing is found then note that.
                if not r2_found:
                    self.phase_count[phase_key]["No Read 2 Phasing"] += 1
                if not r1_found:
                    self.phase_count[phase_key]["No Read 1 Phasing"] += 1

                # The adapters on Gupta Lab AAVS1.1 are reversed, causing the reads to be reversed.
                if locus == "AAVS1.1":
                    self.sequence_dict[index_name].append(fastq1_read.seq)
                else:
                    self.sequence_dict[index_name].append(fastq1_read.seq)

            elif self.args.Platform == "TruSeq":
                self.sequence_dict[index_name].append(right_seq)

            elif self.args.Platform == "Ramsden":
                self.sequence_dict[index_name].append(Sequence_Magic.rcomp(fastq1_read.seq))

            else:
                self.log.error("--Platform {} not correctly defined.  Edit parameter file and try again."
                               .format(self.args.Platform))
                raise SystemExit(1)

            if self.args.Demultiplex:
                fastq_data_dict[index_name]["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
                if not self.args.PEAR:
                    fastq_data_dict[index_name]["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual])
                fastq_file_name_list.append("{}{}_{}_Consensus.fastq"
                                            .format(self.args.WorkingFolder, self.args.Job_Name, index_name))

        elif self.args.Demultiplex and not match_found:
            fastq_data_dict['Unknown']["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
            fastq_data_dict['Unknown']["R2"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual])
            fastq_file_name_list.append("{}{}_Unknown_Consensus.fastq"
                                        .format(self.args.WorkingFolder, self.args.Job_Name))

    if self.args.Demultiplex:
        self.fastq_compress(list(set(fastq_file_name_list)))

    for key in self.sequence_dict:
        key_counts.append(len(self.sequence_dict[key]))

    # The lower limit is used when plotting the data.  Generally the lowest values are just noise.
    if len(key_counts) == 0:
        self.log.error("No Scar Patterns Found")
        raise SystemExit(1)

    lower, upper_limit = stats.norm.interval(0.9, loc=statistics.mean(key_counts), scale=stats.sem(key_counts))
    lower_limit = statistics.mean(key_counts) - lower

    return indexed_read_count, lower_limit
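# Hedged, stand-alone illustration (counts invented) of the plotting cutoff computed at the end of
# consensus_demultiplex: it is the distance from the mean down to the lower bound of a 90% normal confidence
# interval on the mean, i.e. the interval's half-width.
import statistics
from scipy import stats

key_counts = [120, 95, 130, 80, 110]                         # hypothetical reads per index
mean = statistics.mean(key_counts)
lower, upper = stats.norm.interval(0.9, loc=mean, scale=stats.sem(key_counts))
lower_limit = mean - lower                                    # half-width of the 90% CI, used as the noise cutoff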
def string_conversions(parser):
    """
    Convert True/False statements in the parameter file to booleans and numeric strings to int/float.
    :param parser:
    :return:
    """
    options_parser = Tool_Box.options_file(parser)
    initial_args = options_parser.parse_args()

    options_parser.set_defaults(TargetSearch=bool(strtobool(initial_args.TargetSearch)))
    options_parser.set_defaults(Statistics=bool(strtobool(initial_args.Statistics)))
    options_parser.set_defaults(Verbose=initial_args.Verbose.upper())

    if initial_args.Statistics == "False":
        options_parser.set_defaults(AnchorSeq=initial_args.AnchorSeq.upper())
        options_parser.set_defaults(Analyze_Unknowns=bool(strtobool(initial_args.Analyze_Unknowns)))
        options_parser.set_defaults(
            Delete_Demultiplexed_FASTQ=bool(strtobool(initial_args.Delete_Demultiplexed_FASTQ)))
        options_parser.set_defaults(RevComp=bool(strtobool(initial_args.RevComp)))
        options_parser.set_defaults(BatchSize=int(initial_args.BatchSize))
        options_parser.set_defaults(Target_Mismatch=int(initial_args.Target_Mismatch))
        options_parser.set_defaults(MinimumReadLength=int(initial_args.MinimumReadLength))
        options_parser.set_defaults(N_Limit=10)
        options_parser.set_defaults(Target_Length=int(initial_args.Target_Length))
        options_parser.set_defaults(Target_Start=int(initial_args.Target_Start))
        # options_parser.set_defaults(Index_Mismatch=int(initial_args.Index_Mismatch))
        options_parser.set_defaults(Spawn=int(initial_args.Spawn))
        options_parser.set_defaults(Target_Padding=int(initial_args.Target_Padding))
        options_parser.set_defaults(Expected_Position=int(initial_args.Expected_Position))
        options_parser.set_defaults(AnchorMismatch=int(initial_args.AnchorMismatch))
        options_parser.set_defaults(AnchorStart=int(initial_args.AnchorStart))
        options_parser.set_defaults(AnchorStop=int(initial_args.AnchorStop))
    else:
        options_parser.set_defaults(
            Write_TDnorm_Log2_sgRNA_Control_File=bool(strtobool(initial_args.Write_TDnorm_Log2_sgRNA_Control_File)))
        options_parser.set_defaults(
            Write_TDnorm_Log2_sgRNA_Sample_File=bool(strtobool(initial_args.Write_TDnorm_Log2_sgRNA_Sample_File)))
        options_parser.set_defaults(Write_Log2_sgRNA_File=bool(strtobool(initial_args.Write_Log2_sgRNA_File)))
        options_parser.set_defaults(
            Write_Permuted_Log2_Data_File=bool(strtobool(initial_args.Write_Permuted_Log2_Data_File)))
        options_parser.set_defaults(Bad_sgRNA_Lower_Percentile=float(initial_args.Bad_sgRNA_Lower_Percentile))
        options_parser.set_defaults(Bad_sgRNA_Upper_Percentile=float(initial_args.Bad_sgRNA_Upper_Percentile))
        options_parser.set_defaults(UpperPercentile=float(initial_args.UpperPercentile))
        options_parser.set_defaults(LowerPercentile=float(initial_args.LowerPercentile))
        options_parser.set_defaults(PermutationCount=int(initial_args.PermutationCount))
        options_parser.set_defaults(Alpha=float(initial_args.Alpha))
        options_parser.set_defaults(Target_Mismatch=float(initial_args.Target_Mismatch))
        options_parser.set_defaults(UpperGuideLimit=float(initial_args.UpperGuideLimit))
        options_parser.set_defaults(LowerGuideLimit=float(initial_args.LowerGuideLimit))

    initial_args = options_parser.parse_args()

    return initial_args
def error_checking(parser):
    """
    Check the parameter file for errors and return an args object and log.
    :param parser:
    :return:
    """

    def string_conversions(parser):
        """
        Convert True/False statements in the parameter file to booleans and numeric strings to int/float.
        :param parser:
        :return:
        """
        options_parser = Tool_Box.options_file(parser)
        initial_args = options_parser.parse_args()

        options_parser.set_defaults(TargetSearch=bool(strtobool(initial_args.TargetSearch)))
        options_parser.set_defaults(Statistics=bool(strtobool(initial_args.Statistics)))
        options_parser.set_defaults(Verbose=initial_args.Verbose.upper())

        if initial_args.Statistics == "False":
            options_parser.set_defaults(AnchorSeq=initial_args.AnchorSeq.upper())
            options_parser.set_defaults(Analyze_Unknowns=bool(strtobool(initial_args.Analyze_Unknowns)))
            options_parser.set_defaults(
                Delete_Demultiplexed_FASTQ=bool(strtobool(initial_args.Delete_Demultiplexed_FASTQ)))
            options_parser.set_defaults(RevComp=bool(strtobool(initial_args.RevComp)))
            options_parser.set_defaults(BatchSize=int(initial_args.BatchSize))
            options_parser.set_defaults(Target_Mismatch=int(initial_args.Target_Mismatch))
            options_parser.set_defaults(MinimumReadLength=int(initial_args.MinimumReadLength))
            options_parser.set_defaults(N_Limit=10)
            options_parser.set_defaults(Target_Length=int(initial_args.Target_Length))
            options_parser.set_defaults(Target_Start=int(initial_args.Target_Start))
            # options_parser.set_defaults(Index_Mismatch=int(initial_args.Index_Mismatch))
            options_parser.set_defaults(Spawn=int(initial_args.Spawn))
            options_parser.set_defaults(Target_Padding=int(initial_args.Target_Padding))
            options_parser.set_defaults(Expected_Position=int(initial_args.Expected_Position))
            options_parser.set_defaults(AnchorMismatch=int(initial_args.AnchorMismatch))
            options_parser.set_defaults(AnchorStart=int(initial_args.AnchorStart))
            options_parser.set_defaults(AnchorStop=int(initial_args.AnchorStop))
        else:
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Control_File=bool(
                    strtobool(initial_args.Write_TDnorm_Log2_sgRNA_Control_File)))
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Sample_File=bool(
                    strtobool(initial_args.Write_TDnorm_Log2_sgRNA_Sample_File)))
            options_parser.set_defaults(Write_Log2_sgRNA_File=bool(strtobool(initial_args.Write_Log2_sgRNA_File)))
            options_parser.set_defaults(
                Write_Permuted_Log2_Data_File=bool(strtobool(initial_args.Write_Permuted_Log2_Data_File)))
            options_parser.set_defaults(Bad_sgRNA_Lower_Percentile=float(initial_args.Bad_sgRNA_Lower_Percentile))
            options_parser.set_defaults(Bad_sgRNA_Upper_Percentile=float(initial_args.Bad_sgRNA_Upper_Percentile))
            options_parser.set_defaults(UpperPercentile=float(initial_args.UpperPercentile))
            options_parser.set_defaults(LowerPercentile=float(initial_args.LowerPercentile))
            options_parser.set_defaults(PermutationCount=int(initial_args.PermutationCount))
            options_parser.set_defaults(Alpha=float(initial_args.Alpha))
            options_parser.set_defaults(Target_Mismatch=float(initial_args.Target_Mismatch))
            options_parser.set_defaults(UpperGuideLimit=float(initial_args.UpperGuideLimit))
            options_parser.set_defaults(LowerGuideLimit=float(initial_args.LowerGuideLimit))

        initial_args = options_parser.parse_args()

        return initial_args

    args = string_conversions(parser)
    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, sys.argv)

    if not pathlib.Path(args.WorkingFolder).exists():
        print("\033[1;31mERROR:\n\tWorking Folder Path: {} Not Found.  Check Parameter File."
              .format(args.WorkingFolder))
        raise SystemExit(1)

    if args.Statistics:
        if not pathlib.Path(args.DataFiles).exists():
            print("\033[1;31mERROR:\n\t--DataFiles Folder Path: {} Not Found.  Check Parameter File."
                  .format(args.DataFiles))
            raise SystemExit(1)

        if not pathlib.Path(args.SampleManifest).exists():
            print("\033[1;31mERROR:\n\t--SampleManifest: {} Not Found.  Check Parameter File."
                  .format(args.SampleManifest))
            raise SystemExit(1)

    if not pathlib.Path(args.Master_Index_File).exists():
        print("\033[1;31mERROR:\n\t--Master_Index_File: {} Not Found.  Check Parameter File."
              .format(args.Master_Index_File))
        raise SystemExit(1)

    if not pathlib.Path(args.Target_File).exists():
        print("\033[1;31mERROR:\n\t--Target_File: {} Not Found.  Check Parameter File."
              .format(args.Target_File))
        raise SystemExit(1)

    if args.TargetSearch:
        if getattr(args, "FASTQ1", False) and not pathlib.Path(args.FASTQ1).exists():
            print("\033[1;31mERROR:\n\t--FASTQ1: {} Not Found.  Check Parameter File."
                  .format(args.FASTQ1))
            raise SystemExit(1)

        try:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True).decode()
        except AttributeError:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True)

        if "text" in mime_type1 or "gzip" in mime_type1:
            pass
        else:
            log.error("Unsupported FASTQ file-type.  Only TEXT or GZIP Allowed.")
            raise SystemExit(1)

    return args, log
        plot_data_dict[data_pair[6]][5].append(data_pair[4])
        plot_data_dict[data_pair[6]][6].append(data_pair[5])
        plot_data_dict[data_pair[6]][7].append(color_dict[data_pair[6]])

        count = len(plot_data_dict[data_pair[6]][0])
        if count > 1:
            previous = plot_data_dict[data_pair[6]][0][count - 2]
            plot_data_dict[data_pair[6]][8]\
                .append(plot_data_dict[data_pair[6]][8][count - 2] + 0.0007 + (0.5 * previous) + data_pair[0] * 0.5)

    return plot_data_dict


# This is here to run the module as a stand-alone.
if __name__ == '__main__':
    ToolBox.debug_messenger("Standing Alone")

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = ToolBox.options_file(parser)
    args = options_parser.parse_args()

    scarmapperplot(args)
def quality_check(data_bundle, fastq_files):
    """
    Called by the multiprocessor pool.  Examines the indices and determines the mismatches and N counts.
    :param data_bundle:
    :param fastq_files:
    :return:
    """
    prog_check = data_bundle[0]
    index_list = data_bundle[1]
    file1_anchor_seq = data_bundle[2]
    file2_anchor_seq = data_bundle[3]
    fastq1 = FASTQ_Reader(fastq_files[0])
    fastq2 = FASTQ_Reader(fastq_files[1])
    umt_dict = collections.defaultdict(lambda: collections.defaultdict(int))
    anchor_dict = Tool_Box.VivifiedDictionary()
    read_count = 0

    try:
        while True:
            fastq1_read = next(fastq1.seq_read())
            fastq2_read = next(fastq2.seq_read())
            read_count += 1

            if read_count % int(prog_check) == 0:
                print(" -->Processed {0} reads in file {1} and {2}."
                      .format(read_count, fastq_files[0], fastq_files[1]))

            # Get read index and UMT.
            umt = "{0}{1}".format(fastq1_read.name.split("|")[0], fastq2_read.name.split("|")[1].split(":")[0])
            read_index = fastq1_read.name.split(":")[-1]

            # Quantify anchor lengths.
            unknown_anchor1 = fastq1_read.seq[7:18]
            unknown_anchor2 = fastq2_read.seq[7:18]
            match1 = Levenshtein.distance(file1_anchor_seq, unknown_anchor1)
            match2 = Levenshtein.distance(file2_anchor_seq, unknown_anchor2)

            for index in index_list:
                index_match = Levenshtein.distance(read_index, index[0][:6])

                # Add anchor and UMT data to dictionaries.
                if index[0] in anchor_dict and index_match < 2:
                    anchor_dict[index[0]]["R1"][match1] += 1
                    anchor_dict[index[0]]["R2"][match2] += 1
                    umt_dict[index[0]][umt] += 1
                    # if umt in umt_dict[index[0]]:
                    #     umt_dict[index[0]][umt] += 1
                    # else:
                    #     umt_dict[index[0]][umt] = 1
                elif index_match < 2:
                    anchor_dict[index[0]]["R1"] = [0] * len(file1_anchor_seq)
                    anchor_dict[index[0]]["R2"] = [0] * len(file2_anchor_seq)
                    anchor_dict[index[0]]["R1"][match1] += 1
                    anchor_dict[index[0]]["R2"][match2] += 1
                    umt_dict[index[0]][umt] = 1

    except StopIteration:
        return anchor_dict, umt_dict
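# The index_match < 2 test above accepts at most one edit between the read's index and the first six bases of
# a library index.  A minimal stand-alone illustration using python-Levenshtein (sequences invented).
import Levenshtein

read_index = "ACGTTA"
library_index = "ACGTTTCC"                                  # only the first 6 bases are compared

index_match = Levenshtein.distance(read_index, library_index[:6])
assert index_match < 2                                       # one mismatch -> still counted for this index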
def temp_file_writer(self, limit):
    """
    Write the temporary FASTQ files.  Also create a list of temporary BAM file names for use later.
    :return:
    """
    self.log.info("Begin writing temporary FASTQ files.")
    i = 0
    temp_file1 = None
    temp_file2 = None
    fastq_file_list = []
    bam_file_list = []
    read_count = 0
    limit_counter = 0

    while read_count <= self.read_count:
        try:
            # This generator is returning actual reads, not lines.
            fastq1_read = next(self.fastq1_file.seq_read())
            fastq2_read = next(self.fastq2_file.seq_read())
            if self.index1_file is not None:
                fastq3_read = next(self.index1_file.seq_read())
        except StopIteration:
            read_count += 1
            continue

        read_count += 1

        try:
            fastq1_n_frac = fastq1_read.seq.count("N") / len(fastq1_read.seq)
            fastq2_n_frac = fastq2_read.seq.count("N") / len(fastq2_read.seq)
        except ZeroDivisionError:
            continue

        # Apply filters.
        if (len(fastq1_read.seq) < int(self.args.Minimum_Length)
                or len(fastq2_read.seq) < int(self.args.Minimum_Length)
                or fastq1_n_frac >= float(self.args.N_Limit) or fastq2_n_frac >= float(self.args.N_Limit)):
            continue

        # Start a new pair of temporary files every `limit` reads.
        if limit_counter % limit == 0:
            if temp_file1:
                temp_file1.close()
                limit_counter = 0
            if temp_file2:
                temp_file2.close()

            file1 = "{0}{1}_R1_tmp_{2}.fastq.gz".format(self.args.WorkingFolder, self.args.Job_Name, i)
            file2 = "{0}{1}_R2_tmp_{2}.fastq.gz".format(self.args.WorkingFolder, self.args.Job_Name, i)
            bam_file_list.append("{0}{1}_R1_tmp_{2}.bam".format(self.args.WorkingFolder, self.args.Job_Name, i))
            fastq_file_list.append((file1, file2))
            temp_file1 = Writer(self.log, file1)
            temp_file2 = Writer(self.log, file2)
            self.log.info("Writing {0} and {1}".format(file1, file2))
            i += 1

        limit_counter += 1

        # BAM files are missing the barcodes because of a space in some of the header files.
        # fastq1_read.name = fastq1_read.name.replace(" ", ":")
        # fastq2_read.name = fastq2_read.name.replace(" ", ":")

        # Add the UMT's to the header.
        if self.args.HaloPLEX:
            umi = fastq3_read.seq
            header1 = "{0}|{1}:{2}".format(fastq1_read.name.split(":")[-1], umi, fastq1_read.name)
            header2 = "{0}|{1}:{2}".format(fastq2_read.name.split(":")[-1], umi, fastq2_read.name)
        elif self.args.ThruPLEX:
            # header1 = "{0}|{1}".format(fastq1_read.name.split(":")[-1], fastq1_read.name)
            umt1 = fastq1_read.seq[:6]
            umt2 = fastq2_read.seq[:6]
            header1 = "{0}|{1}:{2}".format(umt1, umt2, fastq1_read.name)
            header2 = "{0}|{1}:{2}".format(umt1, umt2, fastq2_read.name)
        else:
            Tool_Box.debug_messenger("Only HaloPLEX or ThruPLEX currently enabled.")
            self.log.error("Only HaloPLEX or ThruPLEX currently enabled.")
            raise SystemExit(1)

        # Trim adapter sequences from the 5' end if needed.
        if int(self.args.trim) > 0:
            fastq1_read.seq = fastq1_read.seq[int(self.args.trim):]
            fastq1_read.qual = fastq1_read.qual[int(self.args.trim):]
            fastq2_read.seq = fastq2_read.seq[int(self.args.trim):]
            fastq2_read.qual = fastq2_read.qual[int(self.args.trim):]

        fastq1_read.name = header1
        fastq2_read.name = header2

        temp_file1.write(self.fastq1_file)
        temp_file2.write(self.fastq2_file)

    if temp_file1:
        temp_file1.close()
    if temp_file2:
        temp_file2.close()

    self.log.info("All temporary FASTQ files written")

    return fastq_file_list, bam_file_list
def main(command_line_args=None):
    """
    Let's get this party started.
    :param command_line_args:
    """
    start_time = time.time()
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%H:%M:%S %Y %a %b %d")

    parser = argparse.ArgumentParser(description="A package to map genomic repair scars at defined loci.\n {} v{}"
                                     .format(__package__, __version__),
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    # Check options file for errors and return the args object.
    args = error_checking(string_to_boolean(parser))

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)

    module_name = ""
    log.info("{} v{}".format(__package__, __version__))

    if args.IndelProcessing:
        file_list = []
        if args.Platform == "Illumina" or args.Platform == "Ramsden" or args.Platform == "TruSeq":
            log.info("Sending FASTQ files to FASTQ preprocessor.")

            if args.PEAR:
                file_list = pear_consensus(args, log)
                if not file_list:
                    log.error("PEAR failed.  Check logs.")
                    raise SystemExit(1)
                fastq_consensus = file_list[0]
                fq1 = FASTQ_Tools.FASTQ_Reader(fastq_consensus, log)
                fq2 = None
            else:
                fq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
                fq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)

            sample_manifest = Tool_Box.FileParser.indices(log, args.SampleManifest)
            indel_processing = \
                Indel_Processing.DataProcessing(log, args, run_start, __version__,
                                                Target_Mapper.TargetMapper(log, args, sample_manifest), fq1, fq2)
            indel_processing.main_loop()

            # Compress or delete PEAR files.
            if args.PEAR and file_list:
                if args.DeleteConsensusFASTQ:
                    log.info("Deleting PEAR FASTQ Files.")
                    Tool_Box.delete(file_list)
                else:
                    log.info("Compressing {} FASTQ Files Generated by PEAR.".format(len(file_list)))
                    p = pathos.multiprocessing.Pool(int(args.Spawn))
                    p.starmap(Tool_Box.compress_files, zip(file_list, itertools.repeat(log)))
        else:
            log.error("Only 'Illumina', 'TruSeq' or 'Ramsden' --Platform methods currently allowed.")
            raise SystemExit(1)

    elif not args.IndelProcessing:
        # Run the frequency file Combine module.
        run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
        log.info("Process Replicates.")
        data_dict = collections.defaultdict(list)
        file_list = [f for f in glob.glob("{}*ScarMapper_Frequency.txt".format(args.DataFiles))]
        file_count = len(file_list)

        page_header = "# ScarMapper File Merge v{}\n# Run: {}\n# Sample Name: {}\n" \
            .format(__version__, run_start, args.SampleName)
        line_num = 0
        index_file = list(csv.reader(open(file_list[0]), delimiter='\t'))
        for line in index_file:
            if not line:
                break
            elif line_num > 3:
                page_header += "{}\n".format(line[0])
            line_num += 1
        page_header += "\n\n"

        for file_name in file_list:
            freq_file_data = Tool_Box.FileParser.indices(log, file_name)

            for row in freq_file_data:
                key = "{}|{}|{}|{}".format(row[3], row[4], row[6], row[8])
                row_data = row[2:]

                if key in data_dict:
                    data_dict[key][0].append(float(row[1]))
                else:
                    data_dict[key] = [[float(row[1])], row_data]

        # Process data and write the combined frequency results file.
        plot_data_dict = collections.defaultdict(list)
        label_dict = collections.defaultdict(float)
        output_data_dict = collections.defaultdict(list)
        marker_list = []

        for key, row_list in data_dict.items():
            # Require the pattern to be present in at least half of the files.
            if len(row_list[0]) / file_count >= 0.5:
                row_string = "\t".join(row_list[1])
                freq = gmean(row_list[0])
                sem = stats.sem(row_list[0])
                freq_results_outstring = "{}\t{}\t{}\n".format(freq, sem, row_string)

                output_key = freq
                # Freq is a 17 digit float so it is very unlikely to be duplicated, but if it is, this
                # increments it by a small number then checks the uniqueness again.
                if output_key in output_data_dict:
                    output_key = output_key + 1e-16
                    if output_key in output_data_dict:
                        output_key = output_key + 1e-16

                scar_type = row_list[1][0]
                label_dict[scar_type] += freq

                # Gather up our data for plotting.
                lft_del = int(row_list[1][1])
                rt_del = int(row_list[1][2])
                mh_size = int(row_list[1][5])
                ins_size = int(row_list[1][7])

                output_data_dict[output_key] = \
                    [(freq, lft_del, rt_del, mh_size, ins_size, scar_type), freq_results_outstring]

        freq_results_outstring = \
            "{}# Frequency\tSEM\tScar Type\tLeft Deletions\tRight Deletions\tDeletion Size\tMicrohomology\t" \
            "Microhomology Size\tInsertion\tInsertion Size\tLeft Template\tRight Template\tConsensus Left Junction\t" \
            "Consensus Right Junction\tTarget Left Junction\tTarget Right Junction\tConsensus\tTarget Region\n" \
            .format(page_header)

        # Now draw a pretty graph of the data if we are not dealing with a negative control.
        for k in natsort.natsorted(output_data_dict, reverse=True):
            data_list = output_data_dict[k]
            freq_results_outstring += data_list[1]

            freq = data_list[0][0]
            lft_del = data_list[0][1]
            rt_del = data_list[0][2]
            mh_size = data_list[0][3]
            ins_size = data_list[0][4]
            scar_type = data_list[0][5]

            # Plotting all scar patterns is messy.  This provides a cutoff.
            if freq < 0.00025:
                continue

            y_value = freq * 0.5
            lft_ins_width = freq
            rt_ins_width = freq

            # This is gathered up to find the largest value.  Used to set the x-axis limits.
            marker_list.extend([lft_del + (mh_size * 0.5), rt_del + (mh_size * 0.5), ins_size])

            # Deletion size includes half the size of any microhomology present.
            lft_del_plot_value = (lft_del + (mh_size * 0.5)) * -1
            rt_del_plot_value = rt_del + (mh_size * 0.5)

            # Insertions are centered on 0 so we need to take half the value for each side.
            lft_ins_plot_value = (ins_size * 0.5) * -1
            rt_ins_plot_value = ins_size * 0.5

            # Scale the width of bars for insertions inside of deletions.
            if lft_del + (mh_size * 0.5) != 0:
                lft_ins_width = freq * 0.5
            if rt_del + (mh_size * 0.5) != 0:
                rt_ins_width = freq * 0.5

            if scar_type not in plot_data_dict:
                plot_data_dict[scar_type] = \
                    [[freq], [lft_del_plot_value], [rt_del_plot_value], [lft_ins_plot_value], [rt_ins_plot_value],
                     [lft_ins_width], [rt_ins_width], [y_value]]
            else:
                # Get some previous plot data.
                count = len(plot_data_dict[scar_type][0])
                previous_freq = plot_data_dict[scar_type][0][count - 1]
                previous_y = plot_data_dict[scar_type][7][count - 1]

                plot_data_dict[scar_type][0].append(freq)
                plot_data_dict[scar_type][1].append(lft_del_plot_value)
                plot_data_dict[scar_type][2].append(rt_del_plot_value)
                plot_data_dict[scar_type][3].append(lft_ins_plot_value)
                plot_data_dict[scar_type][4].append(rt_ins_plot_value)
                plot_data_dict[scar_type][5].append(lft_ins_width)
                plot_data_dict[scar_type][6].append(rt_ins_width)

                # Use the previous plot data to find the y-value of the current bar.
                plot_data_dict[scar_type][7] \
                    .append(previous_y + 0.002 + (0.5 * previous_freq) + y_value)

        plot_data_dict['Marker'] = [(max(marker_list)) * -1, max(marker_list)]

        # sample_name = "{}.{}".format(args.Job_Name, args.SampleName)
        ScarMapperPlot.scarmapperplot(args, datafile=None, sample_name=args.SampleName,
                                      plot_data_dict=plot_data_dict, label_dict=label_dict)

        freq_results_file = \
            open("{}{}_ScarMapper_Combined_Frequency.txt".format(args.WorkingFolder, args.SampleName), "w")
        freq_results_file.write(freq_results_outstring)
        freq_results_file.close()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****ScarMapper {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))

    # All done, so we need to quit; otherwise Python will not release the log file on virtual Linux.
    exit(0)