def _reindex(self):
    """ Reindex values for non-pythonic input to DartReader (better for users) """
    self.scheme = {k: int(v + 1) for k, v in self.scheme.items()}

    for k, v in sorted(self.scheme.items(), key=operator.itemgetter(1)):
        stamp(k, "=", v)

    stamp("Please check these values in your data to ensure correct input for DartQC.")
def __init__(self, data, attributes):
    QualityControl.__init__(self, data, attributes)

    self.name = "individual"
    self.attributes["modules"][self.name] = {}

    self._set_log()

    stamp("Initiating Sample Module.")
def _get_pop_results(self):
    try:
        removed = {"monomorphic": self.attributes["modules"]["population"]["results"]["removed"]}
        params = {"monomorphic": self.attributes["modules"]["population"]["settings"]["value"]}
    except KeyError:
        stamp("Could not detect results for Population Module, skipping...")
        params = {"monomorphic": None}
        removed = {"monomorphic": None}

    return params, removed
def _write_scheme(self):
    if self.output_name is None:
        name, ext = os.path.splitext(os.path.basename(self.file_path))
        file_name = name + "_scheme.json"
    else:
        file_name = self.output_name + "_scheme.json"

    out_file = os.path.join(self.output_path, file_name)

    stamp("Writing scheme to:", out_file)

    with open(out_file, "w") as outfile:
        json.dump(self.scheme, outfile, indent=4)
def _get_preprocessing_results(self):
    try:
        # The preprocessor stores its threshold under "settings" and its call counts
        # under "results" (see filter_read_counts).
        preprocessor = self.attributes["modules"]["preprocessor"]
        params = {"preprocess": preprocessor["settings"]["read_count_sum_threshold"],
                  "calls": preprocessor["results"]["total_calls"],
                  "missing": preprocessor["results"]["before_missing"]}
        removed = {"preprocess": preprocessor["results"]["replaced_calls"],
                   "calls": preprocessor["results"]["replaced_calls"],
                   "missing": preprocessor["results"]["replaced_calls"]}
    except KeyError:
        stamp("Could not detect results for Preprocessing Module, skipping...")
        params = {"preprocess": None, "calls": None, "missing": None}
        removed = {"preprocess": None, "calls": None, "missing": None}

    return params, removed
def _get_sample_results(self):
    try:
        individual = self.attributes["modules"]["individual"]
        removed = {"mind": individual["results"]["mind"]["removed_samples"],
                   "samples": individual["results"]["mind"]["removed_samples"]}
        params = {"mind": individual["results"]["mind"]["value"],
                  "samples": len(individual["states"]["mind"]["sample_names_original"])}
    except KeyError:
        stamp("Could not detect results for Sample Module, skipping...")
        params = {"mind": None, "samples": None}
        removed = {"mind": None, "samples": None}

    return params, removed
def _get_redundancy_results(self):
    try:
        parameters = self.attributes["modules"]["redundancy"]["settings"]
        params = {"clusters": parameters["clusters"],
                  "duplicates": parameters["duplicates"],
                  "identity": parameters["identity"]}

        results = self.attributes["modules"]["redundancy"]["results"]
        removed = {"clusters": results["clusters"]["removed"],
                   "duplicates": results["duplicates"]["removed"],
                   "identity": None}
    except KeyError:
        stamp("Could not detect results for Redundancy Module, skipping...")
        params = {"clusters": None, "duplicates": None, "identity": None}
        removed = {"clusters": None, "duplicates": None, "identity": None}

    return params, removed
def get_cdhit_message(self, identity, time=True):
    cluster_msg = textwrap.dedent("""
        CLUSTERING
        -------------------------------
        Running CDHIT-EST...
        Threshold: {0}%
        -------------------------------
        """.format(identity * 100))

    if time:
        stamp("Running CD-HIT at nucleotide identity {0}%".format(identity * 100))
    else:
        print(cluster_msg)
def _get_snp_results(self):
    """ Extract entry from Attributes """
    try:
        results = self.attributes["modules"]["snp"]["results"]
        parameters = self.attributes["modules"]["snp"]["settings"]["parameters"]

        params = {entry[0]: entry[1] for entry in parameters}
        removed = {param: result["removed"] for param, result in results.items()}
    except KeyError:
        stamp("Could not detect results for SNP Module, skipping...")
        params = {"maf": None, "hwe": None, "call_rate": None, "rep_average": None}
        removed = {"maf": None, "hwe": None, "call_rate": None, "rep_average": None}

    return params, removed
def _convert_excel(self):
    stamp("Converting from Excel")
    stamp("File is", self.file_path)
    stamp("Sheet is", self.excel_sheet)

    data_xls = pandas.read_excel(self.file_path, self.excel_sheet, index_col=None)

    name, ext = os.path.splitext(os.path.basename(self.file_path))
    outfile = os.path.join(self.output_path, name + ".csv")

    stamp("Writing to file", outfile)

    data_xls.to_csv(outfile, encoding='utf-8', index=False)

    self.file_path = outfile
def _get_row_indices(self):
    """
    This function gets the row indices for samples and data, as well as the header, assuming:

        - header row begins after rows starting with "*"
        - data row starts after header row
        - samples are specified in the header row (above calls or raw counts)

    """
    stamp("Guessing data configuration:")

    for i, row in self.top.iterrows():
        if row[0] != "*":
            self.header = row
            self.sample_row = i
            self.data_row = i + 1

            self.scheme["sample_row"] = self.sample_row
            self.scheme["data_row"] = self.data_row
            break
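# Illustrative sketch (not part of DartReader) of the row-guessing assumption documented above,
# applied to a hypothetical DArT-style layout: rows beginning with "*" are metadata, the first
# row without "*" holds the sample names, and the data starts on the following row.
import pandas

_toy = pandas.DataFrame([
    ["*", "*", "group_1", "group_2"],              # metadata row
    ["AlleleID", "SNP", "sample_1", "sample_2"],   # header / sample row
    ["100001|F|0", "A>G", "1", "0"],               # first data row
])

for _i, _row in _toy.iterrows():
    if _row[0] != "*":
        _sample_row, _data_row = _i, _i + 1
        break

print(_sample_row, _data_row)  # -> 1 2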
def get_redundancy_message(self, type, initial, removed, retained, time=True):
    redundancy_msg = textwrap.dedent("""
        REDUNDANCY
        -------------------------------
        {0}
        Initial: {1}
        Removed: {2}
        Retained: {3}
        -------------------------------
        """.format(type.upper(), initial, removed, retained))

    if time:
        stamp("Redundancy module {0}".format(type.upper()))
        stamp("Removed {0} SNPs".format(removed))
        stamp("Retained {0} SNPs".format(retained))
    else:
        print(redundancy_msg)
def get_filter_message(self, filter, threshold, initial, removed, retained, time=True):
    filter_msg = textwrap.dedent("""
        SNP Filter
        -------------------------------
        {0} at {1}
        Initial: {2}
        Removed: {3}
        Retained: {4}
        -------------------------------
        """.format(filter.upper(), threshold, initial, removed, retained))

    if time:
        stamp("Filtered {0} at {1}".format(filter.upper(), threshold))
        stamp("Removed {0} SNPs".format(removed))
        stamp("Retained {0} SNPs".format(retained))
    else:
        print(filter_msg)
def _run_cdhit(self, fasta_path, identity=0.95, word_size=5, description_length=0, cdhit_path=None):
    """ Run CDHIT-EST for sequences, install with sudo apt install cd-hit on Ubuntu """
    self.messages.get_cdhit_message(identity)

    if cdhit_path is None:
        cdhit_path = "cd-hit-est"

    file_name = self.project + "_IdentityClusters_" + str(identity)

    out_file = os.path.join(self.tmp_path, file_name)
    cluster_path = os.path.join(self.tmp_path, file_name + '.clstr')

    stamp("Calling cd-hit-est: " + cdhit_path + " -i " + fasta_path + " -o " + out_file + " -c " + str(identity)
          + " -n " + str(word_size) + " -d " + str(description_length))

    with open(os.devnull, "w") as devnull:
        call([cdhit_path, "-i", fasta_path, "-o", out_file, "-c", str(identity), "-n", str(word_size),
              "-d", str(description_length)], stdout=devnull)

    return cluster_path
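# A minimal sketch (not part of DartQC) of how the returned cluster file could be parsed,
# assuming the standard CD-HIT ".clstr" output format: each cluster starts with a ">Cluster N"
# header and each member line contains the sequence ID between ">" and "...".
import re
from collections import defaultdict

def parse_clstr(cluster_path):
    clusters = defaultdict(list)
    cluster_id = None
    with open(cluster_path) as clstr_file:
        for line in clstr_file:
            if line.startswith(">Cluster"):
                cluster_id = int(line.split()[-1])   # ">Cluster 0" -> 0
            else:
                member = re.search(r">(.+?)\.\.\.", line)
                if member is not None and cluster_id is not None:
                    clusters[cluster_id].append(member.group(1))
    return dict(clusters)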
def filter_data(self, mind=0.2, recalculate=True):
    """ Re-write with Pandas """
    if mind is None:
        stamp("Returning data without filtering.")
        return self.data, self.attributes

    stamp("Filtering samples with missing data >", mind)
    stamp("Missing data calculated over", len(self.data), "SNPs")

    mind_prop = self._calculate_mind()

    to_remove = mind_prop[mind_prop > mind].index.tolist()

    filtered_data = {}
    for snp, data in self.data.items():
        data["calls"] = [snp_call for i, snp_call in self._iterate_call_indices(data["calls"])
                         if i not in to_remove]
        filtered_data[snp] = data

    attributes = self._adjust_attributes(self.attributes, mind, to_remove)

    percent_removed = format((len(to_remove) / attributes["sample_size"]) * 100, ".2f")

    stamp("Removed {r} samples out of {t} samples ({p}%)"
          .format(r=len(to_remove), t=attributes["sample_size"], p=percent_removed))

    # Recalculating SNP parameters:
    if recalculate:
        stamp("Recalculating MAF, CALL RATE and HWE for SNPs")
        marker = SNPModule(filtered_data, attributes)
        filtered_data, attributes = marker.get_data(threshold=None)

    return filtered_data, attributes
def write_seq_vals(self, out_file=None):
    # Add the CSV headers
    output_csv = [["Cluster #", "Ref Seq", "Cluster Sequences..."]]

    type_val = {
        "GOOD": 0,
        "BAD ID": 1,
        "BAD LOC": 2,
        "BAD ID & LOC": 3,
        "UNKNOWN": 4
    }

    for seq in self.seq_vals:
        seq["sequences"].sort(key=lambda x: type_val[x[0]])
        # sorted_seq = sorted(seq["sequences"], cmp=lambda seq1, seq2: cmp(type_val[seq1[0]], type_val[seq2[0]]))

        row_data = [str(seq["cluster_num"]), seq["ref_seq_str"]] \
                   + [str(item) for sublist in seq["sequences"] for item in sublist]

        for item in seq["sequences"]:
            if type_val[item[0]] == 1:
                sys.stderr.write("WARNING: [" + item[0] + ":" + item[1] + "] " + item[1]
                                 + " renamed to " + seq["ref_seq_str"] + "\n")
            if type_val[item[0]] > 1:
                sys.stderr.write("ERROR: [" + item[0] + ":" + item[1]
                                 + "] Unexpected clone ID! This needs manual fixing\n")

        output_csv.append(row_data)

    if out_file is None:
        out_file = os.path.abspath(
            os.path.join(self.attributes["out_path"], self.attributes["project"] + "_seq_vals.csv"))

    with open(out_file, 'w') as vals_out:
        csv_writer = csv.writer(vals_out, delimiter=",", lineterminator='\n')
        csv_writer.writerows(output_csv)

    print("\n")
    stamp("Sequence ID filtering info written to", out_file)
    stamp("Look at this file for more information on any ERRORS and WARNINGS")
def write_json(self, file_name, data_indent=0, attribute_indent=4):
    data_file = os.path.abspath(os.path.join(self.attributes["out_path"], file_name + "_data.json"))
    attribute_file = os.path.abspath(os.path.join(self.attributes["out_path"], file_name + "_attr.json"))

    stamp("Writing data to JSON")
    stamp("Data file:", data_file)
    stamp("Attribute file:", attribute_file)

    with open(data_file, "w") as data_out:
        json.dump(self.data, data_out, indent=data_indent)

    with open(attribute_file, "w") as attr_out:
        json.dump(self.attributes, attr_out, indent=attribute_indent)
def _read_csv(self):
    stamp("Loading file", self.file_path)

    self.top = pandas.read_csv(self.file_path, header=None, nrows=30)
def write_plink(self, file_name, sep="\t", remove_space=False):
    snp_order = sorted(self.data.keys())

    stamp("Decoding calls...")
    snp_rows = [[self.decoding_scheme[snp] for snp in self.data[snp_id]["calls"]] for snp_id in snp_order]
    # Update to output actual ACGT values for alleles rather than just A or B - this maintains the most info.

    stamp("Transposing calls...")
    snps_by_sample = numpy.asarray(snp_rows).transpose(1, 0, 2)

    genotypes = [sample.flatten().tolist() for sample in snps_by_sample]

    names = self.attributes["sample_names"]
    pops = [self.attributes["pops"][sample] for sample in names]

    if remove_space:
        names = ["_".join(name.split()) for name in names]
        pops = ["_".join(pop.split()) for pop in pops]

    ped_file = os.path.join(self.attributes["out_path"], file_name + '.ped')
    map_file = os.path.join(self.attributes["out_path"], file_name + '.map')

    paternal = ["0"] * len(names)
    maternal = ["0"] * len(names)
    sex = ["0"] * len(names)
    phenotype = ["-9"] * len(names)

    plink = zip(pops, names, paternal, maternal, sex, phenotype, genotypes)

    stamp("Formatting calls...")
    ped_data = []
    for row in plink:
        new_row = list(row[:6])
        for geno in row[6]:
            new_row.append(geno)
        ped_data.append(new_row)

    stamp("Writing PLINK")
    stamp("PED file:", ped_file)
    stamp("MAP file:", map_file)

    with open(ped_file, 'w') as ped_out:
        ped_writer = csv.writer(ped_out, delimiter=sep)
        ped_writer.writerows(ped_data)

    # MAP Formatting
    map_data = [["0", snp_id, "0", "0"] for snp_id in snp_order]

    with open(map_file, 'w') as map_out:
        ped_writer = csv.writer(map_out, delimiter=sep)
        ped_writer.writerows(map_data)
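# For orientation (illustrative values only): each PED row written above carries the six standard
# PLINK columns (family/population, sample ID, paternal ID, maternal ID, sex, phenotype) followed
# by two allele calls per SNP (currently A/B coded, see the note above), and each MAP row holds
# chromosome, SNP ID, genetic distance and base-pair position, all zeroed here except the SNP ID:
#
#   PED:  pop_A  sample_1  0  0  0  -9  A  B  A  A  ...
#   MAP:  0  100001|F|0  0  0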
def get_data(self, mono="all", comparison="=="): stamp("Initialised Population Module") if mono is None: stamp("No filter specified, returning data.") return self.data, self.attributes stamp("Indexing monomorphic SNPs in each population") self._calculate_monomorphics() for pop, indices in self.populations.items(): stamp("There are", len(indices), "samples in population", pop) for pop, monomorphs in self.monomorphics.items(): stamp("There are", len(monomorphs), "monomorphic SNPs in population", pop) # If threshold is string 'all', set to all populations. stamp("Filtering SNPs that are monomorphic in", mono, "populations.") if mono == "all": mono = len(self.populations) if comparison == "==": filtered = {snp: data for snp, data in self.data.items() if data["mono"] == mono} elif comparison == ">=": filtered = {snp: data for snp, data in self.data.items() if data["mono"] >= mono} elif comparison == "<=": filtered = {snp: data for snp, data in self.data.items() if data["mono"] <= mono} else: raise ValueError("Comparison must be one of: <=, >=, ==") filtered_data = {snp: data for snp, data in self.data.items() if snp not in filtered} stamp("Filtered", len(filtered), "SNPs.") attributes = self._log_monomorphic(self.attributes, filtered_data, mono) return filtered_data, attributes
def check_concordance(self):
    if set(self.sample_names) != set(self.call_names):
        stamp("Sample names from the read count file are not the same as sample names from the data file.")
        stamp("Sample difference, present in one but not the other data:")
        for sample in set(self.sample_names).difference(set(self.call_names)):
            stamp(sample)
        raise SimpleException("Sample names in data & read count files don't match.\n"
                              + "\t\t- Check if the read_counts (and data) sample row is set correctly")
    else:
        stamp("Concordance between sample names in call and count data, all is good.")

    if len(self.call_data) != len(self.data):
        diff = set(self.call_data.keys()).difference(set(self.data.keys()))
        inter = set(self.call_data.keys()).intersection(set(self.data.keys()))

        stamp("Number of SNPs are different, there are:", len(self.call_data), "SNPs in the called set and",
              len(self.data), "SNPs in the raw set.")
        stamp(len(diff), "SNPs have a different ID. Keeping the intersection of", len(inter), "SNPs...")

        self.data = {k: v for (k, v) in self.data.items() if k in inter}
        self.call_data = {k: v for (k, v) in self.call_data.items() if k in inter}

        if set(self.call_data.keys()) != set(self.data.keys()):
            stamp("SNP IDs are not the same, removal not effective, please re-format your data.")
def filter_read_counts(self, threshold=[7]):
    """
    1. Transform read count matrix to numpy array, ordered by allele IDs.
    2. Sum-collapse replicate columns in the order of sample names from the original data (sample_names)
    3. Construct the reduced array and assign each call in a dictionary the allele ID
    4. For each allele in the dictionary, construct a boolean vector that is False where the sum of the
       two allele counts is at or below the threshold value, and True otherwise
    5. Use this vector in the same iteration to assign missing to the corresponding calls in the original data
    """
    self.check_concordance()

    call_missing = self.get_missing()

    stamp("Number of missing in call data:", call_missing)

    snp_order = sorted(self.data.keys())
    reduced_counts = {}

    stamp("Finding replicate columns...")
    self.get_replicates()

    stamp("Ordering count data by SNPs...")
    counts = [self.data[snp]["calls"] for snp in snp_order]
    count_array = numpy.asarray(counts)

    stamp("Sum-collapsing replicates...")
    for idx, aCounts in enumerate(count_array):
        if len(aCounts) < 2:
            raise SimpleException("Invalid read counts data for allele " + snp_order[idx]
                                  + " - is there only 1 row?")

    columns = [numpy.sum(count_array[:, self.replicates[sample]], axis=1).tolist()
               for sample in self.call_names]

    reduced_array = list(zip(*columns))

    for i, snp in enumerate(snp_order):
        reduced_counts[snp] = reduced_array[i]

    stamp("Replacing low counts with missing...")

    all_call_data = []
    all_call_attrs = []
    all_filtered = []

    # If not graphing there is no point filtering all values given, so just take the first threshold value
    if not self.graph:
        threshold = [threshold[0]]
    else:
        DartGraphs.create_static_plots(self.call_data, self.data, self.out_path, self.project)
        DartGraphs.create_plots(self.call_data, self.data, self.call_attributes, "original",
                                self.out_path, self.project, "red")

    for call_thresh in threshold:
        replaced = 0
        total = 0

        call_data = {}
        call_attrs = copy(self.call_attributes)
        call_attrs["modules"] = {self.name: {}}
        filtered = {}

        for snp, counts in reduced_counts.items():
            filter_vector = [False if sum(allele_counts) <= call_thresh else True for allele_counts in counts]

            call_data[snp] = copy(self.call_data[snp])
            call_data[snp]["calls"] = [call if filter_vector[i] else "-"
                                       for i, call in enumerate(self.call_data[snp]["calls"])]

            total += len(filter_vector)
            replaced += filter_vector.count(False)
            filtered[snp] = filter_vector

        replaced -= call_missing

        stamp("Pre-processing silenced {r}/{t} calls {p}% using call threshold {c}"
              .format(r=replaced, t=total, p=format((replaced / total) * 100, ".2f"), c=call_thresh))

        call_attrs["modules"][self.name]["results"] = {
            "total_calls": total,
            "replaced_calls": replaced,
            "before_missing": call_missing,
            "after_missing": call_missing + replaced
        }

        call_attrs["modules"][self.name]["settings"] = {
            "read_count_sum_threshold": call_thresh
        }

        all_call_attrs.append(call_attrs)
        all_call_data.append(call_data)
        all_filtered.append(filtered)

    self.call_data = all_call_data[0]
    self.call_attributes = all_call_attrs[0]
    self.filtered = all_filtered[0]

    if self.graph:
        DartGraphs.create_plots(all_call_data, self.data, all_call_attrs, "threshold",
                                self.out_path, self.project, "orange",
                                legend=[("Threshold " + str(thresh)) for thresh in threshold])
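# Self-contained sketch (hypothetical data, not part of DartQC) of steps 2 and 4-5 above:
# replicate read-count columns are sum-collapsed per sample, and any call whose two summed
# allele counts add up to the threshold or less is silenced to missing ("-").
import numpy

# one SNP, four columns of (allele_1, allele_2) read counts;
# columns 0 and 1 are replicates of sample_1, columns 2 and 3 of sample_2
_count_array = numpy.asarray([[[3, 1], [2, 0], [10, 4], [5, 6]]])
_replicates = {"sample_1": [0, 1], "sample_2": [2, 3]}
_call_names = ["sample_1", "sample_2"]

# step 2: sum-collapse the replicate columns for each sample
_columns = [numpy.sum(_count_array[:, _replicates[s]], axis=1).tolist() for s in _call_names]
_reduced = list(zip(*_columns))        # per SNP: ([5, 1], [15, 10])

# steps 4-5: build the filter vector and silence low-depth calls
_threshold = 7
_filter_vector = [sum(allele_counts) > _threshold for allele_counts in _reduced[0]]
_calls = ["AB", "AA"]
_calls = [call if keep else "-" for call, keep in zip(_calls, _filter_vector)]
# _calls -> ['-', 'AA']   (sample_1: 5 + 1 = 6 <= 7, so its call is set to missing)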