def filter(self, input_filepath, output_filepath):
    """Load loci from input_filepath, keep only those that pass
    filter_locus, and write the survivors to output_filepath.

    Populates self.filtered_loci as a side effect.
    """
    self.load_loci(input_filepath)
    self.filtered_loci = {}
    for name, candidate in iteritems(self.loci):
        result = self.filter_locus(candidate)
        # filter_locus signals "drop this locus" by returning False.
        if result is False:
            continue
        self.filtered_loci[name] = result
    self.write_output(output_filepath)
def generate_value_list(self, data):
    """Expand a {k: count} histogram into a sorted list of k values.

    Each k-repeat length is repeated once per supporting read, so the
    result is suitable for mean/std computations. Entries with no
    support (count <= 0) contribute nothing.

    data -- mapping of k-repeat length to supporting read count
    Returns a sorted list.
    """
    # dict.items() iterates the same pairs as the py2/py3 iteritems
    # shim; the count > 0 guard documents that unsupported repeat
    # lengths are deliberately excluded.
    values = [k
              for k, count in data.items()
              if count > 0
              for _ in range(count)]
    return sorted(values)
def __normalized_subset(self, subset, data):
    """Normalize raw counts to fractions of the subset's total support.

    subset -- subset identifier, passed through to get_support
    data   -- {k: count} of raw counts
    Returns {k: count / total}. When total support is zero every value
    is 0.0, so callers never see a ZeroDivisionError.
    """
    total = self.get_support(subset)
    # The zero-support check is loop-invariant; hoist it out of the
    # per-key loop instead of re-testing it on every iteration.
    if total == 0:
        return {k: 0.0 for k in data}
    return {k: (1.0 * count) / total for k, count in data.items()}
def generate_value_list(self, data):
    """Expand a {k: count} histogram into a sorted list of k values.

    Each k-repeat length is repeated once per supporting read, so the
    result is suitable for mean/std computations. Entries with no
    support (count <= 0) contribute nothing.

    data -- mapping of k-repeat length to supporting read count
    Returns a sorted list.
    """
    # dict.items() iterates the same pairs as the py2/py3 iteritems
    # shim; the count > 0 guard documents that unsupported repeat
    # lengths are deliberately excluded.
    values = [k
              for k, count in data.items()
              if count > 0
              for _ in range(count)]
    return sorted(values)
def write_output(self, output_filepath):
    """Write the filtered loci to output_filepath as tab-separated
    text: a fixed header row, then the row(s) each locus produces
    via generate_output().

    output_filepath -- path of the file to (over)write
    Returns True on completion.
    """
    header = ['Locus', 'Repeats', 'Normal', 'Tumor']
    # Context manager guarantees the file is closed even if a
    # locus's generate_output() raises mid-write (the original
    # leaked the handle on any exception).
    with open(output_filepath, 'w') as fileout:
        fileout.write('\t'.join(header) + '\n')
        # Sort by locus name for deterministic output ordering.
        for l, locus in sorted(self.filtered_loci.items()):
            for line in locus.generate_output():
                fileout.write('\t'.join(line) + '\n')
    return True
def subset_outlier_filter(self, data, sds):
    """Drop {k: count} entries whose k lies outside the window
    mean +/- sds standard deviations of the expanded value list.

    data -- {k: count} raw support counts
    sds  -- window half-width, in standard deviations
    Returns a new dict with only the surviving entries; empty when
    data has no supported values.
    """
    values = self.generate_value_list(data)
    output = {}
    if values:  # mean/std are undefined for an empty sample
        mean = numpy.mean(values)
        std = numpy.std(values)
        # Round the min and max for the window outward (floor/ceil)
        # to allow for some leniency in the filter.
        min_k = int(math.floor(mean - (sds * std)))
        max_k = int(math.ceil(mean + (sds * std)))
        for k, count in data.items():
            if min_k <= k <= max_k:
                # Acceptable: inside the window.
                output[k] = count
    return output
def k_values(self, subset=False):
    """Return the sorted k values that have support (> 0.0).

    subset=False        -- union of supported k over normal and tumor
    subset like 'N...'  -- normal data set only
    subset like 'T...'  -- tumor data set only
    """
    supported = set()
    if subset is False:
        for k in self.__k:
            in_normal = k in self.__normal and self.__normal[k] > 0.0
            in_tumor = k in self.__tumor and self.__tumor[k] > 0.0
            if in_normal or in_tumor:
                supported.add(k)
    else:
        initial = subset.upper()[0]
        if initial == 'N':
            # Normal data set
            subset = self.__normal
        elif initial == 'T':
            # Tumor data set
            subset = self.__tumor
        for k, v in iteritems(subset):
            if v > 0.0:
                supported.add(k)
    return sorted(supported)
def k_values(self, subset=False):
    """Return the sorted k values with support (> 0.0).

    With subset=False, take the union over the normal and tumor data
    sets; a string starting with 'N'/'T' restricts to that data set.
    """
    if subset is not False:
        # Map a leading 'N'/'T' to the corresponding data set.
        if subset.upper()[0] == 'N':
            subset = self.__normal  # Normal data set
        elif subset.upper()[0] == 'T':
            subset = self.__tumor  # Tumor data set
        supported = {k for k, v in iteritems(subset) if v > 0.0}
    else:
        supported = {
            k for k in self.__k
            if (k in self.__normal and self.__normal[k] > 0.0)
            or (k in self.__tumor and self.__tumor[k] > 0.0)
        }
    return sorted(supported)
def expand_kmer_counts(d):
    """Expand a {k: count} dict into a flat list with each key k
    repeated count times (non-positive counts contribute nothing).

    d -- mapping of k value to its count
    Returns a list in the dict's iteration order.
    """
    # dict.items() iterates the same pairs as the py2/py3 iteritems
    # shim; [k] * v is empty for v <= 0, matching the original.
    expanded = []
    for k, v in d.items():
        expanded.extend([k] * v)
    return expanded
output_filepath = os.path.abspath(args.output) status_filepath = output_filepath + '.status' loci = load_loci(input_filepath) fileout = open(output_filepath, 'w') line = '\t'.join([ 'Locus', 'Normal_Reads', 'Tumor_Reads', 'Difference', 'Distance', 'Dissimilarity' ]) fileout.write(line + '\n') # Iterate through all the results to generate the output. As part of the # loop, count the weighted values for each metric. values = {'difference': [], 'distance': [], 'dissimilarity': []} for l, locus in sorted(iteritems(loci)): # Calculate post-normalization metrics locus.normalize() difference = Difference.get(locus) distance = EuclideanDistance.get(locus) dissimilarity = CosineDissimilarity.get(locus) # Generate output line. line = '\t'.join([ str(x) for x in [ locus.locus(), locus.get_support('N'), locus.get_support('T'), round(difference, 4), round(distance, 4), round(dissimilarity, 4)