def sources_at_point_pair(self, chrom1, pos1, chrom2, pos2, strain_names):
    """
    Group strains by the pair of subspecific origins they carry at two loci.

    Both loci are converted with self.genome_index and then handled in ascending order of that
    index, so the first axis of the result always refers to the locus with the smaller index.
    :param chrom1: chromosome of one locus
    :param pos1: position of one locus
    :param chrom2: chromosome of another locus
    :param pos2: position of another locus
    :param strain_names: list of strain names to analyze
    :return: dict with three entries:
        'Key': subspecies.to_string of every value from subspecies.iter_subspecies(True);
        'Samples': 2D grid of strain-name lists, indexed by the ordinal of the source at the
        lower locus and then by the ordinal of the source at the higher locus;
        'Intervals': self.chrom_and_pos(start, end) for each locus, where [start, end] is the
        overlap of the containing intervals of all analyzed strains
    """
    # ascending order lets one forward scan per strain locate both loci
    coords = sorted([self.genome_index(chrom1, pos1), self.genome_index(chrom2, pos2)])
    genome_end = np.sum(self.sizes)
    lower_bounds = [0, 0]
    upper_bounds = [genome_end, genome_end]

    key = [subspecies.to_string(s) for s in subspecies.iter_subspecies(True)]
    # one bucket of strain names per (source at lower locus, source at higher locus)
    samples = [[[] for _ in key] for _ in key]

    for strain_name in strain_names:
        strain_record = self.sample_dict[strain_name]
        intervals = strain_record[0]
        sources = strain_record[1]

        # walk the boundaries once; the scan position carries over from the first locus to the
        # second (presumably the boundaries are ascending - the scan relies on it)
        idx = 0
        hits = []
        for loc_num, coord in enumerate(coords):
            while intervals[idx] < coord:
                idx += 1
            if idx > 0:
                # the previous boundary is where the containing interval begins
                lower_bounds[loc_num] = max(lower_bounds[loc_num], intervals[idx - 1])
            upper_bounds[loc_num] = min(upper_bounds[loc_num], intervals[idx])
            hits.append(idx)

        first_ordinal = subspecies.to_ordinal(sources[hits[0]])
        second_ordinal = subspecies.to_ordinal(sources[hits[1]])
        samples[first_ordinal][second_ordinal].append(strain_name)

    return {
        'Key': key,
        'Samples': samples,
        'Intervals': [
            self.chrom_and_pos(lower_bounds[0], upper_bounds[0]),
            self.chrom_and_pos(lower_bounds[1], upper_bounds[1]),
        ],
    }
def build_pairwise_matrix(self, strain_names, elem_intervals):
    """
    Count, for every pair of elementary intervals, how many strains carry each source combination.

    :param strain_names: list of strain names to analyze
    :param elem_intervals: boundaries of the elementary intervals (presumably sorted ascending,
        as np.searchsorted expects - confirm against the caller)
    :return: 3D int16 array. The first index is the ordinal of the combined source; the other two
        index elementary intervals. Only cells whose column interval is at or after the row
        interval within a strain are incremented (upper triangle).
    """
    num_elem = len(elem_intervals)
    num_combos = (subspecies.NUM_SUBSPECIES + 1) ** 2
    source_counts = np.zeros([num_combos, num_elem, num_elem], dtype=np.int16)

    for strain_name in strain_names:
        intervals, sources = self.sample_dict[strain_name]

        # position of each of this strain's boundaries among the elementary boundaries; the
        # leading -1 makes the first interval start at elementary index 0
        breaks = np.insert(np.searchsorted(elem_intervals, intervals), 0, -1)
        # span of elementary indices covered by each of the strain's intervals
        spans = [slice(begin, end) for begin, end in zip(breaks[:-1] + 1, breaks[1:] + 1)]

        for row, row_span in enumerate(spans):
            # start at the diagonal: only the upper triangle is filled
            for col, col_span in enumerate(spans[row:], row):
                combined = subspecies.combine(sources[row], sources[col])
                source_counts[subspecies.to_ordinal(combined), row_span, col_span] += 1

    return source_counts
def pairwise_frequencies(self, strain_names):
    """
    For every locus pair and every label pair, collect the interval rectangles of the strains
    which have those labels at those pairs of loci.

    :param strain_names: list of strain names to analyze (must be a subset of the output from
        preprocess())
    :return: tuple (output, colors). output holds one entry per combined-source ordinal, each
        made of four parallel lists: start of the first interval, end of the first interval,
        start of the second interval, end of the second interval. colors is
        subspecies.to_color(i, True) for each ordinal i.
    """
    num_combos = subspecies.NUM_SUBSPECIES ** 2
    output = [[[], [], [], []] for _ in xrange(num_combos)]

    for strain_name in strain_names:
        intervals, sources = self.sample_dict[strain_name]

        # Each interval starts where the previous one ends. The first interval has no
        # predecessor, so it starts at 0; indexing intervals[-1] there would wrap around and
        # report the last boundary as its start.
        starts = [0] + list(intervals[:-1])

        # keep only intervals with a known source, evaluated once per interval
        known = [
            (starts[k], intervals[k], sources[k])
            for k in xrange(len(intervals))
            if subspecies.is_known(sources[k])
        ]

        for pos, (first_start, first_end, first_source) in enumerate(known):
            # only the upper triangle is meaningful, so pair with itself and later intervals
            for second_start, second_end, second_source in known[pos:]:
                combo = subspecies.combine(first_source, second_source)
                combo_output = output[subspecies.to_ordinal(combo)]
                combo_output[0].append(first_start)
                combo_output[1].append(first_end)
                combo_output[2].append(second_start)
                combo_output[3].append(second_end)

    return output, [subspecies.to_color(i, True) for i in xrange(num_combos)]