def _clean_clique(self, c):
    '''Try to complete an IBD graph component into a clique and prune weakly connected nodes.

    Ideally, each IBD graph component is a clique (reflecting IBD transitivity). In practice,
    we have approximate transitivity, so attempt to complete the clique by looking for more
    IBD segment edges in the neighborhood of the SNP for nodes that are not connected to all
    others. If those nodes are still not connected to at least min_degree neighbors, remove
    them from the clique. If the clique is smaller than (min_degree+1), the threshold drops to
    (component size - 1), so any node not connected to all others is removed.

    c -- graph component. It is modified in place (edges added, nodes removed).

    Returns the same graph object c.
    '''
    max_degree = c.number_of_nodes() - 1
    # Snapshot the nodes up front: the graph is mutated below, and orig_nodes must reflect
    # the size before cleaning.
    nodes = list(c.nodes())
    orig_nodes = len(nodes)
    # A list (rather than filter()) so that the truth test and len() below always see a
    # concrete sequence.
    unconnected = [node for node in nodes if c.degree(node) < max_degree]
    if self.debug:
        self._writeln('\tComponent: nodes %d, edges %d, %s' % (c.number_of_nodes(), c.number_of_edges(), repr(util.occur_dict(c.degree().values()))))
    if unconnected:
        if self.debug:
            self._writeln('\t\tAll nodes %s' % (len(nodes),))
            self._writeln('\t\tUnconnected nodes %s' % (len(unconnected),))

        # Add extra edges: segments found in [bp_left, bp_right + 1) for each incomplete
        # node, kept only when both endpoints already belong to this component.
        node_set = set(nodes)  # built once; membership is tested for every segment
        candidate_segments = it.chain.from_iterable(
            self.segments.find(self.bp_left, self.bp_right + 1, node)
            for node in unconnected)
        extra_edges = [tuple(s.samples) for s in candidate_segments
                       if s.samples[0] in node_set and s.samples[1] in node_set]
        c.add_edges_from(extra_edges)

        # Delete nodes that are still insufficiently connected. The threshold is capped at
        # max_degree so that small components are not emptied by an unreachable min_degree.
        c_min_degree = min(max_degree, self.min_degree)
        unconnected = [node for node in nodes if c.degree(node) < c_min_degree]
        if unconnected:
            c.remove_nodes_from(unconnected)
        if self.debug:
            self._writeln('\t\tCleaned component: %d -> %d' % (orig_nodes, c.number_of_nodes()))
    return c
def plot_two_families(): '''Test ancestor imputation and child POO alignment for two families on chromosome 22.''' # Parameters chrom = 22 plot = False # True save_plot = False # True debug = False # True # Read data p = im.hutt('hutt.phased.npz') q = p.pedigree.quasi_founders # aligned = set(p.haplotype.aligned_samples) t = frozenset([frozenset(im.gt.genotyped_children(p, p.pedigree.find_family_by_child(i, genotyped=False))) for i in q]) num_sibs = map(len, t) print 'Distribution of QF family sizes', util.occur_dict(num_sibs) # plot_hist_num_sibs(num_sibs) # ibd = im.index.segment_index.SegmentIndex(os.environ['OBER_OUT'] + '/index_segments') if plot: P.figure(1) s = set([x for x in t if 1049 in x][0]) - set([1049]) pa, _ = analyze_family(p, s, max_colors=4, title='Haplotype Coloring: Quasi-Founder Sibs, All, Chrom. %d' % (chrom,), plot=plot, debug=debug) if save_plot: P.savefig(os.environ['OBER'] + '/doc/poo/qf_family/hap_colors_poo.png') if plot: P.figure(2) s2 = set([x for x in t if 1049 in x][0]) analyze_family(p, s2, max_colors=4, title='Haplotype Coloring: Quasi-Founder Sibs, POOC Chrom. %d' % (chrom,), plot=plot, debug=debug) if save_plot: P.savefig(os.environ['OBER'] + '/doc/poo/qf_family/hap_colors_all.png') if plot: P.figure(3) f = p.find_family(10, 1414) # 4 children, genotyped parents s3 = f.children analyze_family(p, s3, max_colors=4, title='Haplotype Coloring: Non-Founder Sibs Chrom. %d' % (chrom,), plot=plot, debug=debug) if save_plot: P.savefig(os.environ['OBER'] + '/doc/poo/nf_family/hap_colors.png') if plot: P.show() print im.color.hap_color.best_hap_alignment_to_colors(pa) print 'Regions', pa.num_regions print 'Parental haplotype coverage %', parent_coverage_fraction(pa, p) print 'Children coverage by parental haplotypes', pa.color_sequence_coverage(np.arange(4))
''' import util, networkx as nx, sys, numpy as np sorted_pair = lambda a, b: (a, b) if a <= b else (b, a) '''Main program''' if __name__ == '__main__': in_file = sys.stdin delimiter = ',' output_format = 'cytoscape' # 'matrix' # Read disease-block list from a delimited text file (by default, a CSV) # Create a hash table of block-to-diseases h = util.mdict.from_items((j, i) for (i, j) in np.loadtxt(in_file, delimiter=delimiter, dtype=[('i', 'S50'), ('j', 'S50')])) # For each h-entry and each pair of diseases in the entry, increment the a-b weight in the # disease graph weighted_edgelist = util.occur_dict(sorted_pair(a, b) for diseases in h.itervalues() for a in diseases for b in diseases) g = nx.Graph() g.add_weighted_edges_from((i, j, w) for (i, j), w in weighted_edgelist.iteritems()) # Convert to adjacency matrix and output in CSV format diseases = g.nodes() if output_format == 'matrix': print delimiter.join(diseases) A = nx.to_numpy_matrix(g) for i in xrange(len(diseases)): print diseases[i] + delimiter + delimiter.join(str(x) for x in A[i, :].tolist()[0]) elif output_format == 'cytoscape': print '\n'.join(delimiter.join([u, v, 'pp', 'TRUE', 'edge%05d' % (k,), str(d['weight'])]) for k, (u, v, d) in enumerate(g.edges_iter(data=True)))
def _clean_clique(self, c):
    '''Try to complete an IBD graph component into a clique and prune weakly connected nodes.

    Ideally, each IBD graph component is a clique (reflecting IBD transitivity). In practice,
    we have approximate transitivity, so attempt to complete the clique by looking for more
    IBD segment edges in the neighborhood of the SNP for nodes that are not connected to all
    others. If those nodes are still not connected to at least min_degree neighbors, remove
    them from the clique. If the clique is smaller than (min_degree+1), the threshold drops to
    (component size - 1), so any node not connected to all others is removed.

    c -- graph component. It is modified in place (edges added, nodes removed).

    Returns the same graph object c.
    '''
    max_degree = c.number_of_nodes() - 1
    # Snapshot the nodes up front: the graph is mutated below, and orig_nodes must reflect
    # the size before cleaning.
    nodes = list(c.nodes())
    orig_nodes = len(nodes)
    # A list (rather than filter()) so that the truth test and len() below always see a
    # concrete sequence.
    unconnected = [node for node in nodes if c.degree(node) < max_degree]
    if self.debug:
        self._writeln('\tComponent: nodes %d, edges %d, %s' % (c.number_of_nodes(), c.number_of_edges(), repr(util.occur_dict(c.degree().values()))))
    if unconnected:
        if self.debug:
            self._writeln('\t\tAll nodes %s' % (len(nodes),))
            self._writeln('\t\tUnconnected nodes %s' % (len(unconnected),))

        # Add extra edges: segments found in [bp_left, bp_right + 1) for each incomplete
        # node, kept only when both endpoints already belong to this component.
        node_set = set(nodes)  # built once; membership is tested for every segment
        candidate_segments = it.chain.from_iterable(
            self.segments.find(self.bp_left, self.bp_right + 1, node)
            for node in unconnected)
        extra_edges = [tuple(s.samples) for s in candidate_segments
                       if s.samples[0] in node_set and s.samples[1] in node_set]
        c.add_edges_from(extra_edges)

        # Delete nodes that are still insufficiently connected. The threshold is capped at
        # max_degree so that small components are not emptied by an unreachable min_degree.
        c_min_degree = min(max_degree, self.min_degree)
        unconnected = [node for node in nodes if c.degree(node) < c_min_degree]
        if unconnected:
            c.remove_nodes_from(unconnected)
        if self.debug:
            self._writeln('\t\tCleaned component: %d -> %d' % (orig_nodes, c.number_of_nodes()))
    return c