def form_intersections(segments, full_output=False, samples=None, snp_range=None):
    '''Split the range spanned by all segment endpoints into consecutive sub-segments (bins) and
    collect, for every bin, the values of the segments that cover it.

    The union of all segment endpoints (plus the two ends of snp_range, if given) is sorted
    ascending; every pair of consecutive endpoints forms one bin.

    Parameters:
    segments - sequence whose elements are indexed as segment[0] = (start, stop) endpoints and
        segment[1] = the segment's value. With full_output=True, segment[1] is iterated
        (its elements are the samples of the segment).
    full_output - if True, also return the three cross-reference structures described below.
    samples - optional collection of all samples, used only with full_output=True. If None, it is
        computed by util.union_all() over the samples of all segments.
    snp_range - optional object indexed with the module constants START and STOP; its two ends are
        added to the set of endpoints.

    Returns:
    (sub_segments, intersections) where
        sub_segments - list of (endpoints[i], endpoints[i+1]) tuples.
        intersections - list whose ith element is the list of segment[1] values of all segments
            covering the ith bin, in the order the segments were given.
    If full_output=True, returns
    (sub_segments, intersections, value_to_segments, segment_to_local_values, dangling_local_values)
    where
        value_to_segments - per bin, util.to_set_dict() of the (sample, segment index) pairs.
        segment_to_local_values - util.to_set_dict() of (segment index, (bin index, sample)) pairs.
        dangling_local_values - the (bin index, sample) pairs, over all bins and all samples, that
            are not covered by any segment.'''
    non_unique_endpoints = [s[0] for s in segments]
    if snp_range is not None:
        non_unique_endpoints.append((snp_range[START], snp_range[STOP]))
    # Each entry is an iterable of endpoints; set().union() accepts any number of iterables
    # (including none), so no reduce() or seed set is required
    endpoints = sorted(set().union(*non_unique_endpoints))
    # This index makes it easier to random-access bins
    endpoint_index = dict((x, index) for (index, x) in enumerate(endpoints))
    sub_segments = list(zip(endpoints, endpoints[1:]))
    num_bins = len(sub_segments)

    intersections = [[] for _ in range(num_bins)]
    if full_output:
        value_segment_tuples = [[] for _ in range(num_bins)]
        segment_local_value_tuples = []

    # Sweep the segments in order, accumulate segment info into each bin they cover
    for k, segment in enumerate(segments):
        snp = segment[0]
        for i in range(endpoint_index[snp[0]], endpoint_index[snp[1]]):
            intersections[i].append(segment[1])
            if full_output:
                value_segment_tuples[i] += [(x, k) for x in segment[1]]
                segment_local_value_tuples += [(k, (i, x)) for x in segment[1]]

    if not full_output:
        return (sub_segments, intersections)

    segment_to_local_values = util.to_set_dict(segment_local_value_tuples)
    all_samples = samples if samples is not None \
        else util.union_all(x for s in segments for x in s[1])
    all_local_values = set(itertools.product(range(num_bins), all_samples))
    dangling_local_values = all_local_values - util.union_all(*segment_to_local_values.values())
    value_to_segments = [util.to_set_dict(d) for d in value_segment_tuples]
    # Note: the original also built a sample-to-segment-index dictionary here and discarded it;
    # that unused computation was removed
    return (sub_segments, intersections, value_to_segments, segment_to_local_values,
            dangling_local_values)
def _compute_block_frames(g, snp_dao):
    '''Generator over the connected components (SNP LD blocks) of the graph g. For each block,
    the block's SNPs are fetched from snp_dao, the block's subgraph is colored with
    greedy_coloring() given the SNP-name-to-base-pair map, and the resulting coloring is yielded
    after regrouping it by color with util.to_set_dict() (presumably a dictionary
    color -> set of nodes; confirm against util.to_set_dict).'''
    # This is how to turn on database logging to debug query slowness:
    # logging.basicConfig()
    # logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
    # logging.getLogger('sqlalchemy.pool.QueuePool').setLevel(logging.DEBUG)
    for component in nx.connected_components(g):
        # Base-pair position of every SNP in this component, keyed by SNP name
        bp_of_snp = {}
        for record in list(snp_dao.get_snps(component)):
            bp_of_snp[record.name] = record.bp

        coloring = greedy_coloring(g.subgraph(component), bp_of_snp)
        # Flip (node, color) into (color, node) so that nodes are grouped by their color
        yield util.to_set_dict((color, node) for (node, color) in coloring.iteritems())
def group_by_value(d):
    '''Invert the dictionary d: its (key, value) items are flipped into (value, key) pairs and
    handed to util.to_set_dict(), whose result is returned (presumably a dictionary mapping each
    d-value to the set of keys in d that have this value; confirm against util.to_set_dict).'''
    value_key_pairs = ((value, key) for key, value in d.iteritems())
    return util.to_set_dict(value_key_pairs)
def extract_genotypes(input_file,
                      var_file_prefix=os.environ['OBER_DATA'] + '/cgi/all.2012-09-20.testvar.chr',
                      index_file=os.environ['OBER_DATA'] + '/cgi/README.assembly_sample_subject.csv',
                      allele_swap=False, debug=False, custom_id=False):
    '''Extract SNP data from tabixed CGI var files and convert to a Genotype object.

    Base-pair ranges are read from input_file in the space-delimited format

        chr start_bp end_bp

    * Multiple ranges can be specified (one per line).
    * Use 23 for chrX, 24 for chrY. (BUT: chrs X, Y, M are not yet supported. TBA.)

    Parameters:
    input_file - iterable of lines (e.g., an open file or standard input) holding the requested
        ranges. It is passed directly to csv.reader; this function does not open file names.
    var_file_prefix - path prefix of the per-chromosome var files. The file read for a chromosome
        is <var_file_prefix><chromosome letter>.tsv.gz.
    index_file - passed to db_gene.cgi.ids.cgi_id_to_sample_id() to obtain the
        CGI-ID-to-sample-ID dictionary.
    allele_swap - if True, the allele labels of a SNP are swapped when the second value returned
        by im.gt.allele_frequencies() exceeds 0.7. Off by default.
    debug - passed as the verbose flag of util.run_command().
    custom_id - if True, the SNP name is always generated as chr<letter>_<begin>_<allele>
        instead of being taken from the name column of the var file.

    Returns:
    The object created by im.factory.GenotypeFactory.new_instance('genotype', ...) from the
    (#SNPs found x #samples x 2) genotype array, the SNP metadata record array and the sorted
    sample IDs. Its map attribute is set to the list of allele-letter strings of the SNPs.

    Raises:
    ValueError - if the sample IDs in the var file headers differ between chromosomes.'''
    # Read CGI-ID-to-our-sample-ID (FINDIV) dictionary
    cgi_id_to_sample_id = db_gene.cgi.ids.cgi_id_to_sample_id(index_file)

    # Read requested ranges; assuming numeric chromosome numbers (typically we only use autosomal)
    input_snps = [(int(line[0]), (int(line[1]), int(line[2])))
                  for line in csv.reader(input_file, skipinitialspace=True, delimiter=' ')]
    num_snps = len(input_snps)

    # Holds the sorted list sample IDs; assumed to be the same for all chromosomes;
    # if not, raise an exception in the chromosome loop below
    sample_id = None

    # Holds SNP metadata. The builtins float/object are what the removed aliases
    # np.float/np.object stood for, so the resulting dtypes are unchanged.
    snp = np.zeros((num_snps,),
                   dtype=[('chrom', np.uint8),       # Chromosome # containing the SNP
                          ('name', np.chararray),    # SNP name (e.g., 'rs...')
                          ('dist_cm', float),        # Genetic distance from beginning of chromosome
                          ('base_pair', np.uint)     # Base pair position on chromosome
                          ])
    # Genetic map (allele letters of each SNP)
    genetic_map = np.zeros((num_snps,), dtype=object)
    # Sample genotypes at all SNPs; allocated once the number of samples is known
    data = None

    # Bulk-extract data using tabix for each chromosome
    row = 0
    for chrom, bps in util.to_set_dict(input_snps).iteritems():
        chrom_letter = _to_letter[chrom]
        var_file = '%s%s.tsv.gz' % (var_file_prefix, chrom_letter)

        # Read header line from the compressed var file of this chromosome to get CGI IDs.
        # Only the first line is consumed; the file is closed even if the ID lookup fails.
        header_file = gzip.GzipFile(var_file)
        try:
            for line in csv.reader(header_file, skipinitialspace=True, delimiter='\t'):
                raw_id = [cgi_id_to_sample_id[cgi_id] for cgi_id in line[8:]]
                chr_sample_id, index = util.sort_with_index(np.array(raw_id))
                break
        finally:
            header_file.close()

        if sample_id is None:
            # Save global IDs
            sample_id = chr_sample_id
            num_samples = len(sample_id)
        elif not np.array_equal(chr_sample_id, sample_id):
            # Make sure the IDs are the same across all chromosomes. The IDs are formatted with
            # %s because they are not plain strings and cannot be concatenated to the message.
            raise ValueError('Sample IDs are not the same across all chromosomes:\n%s\n%s'
                             % (sample_id, chr_sample_id))

        # Read SNP data from the compressed chromosome archive using tabix; append to genotype array
        if data is None:
            data = np.zeros((num_snps, num_samples, 2), dtype=np.byte)
        _, output, _ = util.run_command(
            _TABIX_CMD + ' ' + var_file + ' '
            + ' '.join('chr%s:%d-%d' % (chrom_letter, x[0], x[1]) for x in bps),
            verbose=debug)

        # Apparently there's a tabix bug: some records might not be contained in the requested region.
        # Only process SNP lines that are within at least one of the requested regions
        relevant_lines = [
            a for a in csv.reader(StringIO.StringIO(output), skipinitialspace=True, delimiter='\t')
            if np.array([(x[0] <= int(a[2])) & (int(a[3]) <= x[1]) and a[4] == 'snp'
                         for x in bps]).any()]

        for line in relevant_lines:
            if len(snp) <= row:
                # A range may hold more than one SNP, so the arrays can run out of rows. Compute
                # the doubled size once so that all three arrays grow to the same length
                # (len(snp) changes as soon as snp is resized).
                new_size = max(1, 2 * len(snp))
                snp.resize(new_size)
                genetic_map.resize(new_size)
                data.resize((new_size, num_samples, 2), refcheck=False)

            # If SNP name is missing, create a unique identifier by concatenating the chromosome,
            # begin and minor allele
            if line[7] and not custom_id:
                snp_name = line[7]
            else:
                snp_name = 'chr%s_%d_%s' % (chrom_letter, int(line[2]), line[6])
            snp[row] = (chrom, snp_name, 0, int(line[3]))
            genetic_map[row] = line[5] + line[6]

            if chrom <= 22:
                # Autosomes - genotype data (two letters)
                alleles = [[CGI_LETTER_TO_ALLELE[item[0]], CGI_LETTER_TO_ALLELE[item[1]]]
                           for item in line[8:]]
            else:
                # X/Y chromosomes: if one letter is available => duplicate it to make the
                # individual homozygous (or 00 for a single missing value)
                alleles = [[CGI_LETTER_TO_ALLELE[item[0]],
                            CGI_LETTER_TO_ALLELE[item[1 if len(item) == 2 else 0]]]
                           for item in line[8:]]
            # reshape() still rejects lines whose genotype count differs from the header's;
            # index reorders the columns into sorted-sample-ID order
            data[row, :] = np.array(alleles).reshape(num_samples, 2)[index, :]

            # d is a view into data, so swapping alleles in d updates data in place
            d = data[row, :]
            _, f2 = im.gt.allele_frequencies(d)
            if allele_swap and f2 > 0.7:
                # Minor allele has a much larger frequency than the major allele, swap them
                print('chr%d:%d-%d (%s): Major-minor allele swap detected, fixing'
                      % (chrom, int(line[2]), int(line[3]), line[7]))
                im.gt.swap_alleles(d)
            row += 1

    # Not all SNPs might be found; restrict data arrays to the first row rows
    # NOTE(review): if input_file holds no ranges, data is still None here and the slice below
    # fails - confirm whether empty input needs to be supported
    snp = snp[0:row]
    genetic_map = genetic_map[0:row].tolist()
    data = data[0:row]
    print('Found %d SNP(s) in %d requested base-pair range(s)' % (row, num_snps))

    # Construct Genotype object
    g = im.factory.GenotypeFactory.new_instance('genotype', data, snp, sample_id=sample_id)
    g.map = genetic_map
    return g
def extract_genotypes(input_file,
                      var_file_prefix=os.environ['OBER_DATA'] + '/cgi/all.2012-09-20.testvar.chr',
                      index_file=os.environ['OBER_DATA'] + '/cgi/README.assembly_sample_subject.csv',
                      allele_swap=False, debug=False, custom_id=False):
    '''Extract SNP data from tabixed CGI var files and convert to a Genotype object.

    Base-pair ranges are read from input_file in the space-delimited format

        chr start_bp end_bp

    * Multiple ranges can be specified (one per line).
    * Use 23 for chrX, 24 for chrY. (BUT: chrs X, Y, M are not yet supported. TBA.)

    Parameters:
    input_file - iterable of lines (e.g., an open file or standard input) holding the requested
        ranges. It is passed directly to csv.reader; this function does not open file names.
    var_file_prefix - path prefix of the per-chromosome var files. The file read for a chromosome
        is <var_file_prefix><chromosome letter>.tsv.gz.
    index_file - passed to db_gene.cgi.ids.cgi_id_to_sample_id() to obtain the
        CGI-ID-to-sample-ID dictionary.
    allele_swap - if True, the allele labels of a SNP are swapped when the second value returned
        by im.gt.allele_frequencies() exceeds 0.7. Off by default.
    debug - passed as the verbose flag of util.run_command().
    custom_id - if True, the SNP name is always generated as chr<letter>_<begin>_<allele>
        instead of being taken from the name column of the var file.

    Returns:
    The object created by im.factory.GenotypeFactory.new_instance('genotype', ...) from the
    (#SNPs found x #samples x 2) genotype array, the SNP metadata record array and the sorted
    sample IDs. Its map attribute is set to the list of allele-letter strings of the SNPs.

    Raises:
    ValueError - if the sample IDs in the var file headers differ between chromosomes.'''
    # Read CGI-ID-to-our-sample-ID (FINDIV) dictionary
    cgi_id_to_sample_id = db_gene.cgi.ids.cgi_id_to_sample_id(index_file)

    # Read requested ranges; assuming numeric chromosome numbers (typically we only use autosomal)
    input_snps = [(int(line[0]), (int(line[1]), int(line[2])))
                  for line in csv.reader(input_file, skipinitialspace=True, delimiter=' ')]
    num_snps = len(input_snps)

    # Holds the sorted list sample IDs; assumed to be the same for all chromosomes;
    # if not, raise an exception in the chromosome loop below
    sample_id = None

    # Holds SNP metadata. The builtins float/object are what the removed aliases
    # np.float/np.object stood for, so the resulting dtypes are unchanged.
    snp = np.zeros((num_snps,),
                   dtype=[('chrom', np.uint8),       # Chromosome # containing the SNP
                          ('name', np.chararray),    # SNP name (e.g., 'rs...')
                          ('dist_cm', float),        # Genetic distance from beginning of chromosome
                          ('base_pair', np.uint)     # Base pair position on chromosome
                          ])
    # Genetic map (allele letters of each SNP)
    genetic_map = np.zeros((num_snps,), dtype=object)
    # Sample genotypes at all SNPs; allocated once the number of samples is known
    data = None

    # Bulk-extract data using tabix for each chromosome
    row = 0
    for chrom, bps in util.to_set_dict(input_snps).iteritems():
        chrom_letter = _to_letter[chrom]
        var_file = '%s%s.tsv.gz' % (var_file_prefix, chrom_letter)

        # Read header line from the compressed var file of this chromosome to get CGI IDs.
        # Only the first line is consumed; the file is closed even if the ID lookup fails.
        header_file = gzip.GzipFile(var_file)
        try:
            for line in csv.reader(header_file, skipinitialspace=True, delimiter='\t'):
                raw_id = [cgi_id_to_sample_id[cgi_id] for cgi_id in line[8:]]
                chr_sample_id, index = util.sort_with_index(np.array(raw_id))
                break
        finally:
            header_file.close()

        if sample_id is None:
            # Save global IDs
            sample_id = chr_sample_id
            num_samples = len(sample_id)
        elif not np.array_equal(chr_sample_id, sample_id):
            # Make sure the IDs are the same across all chromosomes. The IDs are formatted with
            # %s because they are not plain strings and cannot be concatenated to the message.
            raise ValueError('Sample IDs are not the same across all chromosomes:\n%s\n%s'
                             % (sample_id, chr_sample_id))

        # Read SNP data from the compressed chromosome archive using tabix; append to genotype array
        if data is None:
            data = np.zeros((num_snps, num_samples, 2), dtype=np.byte)
        _, output, _ = util.run_command(
            _TABIX_CMD + ' ' + var_file + ' '
            + ' '.join('chr%s:%d-%d' % (chrom_letter, x[0], x[1]) for x in bps),
            verbose=debug)

        # Apparently there's a tabix bug: some records might not be contained in the requested region.
        # Only process SNP lines that are within at least one of the requested regions
        relevant_lines = [
            a for a in csv.reader(StringIO.StringIO(output), skipinitialspace=True, delimiter='\t')
            if np.array([(x[0] <= int(a[2])) & (int(a[3]) <= x[1]) and a[4] == 'snp'
                         for x in bps]).any()]

        for line in relevant_lines:
            if len(snp) <= row:
                # A range may hold more than one SNP, so the arrays can run out of rows. Compute
                # the doubled size once so that all three arrays grow to the same length
                # (len(snp) changes as soon as snp is resized).
                new_size = max(1, 2 * len(snp))
                snp.resize(new_size)
                genetic_map.resize(new_size)
                data.resize((new_size, num_samples, 2), refcheck=False)

            # If SNP name is missing, create a unique identifier by concatenating the chromosome,
            # begin and minor allele
            if line[7] and not custom_id:
                snp_name = line[7]
            else:
                snp_name = 'chr%s_%d_%s' % (chrom_letter, int(line[2]), line[6])
            snp[row] = (chrom, snp_name, 0, int(line[3]))
            genetic_map[row] = line[5] + line[6]

            if chrom <= 22:
                # Autosomes - genotype data (two letters)
                alleles = [[CGI_LETTER_TO_ALLELE[item[0]], CGI_LETTER_TO_ALLELE[item[1]]]
                           for item in line[8:]]
            else:
                # X/Y chromosomes: if one letter is available => duplicate it to make the
                # individual homozygous (or 00 for a single missing value)
                alleles = [[CGI_LETTER_TO_ALLELE[item[0]],
                            CGI_LETTER_TO_ALLELE[item[1 if len(item) == 2 else 0]]]
                           for item in line[8:]]
            # reshape() still rejects lines whose genotype count differs from the header's;
            # index reorders the columns into sorted-sample-ID order
            data[row, :] = np.array(alleles).reshape(num_samples, 2)[index, :]

            # d is a view into data, so swapping alleles in d updates data in place
            d = data[row, :]
            _, f2 = im.gt.allele_frequencies(d)
            if allele_swap and f2 > 0.7:
                # Minor allele has a much larger frequency than the major allele, swap them
                print('chr%d:%d-%d (%s): Major-minor allele swap detected, fixing'
                      % (chrom, int(line[2]), int(line[3]), line[7]))
                im.gt.swap_alleles(d)
            row += 1

    # Not all SNPs might be found; restrict data arrays to the first row rows
    # NOTE(review): if input_file holds no ranges, data is still None here and the slice below
    # fails - confirm whether empty input needs to be supported
    snp = snp[0:row]
    genetic_map = genetic_map[0:row].tolist()
    data = data[0:row]
    print('Found %d SNP(s) in %d requested base-pair range(s)' % (row, num_snps))

    # Construct Genotype object
    g = im.factory.GenotypeFactory.new_instance('genotype', data, snp, sample_id=sample_id)
    g.map = genetic_map
    return g