help='Debug at sample %d (-1=no sample)') parser.add_option('-j', '--inject-ids', action='store_true' , dest='inject_ids', default=False, help='If true, the IDs read from id-file are interpreted as sample FINDIVs rather than' \ 'CGI GSM sequence identifiers') options, args = parser.parse_args(argv[1:]) if len(args) != (1 if options.out_id else 4): print usage sys.exit(1) if options.chrom < 1 or options.chrom > 22: print usage print('\nMust specify a chromosome number in 1..22.') sys.exit(util.EXIT_BAD_INPUT_ARGS) return args, options '''Read CGI sample IDs and convert to our sample IDs.''' __read_sample_ids = lambda cgi_id_file, cgi_id_to_sample_id: util.sort_with_index(np.array([cgi_id_to_sample_id[cgi_id] for cgi_id in __csv_reader(cgi_id_file).next()[8:]])) __read_sample_ids_inject_ids = lambda cgi_id_file, cgi_id_to_sample_id: util.sort_with_index(np.loadtxt(cgi_id_file, dtype=int)) '''Parse a CSV file name f.''' __csv_reader = lambda f: csv.reader(open(f, 'rb') if isinstance(f, str) else f, skipinitialspace=True, delimiter='\t') def __imputer(problem, segment_location, options): '''A factory method that selects that imputation algorithm implementation.''' algorithm = options.algorithm if algorithm == 'interval_tree': return _ImputerIntervalTree(problem, segment_location, options) elif algorithm == 'index': return _ImputerIndex(problem, segment_location, options) else: raise ValueError('Unsupported imputation algorithm ''%s''' % (algorithm,)) #################################################################################### class _Imputer(object): '''Base class for imputation implementations.'''
def extract_genotypes(input_file,
                      var_file_prefix=os.environ['OBER_DATA'] + '/cgi/all.2012-09-20.testvar.chr',
                      index_file=os.environ['OBER_DATA'] + '/cgi/README.assembly_sample_subject.csv',
                      allele_swap=False, debug=False, custom_id=False):
    '''Extract SNP data from tabixed CGI var files and convert to a Genotype object.

    Base-pair ranges are read from input_file, one range per line, in the
    space-delimited format

        chr start_bp end_bp

    * Multiple ranges can be specified (one per line); blank lines are skipped.
    * input_file is passed straight to csv.reader, so it must be an iterable of
      lines (e.g. an open file or sys.stdin), not a file name.
      NOTE(review): mapping a "-" argument to standard input is presumably done
      by the caller; this function does not do it.
    * Use 23 for chrX, 24 for chrY. (BUT: chrs X, Y, M are not yet supported. TBA.)

    Parameters:
        input_file      - iterable of range lines (see above).
        var_file_prefix - prefix of the per-chromosome var files; the file read
                          for a chromosome is '<prefix><chromosome letter>.tsv.gz'.
        index_file      - file passed to db_gene.cgi.ids.cgi_id_to_sample_id to
                          build the CGI-ID-to-sample-ID (FINDIV) dictionary.
        allele_swap     - if true, a SNP whose second allele frequency (second
                          value returned by im.gt.allele_frequencies) exceeds
                          0.7 has its allele labels swapped with
                          im.gt.swap_alleles. Off by default.
        debug           - forwarded to util.run_command as its verbose flag.
        custom_id       - if true, ignore the SNP name in the var file and
                          always synthesize 'chr<letter>_<begin>_<allele>'.

    Returns:
        The object built by im.factory.GenotypeFactory.new_instance('genotype',
        ...) from the SNPs found, with its map attribute set to the list of
        allele-letter strings (one per SNP).

    Raises:
        ValueError - if no ranges are read from input_file, if a var file has
                     no header line, or if the sample IDs differ between
                     chromosomes.
    '''
    # Read CGI-ID-to-our-sample-ID (FINDIV) dictionary
    cgi_id_to_sample_id = db_gene.cgi.ids.cgi_id_to_sample_id(index_file)

    # Read the requested ranges; assuming numeric chromosome numbers (typically
    # we only use autosomal). Blank lines yield empty rows and are skipped.
    input_snps = [(int(line[0]), (int(line[1]), int(line[2])))
                  for line in csv.reader(input_file, skipinitialspace=True, delimiter=' ')
                  if line]
    num_snps = len(input_snps)
    if not num_snps:
        # Without any range there is no chromosome to read sample IDs from, so
        # no genotype array can be built; fail with a clear message.
        raise ValueError('No base-pair ranges were read from input_file')

    # Holds the sorted list of sample IDs; assumed to be the same for all
    # chromosomes; if not, raise an exception in the chromosome loop below
    sample_id = None
    num_samples = 0

    # Holds SNP metadata
    snp = np.zeros((num_snps,),
                   dtype=[('chrom', np.uint8),  # Chromosome # containing the SNP
                          ('name', np.chararray),  # SNP name (e.g., 'rs...')
                          ('dist_cm', float),  # Genetic distance from beginning of chromosome
                          ('base_pair', np.uint)  # Base pair position on chromosome
                          ])

    # Genetic map (allele letters of each SNP)
    genetic_map = np.zeros((num_snps,), dtype=object)

    # Sample genotypes at all SNPs; allocated once the number of samples is known
    data = None

    # Bulk-extract data using tabix for each chromosome
    row = 0
    for chrom, bps in util.to_set_dict(input_snps).iteritems():
        chrom_letter = _to_letter[chrom]
        var_file = '%s%s.tsv.gz' % (var_file_prefix, chrom_letter)

        # Read only the header line of the compressed var file of this
        # chromosome to get the CGI IDs, and always release the file handle.
        header_file = gzip.GzipFile(var_file)
        try:
            header = next(csv.reader(header_file, skipinitialspace=True, delimiter='\t'), None)
        finally:
            header_file.close()
        if header is None:
            # Otherwise the IDs of the previous chromosome would be silently reused
            raise ValueError('Var file %s is empty; cannot read CGI sample IDs' % (var_file,))
        raw_id = [cgi_id_to_sample_id[cgi_id] for cgi_id in header[8:]]
        chr_sample_id, index = util.sort_with_index(np.array(raw_id))

        if sample_id is None:
            # Save global IDs
            sample_id = chr_sample_id
            num_samples = len(sample_id)
        elif not np.array_equal(chr_sample_id, sample_id):
            # Make sure the IDs are the same across all chromosomes. The IDs are
            # arrays, so they are formatted rather than concatenated to the message.
            raise ValueError('Sample IDs are not the same across all chromosomes:\n%s\n%s'
                             % (sample_id, chr_sample_id))

        # Read SNP data from the compressed chromosome archive using tabix;
        # append to genotype array
        if data is None:
            data = np.zeros((num_snps, num_samples, 2), dtype=np.byte)
        regions = ' '.join('chr%s:%d-%d' % (chrom_letter, x[0], x[1]) for x in bps)
        _, output, _ = util.run_command(_TABIX_CMD + ' ' + var_file + ' ' + regions,
                                        verbose=debug)

        # Columns as used below: 2 = begin, 3 = end, 4 = variant type,
        # 5-6 = allele letters, 7 = SNP name, 8+ = per-sample calls.
        # NOTE(review): confirm against the CGI var file specification.
        for line in csv.reader(StringIO.StringIO(output), skipinitialspace=True, delimiter='\t'):
            if not line or line[4] != 'snp':
                continue
            begin, end = int(line[2]), int(line[3])
            # Apparently there's a tabix bug: some records might not be
            # contained in the requested region. Only process records that lie
            # within one of the requested ranges.
            if not any(x[0] <= begin and end <= x[1] for x in bps):
                continue

            if len(snp) <= row:
                # Grow all three arrays to the same new size. The size is
                # computed once because snp.resize() changes len(snp) in place.
                new_size = 2 * len(snp)
                snp.resize(new_size)
                genetic_map.resize(new_size)
                # refcheck=False: views of data (d below) may still be alive
                data.resize((new_size, num_samples, 2), refcheck=False)

            # If SNP name is missing, create a unique identifier by
            # concatenating the chromosome, begin and minor allele
            if line[7] and not custom_id:
                name = line[7]
            else:
                name = 'chr%s_%d_%s' % (chrom_letter, begin, line[6])
            snp[row] = (chrom, name, 0, end)
            genetic_map[row] = line[5] + line[6]

            calls = line[8:]
            if chrom <= 22:
                # Autosomes - genotype data (two letters)
                letters = [(item[0], item[1]) for item in calls]
            else:
                # X/Y chromosomes: if one letter is available => duplicate it to
                # make the individual homozygous (or 00 for a single missing value)
                letters = [(item[0], item[1 if len(item) == 2 else 0]) for item in calls]
            alleles = [[CGI_LETTER_TO_ALLELE[first], CGI_LETTER_TO_ALLELE[second]]
                       for first, second in letters]
            # reshape() also validates that there is exactly one call per sample;
            # index reorders the columns into sorted-sample-ID order.
            data[row, :] = np.array(alleles).reshape(num_samples, 2)[index, :]

            d = data[row, :]
            _, f2 = im.gt.allele_frequencies(d)
            if allele_swap and f2 > 0.7:
                # Minor allele has a much larger frequency than the major
                # allele, swap them
                print('chr%d:%d-%d (%s): Major-minor allele swap detected, fixing'
                      % (chrom, begin, end, line[7]))
                im.gt.swap_alleles(d)
            row += 1

    # Not all SNPs might be found; restrict data arrays to the first row rows
    snp = snp[0:row]
    genetic_map = genetic_map[0:row].tolist()
    data = data[0:row]
    print('Found %d SNP(s) in %d requested base-pair range(s)' % (row, num_snps))

    # Construct Genotype object; write in desired output format
    g = im.factory.GenotypeFactory.new_instance('genotype', data, snp, sample_id=sample_id)
    g.map = genetic_map
    return g
'CGI GSM sequence identifiers') options, args = parser.parse_args(argv[1:]) if len(args) != (1 if options.out_id else 4): print usage sys.exit(1) if options.chrom < 1 or options.chrom > 22: print usage print('\nMust specify a chromosome number in 1..22.') sys.exit(util.EXIT_BAD_INPUT_ARGS) return args, options '''Read CGI sample IDs and convert to our sample IDs.''' __read_sample_ids = lambda cgi_id_file, cgi_id_to_sample_id: util.sort_with_index( np.array([ cgi_id_to_sample_id[cgi_id] for cgi_id in __csv_reader(cgi_id_file).next()[8:] ])) __read_sample_ids_inject_ids = lambda cgi_id_file, cgi_id_to_sample_id: util.sort_with_index( np.loadtxt(cgi_id_file, dtype=int)) '''Parse a CSV file name f.''' __csv_reader = lambda f: csv.reader(open(f, 'rb') if isinstance(f, str) else f, skipinitialspace=True, delimiter='\t') def __imputer(problem, segment_location, options): '''A factory method that selects that imputation algorithm implementation.''' algorithm = options.algorithm if algorithm == 'interval_tree': return _ImputerIntervalTree(problem, segment_location, options)
def extract_genotypes(input_file,
                      var_file_prefix=os.environ['OBER_DATA'] + '/cgi/all.2012-09-20.testvar.chr',
                      index_file=os.environ['OBER_DATA'] + '/cgi/README.assembly_sample_subject.csv',
                      allele_swap=False, debug=False, custom_id=False):
    '''Extract SNP data from tabixed CGI var files and convert to a Genotype object.

    Base-pair ranges are read from input_file, one range per line, in the
    space-delimited format

        chr start_bp end_bp

    * Multiple ranges can be specified (one per line); blank lines are skipped.
    * input_file is passed straight to csv.reader, so it must be an iterable of
      lines (e.g. an open file or sys.stdin), not a file name.
      NOTE(review): mapping a "-" argument to standard input is presumably done
      by the caller; this function does not do it.
    * Use 23 for chrX, 24 for chrY. (BUT: chrs X, Y, M are not yet supported. TBA.)

    Parameters:
        input_file      - iterable of range lines (see above).
        var_file_prefix - prefix of the per-chromosome var files; the file read
                          for a chromosome is '<prefix><chromosome letter>.tsv.gz'.
        index_file      - file passed to db_gene.cgi.ids.cgi_id_to_sample_id to
                          build the CGI-ID-to-sample-ID (FINDIV) dictionary.
        allele_swap     - if true, a SNP whose second allele frequency (second
                          value returned by im.gt.allele_frequencies) exceeds
                          0.7 has its allele labels swapped with
                          im.gt.swap_alleles. Off by default.
        debug           - forwarded to util.run_command as its verbose flag.
        custom_id       - if true, ignore the SNP name in the var file and
                          always synthesize 'chr<letter>_<begin>_<allele>'.

    Returns:
        The object built by im.factory.GenotypeFactory.new_instance('genotype',
        ...) from the SNPs found, with its map attribute set to the list of
        allele-letter strings (one per SNP).

    Raises:
        ValueError - if no ranges are read from input_file, if a var file has
                     no header line, or if the sample IDs differ between
                     chromosomes.
    '''
    # Read CGI-ID-to-our-sample-ID (FINDIV) dictionary
    cgi_id_to_sample_id = db_gene.cgi.ids.cgi_id_to_sample_id(index_file)

    # Read the requested ranges; assuming numeric chromosome numbers (typically
    # we only use autosomal). Blank lines yield empty rows and are skipped.
    input_snps = [(int(line[0]), (int(line[1]), int(line[2])))
                  for line in csv.reader(input_file, skipinitialspace=True, delimiter=' ')
                  if line]
    num_snps = len(input_snps)
    if not num_snps:
        # Without any range there is no chromosome to read sample IDs from, so
        # no genotype array can be built; fail with a clear message.
        raise ValueError('No base-pair ranges were read from input_file')

    # Holds the sorted list of sample IDs; assumed to be the same for all
    # chromosomes; if not, raise an exception in the chromosome loop below
    sample_id = None
    num_samples = 0

    # Holds SNP metadata
    snp = np.zeros((num_snps,),
                   dtype=[('chrom', np.uint8),  # Chromosome # containing the SNP
                          ('name', np.chararray),  # SNP name (e.g., 'rs...')
                          ('dist_cm', float),  # Genetic distance from beginning of chromosome
                          ('base_pair', np.uint)  # Base pair position on chromosome
                          ])

    # Genetic map (allele letters of each SNP)
    genetic_map = np.zeros((num_snps,), dtype=object)

    # Sample genotypes at all SNPs; allocated once the number of samples is known
    data = None

    # Bulk-extract data using tabix for each chromosome
    row = 0
    for chrom, bps in util.to_set_dict(input_snps).iteritems():
        chrom_letter = _to_letter[chrom]
        var_file = '%s%s.tsv.gz' % (var_file_prefix, chrom_letter)

        # Read only the header line of the compressed var file of this
        # chromosome to get the CGI IDs, and always release the file handle.
        header_file = gzip.GzipFile(var_file)
        try:
            header = next(csv.reader(header_file, skipinitialspace=True, delimiter='\t'), None)
        finally:
            header_file.close()
        if header is None:
            # Otherwise the IDs of the previous chromosome would be silently reused
            raise ValueError('Var file %s is empty; cannot read CGI sample IDs' % (var_file,))
        raw_id = [cgi_id_to_sample_id[cgi_id] for cgi_id in header[8:]]
        chr_sample_id, index = util.sort_with_index(np.array(raw_id))

        if sample_id is None:
            # Save global IDs
            sample_id = chr_sample_id
            num_samples = len(sample_id)
        elif not np.array_equal(chr_sample_id, sample_id):
            # Make sure the IDs are the same across all chromosomes. The IDs are
            # arrays, so they are formatted rather than concatenated to the message.
            raise ValueError('Sample IDs are not the same across all chromosomes:\n%s\n%s'
                             % (sample_id, chr_sample_id))

        # Read SNP data from the compressed chromosome archive using tabix;
        # append to genotype array
        if data is None:
            data = np.zeros((num_snps, num_samples, 2), dtype=np.byte)
        regions = ' '.join('chr%s:%d-%d' % (chrom_letter, x[0], x[1]) for x in bps)
        _, output, _ = util.run_command(_TABIX_CMD + ' ' + var_file + ' ' + regions,
                                        verbose=debug)

        # Columns as used below: 2 = begin, 3 = end, 4 = variant type,
        # 5-6 = allele letters, 7 = SNP name, 8+ = per-sample calls.
        # NOTE(review): confirm against the CGI var file specification.
        for line in csv.reader(StringIO.StringIO(output), skipinitialspace=True, delimiter='\t'):
            if not line or line[4] != 'snp':
                continue
            begin, end = int(line[2]), int(line[3])
            # Apparently there's a tabix bug: some records might not be
            # contained in the requested region. Only process records that lie
            # within one of the requested ranges.
            if not any(x[0] <= begin and end <= x[1] for x in bps):
                continue

            if len(snp) <= row:
                # Grow all three arrays to the same new size. The size is
                # computed once because snp.resize() changes len(snp) in place.
                new_size = 2 * len(snp)
                snp.resize(new_size)
                genetic_map.resize(new_size)
                # refcheck=False: views of data (d below) may still be alive
                data.resize((new_size, num_samples, 2), refcheck=False)

            # If SNP name is missing, create a unique identifier by
            # concatenating the chromosome, begin and minor allele
            if line[7] and not custom_id:
                name = line[7]
            else:
                name = 'chr%s_%d_%s' % (chrom_letter, begin, line[6])
            snp[row] = (chrom, name, 0, end)
            genetic_map[row] = line[5] + line[6]

            calls = line[8:]
            if chrom <= 22:
                # Autosomes - genotype data (two letters)
                letters = [(item[0], item[1]) for item in calls]
            else:
                # X/Y chromosomes: if one letter is available => duplicate it to
                # make the individual homozygous (or 00 for a single missing value)
                letters = [(item[0], item[1 if len(item) == 2 else 0]) for item in calls]
            alleles = [[CGI_LETTER_TO_ALLELE[first], CGI_LETTER_TO_ALLELE[second]]
                       for first, second in letters]
            # reshape() also validates that there is exactly one call per sample;
            # index reorders the columns into sorted-sample-ID order.
            data[row, :] = np.array(alleles).reshape(num_samples, 2)[index, :]

            d = data[row, :]
            _, f2 = im.gt.allele_frequencies(d)
            if allele_swap and f2 > 0.7:
                # Minor allele has a much larger frequency than the major
                # allele, swap them
                print('chr%d:%d-%d (%s): Major-minor allele swap detected, fixing'
                      % (chrom, begin, end, line[7]))
                im.gt.swap_alleles(d)
            row += 1

    # Not all SNPs might be found; restrict data arrays to the first row rows
    snp = snp[0:row]
    genetic_map = genetic_map[0:row].tolist()
    data = data[0:row]
    print('Found %d SNP(s) in %d requested base-pair range(s)' % (row, num_snps))

    # Construct Genotype object; write in desired output format
    g = im.factory.GenotypeFactory.new_instance('genotype', data, snp, sample_id=sample_id)
    g.map = genetic_map
    return g