def main():
    """Print the Hamming distance between two samples of a .bed/.pgen file.

    Command line (read from ``sys.argv``)::

        hamming_distance.py [.bed/.pgen] [sample index 1] [sample index 2]
                            {raw_sample_ct=[val]}

    For every variant the two selected samples' genotype values are read
    and the absolute difference is added to the total; a variant is skipped
    when either value is -9 (presumably the missing-genotype code).  The
    result is written to stdout as ``Hamming distance: <n>``.

    Returns None in all cases.  A usage or error message is printed (and
    nothing is read) when too few arguments are given or the fourth
    argument does not start with ``raw_sample_ct=``.

    Raises:
        ValueError: if a sample index or the raw_sample_ct value is not an
            integer literal.
    """
    arg_ct = len(sys.argv)
    if arg_ct < 4:
        print("Usage: python hamming_distance.py [.bed/.pgen] [sample index 1] [sample index 2] {raw_sample_ct=[val]}")
        print("* raw_sample_ct is required for .bed files.")
        return

    sample_ct = 2
    sample_subset = np.empty(sample_ct, np.uint32)
    sample_subset[0] = int(sys.argv[2])
    sample_subset[1] = int(sys.argv[3])

    specified_sample_ct = None
    if arg_ct > 4:
        prefix = "raw_sample_ct="
        if not sys.argv[4].startswith(prefix):
            print("Error: Invalid raw_sample_ct parameter.")
            return
        specified_sample_ct = int(sys.argv[4][len(prefix):])

    # The filename is passed as bytes, matching how the other scripts in
    # this collection call PgenReader under Python 3.
    with pgenlib.PgenReader(bytes(sys.argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as pf:
        variant_ct = pf.get_variant_ct()
        hamming_distance = 0
        buf = np.empty(sample_ct, np.int8)
        for vidx in range(variant_ct):
            pf.read(vidx, buf)
            # Convert to Python ints before doing arithmetic: accumulating
            # np.int8 scalars can keep the running total in int8, which
            # overflows once the distance exceeds 127.
            geno0 = int(buf[0])
            geno1 = int(buf[1])
            if geno0 != -9 and geno1 != -9:
                hamming_distance += abs(geno0 - geno1)
        sys.stdout.write("Hamming distance: " + str(hamming_distance) + '\n')
def main():
    """Copy a .bed/.pgen file, optionally restricted to some samples, to a new .pgen.

    Command line (read from ``sys.argv``)::

        pgen_subset_and_compress.py <input .bed/.pgen> <output .pgen name>
                                    [raw_sample_ct=<val>] [sample idx(s)...]

    With fewer than two positional arguments the usage text is printed and
    nothing else happens.  Otherwise every variant of the input is read and
    appended to the output: as plain genotypes when the reader reports no
    hardcall phase, or as allele codes plus phase-present flags when it
    does.

    Returns None.

    Raises:
        ValueError: if the raw_sample_ct value or a sample index is not an
            integer literal.
    """
    argv = sys.argv
    n_args = len(argv)
    if n_args < 3:
        print(
            "Usage: python3 pgen_subset_and_compress.py <input .bed/.pgen> <output .pgen name> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        print("* sample indexes must be in increasing order.")
        print("* This currently assumes that A2 alleles are always reference.")
        return

    specified_sample_ct = None
    sample_subset = None
    sample_ct = None

    # Optional arguments begin at argv[3]; a leading raw_sample_ct=<val>
    # token pushes the start of the sample-index list to argv[4].
    first_index_pos = 3
    if n_args > 3 and argv[3].startswith("raw_sample_ct="):
        specified_sample_ct = int(argv[3][14:])
        first_index_pos = 4

    index_tokens = argv[first_index_pos:]
    if index_tokens:
        sample_ct = len(index_tokens)
        sample_subset = np.empty(sample_ct, np.uint32)
        for slot, token in enumerate(index_tokens):
            sample_subset[slot] = int(token)

    with pgenlib.PgenReader(bytes(argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as reader:
        total_sample_ct = reader.get_raw_sample_ct()
        if sample_ct is None:
            # No explicit subset: every sample in the input is copied.
            sample_ct = total_sample_ct
        n_variants = reader.get_variant_ct()
        phased = reader.hardcall_phase_present()

        # Only the buffers needed for the chosen read path are allocated.
        genotypes = None
        allele_codes = None
        phase_flags = None
        if phased:
            allele_codes = np.empty(sample_ct * 2, np.int32)
            phase_flags = np.empty(sample_ct, np.bool_)
        else:
            genotypes = np.empty(sample_ct, np.int8)

        with pgenlib.PgenWriter(bytes(argv[2], 'utf8'),
                                sample_ct,
                                n_variants,
                                False,
                                hardcall_phase_present=phased) as writer:
            if phased:
                for variant_idx in range(n_variants):
                    reader.read_alleles_and_phasepresent(
                        variant_idx, allele_codes, phase_flags)
                    writer.append_partially_phased(allele_codes, phase_flags)
            else:
                for variant_idx in range(n_variants):
                    reader.read(variant_idx, genotypes)
                    writer.append_biallelic(genotypes)
    return
def read_alleles_block(pgen_f, bins_df, block_id):
    """Read the allele codes of one LD block with PgenReader.read_alleles_range.

    Args:
        pgen_f: file argument handed unchanged to ``pg.PgenReader``.
        bins_df: object whose ``bimIdStart`` and ``bimIdEnd`` members are
            indexed by ``block_id`` to obtain the variant range
            (presumably a pandas DataFrame -- confirm against the caller).
        block_id: key of the block to read.

    Returns:
        A freshly allocated int32 array with ``bimIdEnd - bimIdStart`` rows
        and ``2 * raw_sample_ct`` columns, filled by ``read_alleles_range``.
    """
    start_idx = bins_df.bimIdStart[block_id]
    end_idx = bins_df.bimIdEnd[block_id]
    n_variants = end_idx - start_idx

    with pg.PgenReader(pgen_f) as reader:
        # Two allele codes are stored per sample.
        n_columns = reader.get_raw_sample_ct() * 2
        alleles = np.zeros((n_variants, n_columns), dtype=np.int32)
        reader.read_alleles_range(start_idx, end_idx, alleles)

    return alleles
def main():
    """Print the allele calls of a single variant, first subsetted, then for all samples.

    Command line (read from ``sys.argv``)::

        single_variant_test.py <.bed/.pgen> <variant idx>
                               [raw_sample_ct=<val>] [sample idx(s)...]

    With fewer than two positional arguments the usage text is printed and
    nothing else happens.  Otherwise two lines are written to stdout.  Each
    holds one ``a|b`` (phase-present flag set) or ``a/b`` (flag clear)
    entry per sample, every entry followed by a space.  The first line
    covers the requested samples; the second is produced after calling
    ``change_sample_subset()`` with no arguments and covers
    ``raw_sample_ct`` samples.

    Returns None.

    Raises:
        ValueError: if the variant index, the raw_sample_ct value or a
            sample index is not an integer literal.
    """
    argv = sys.argv
    n_args = len(argv)
    if n_args < 3:
        print(
            "Usage: python single_variant_test.py <.bed/.pgen> <variant idx> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        print("* sample indexes must be in increasing order.")
        return

    specified_sample_ct = None
    sample_subset = None
    sample_ct = None

    # Optional arguments begin at argv[3]; a leading raw_sample_ct=<val>
    # token pushes the start of the sample-index list to argv[4].
    first_index_pos = 3
    if n_args > 3 and argv[3].startswith("raw_sample_ct="):
        specified_sample_ct = int(argv[3][14:])
        first_index_pos = 4

    index_tokens = argv[first_index_pos:]
    if index_tokens:
        sample_ct = len(index_tokens)
        sample_subset = np.empty(sample_ct, np.uint32)
        for slot, token in enumerate(index_tokens):
            sample_subset[slot] = int(token)

    variant_idx = int(argv[2])
    with pgenlib.PgenReader(bytes(argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as reader:
        total_sample_ct = reader.get_raw_sample_ct()
        if sample_ct is None:
            sample_ct = total_sample_ct

        # Buffers are sized for the full sample set so they can be reused
        # for both the subsetted and the unsubsetted read.
        allele_codes = np.empty(total_sample_ct * 2, np.int32)
        phase_flags = np.empty(total_sample_ct, np.bool_)

        def write_calls(n_samples):
            # Emit the first n_samples entries of the buffers as one line.
            entries = []
            for s in range(n_samples):
                separator = '|' if phase_flags[s] else '/'
                entries.append(
                    str(allele_codes[2 * s]) + separator +
                    str(allele_codes[2 * s + 1]) + ' ')
            sys.stdout.write(''.join(entries) + '\n')

        reader.read_alleles_and_phasepresent(variant_idx, allele_codes,
                                             phase_flags)
        write_calls(sample_ct)

        reader.change_sample_subset()
        reader.read_alleles_and_phasepresent(variant_idx, allele_codes,
                                             phase_flags)
        write_calls(total_sample_ct)
def main():
    """Print, per sample, the number of variants whose genotype value is 1.

    Command line (read from ``sys.argv``)::

        het_count.py <.bed/.pgen> [raw_sample_ct=<val>] [sample idx(s)...]

    With no positional argument the usage text is printed and nothing else
    happens.  Otherwise every variant is read for the selected samples (all
    samples when no index is given) and one line is written to stdout
    holding each sample's count followed by a space.

    Returns None.

    Raises:
        ValueError: if the raw_sample_ct value or a sample index is not an
            integer literal.
    """
    arg_ct = len(sys.argv)
    if arg_ct < 2:
        print(
            "Usage: python3 het_count.py <.bed/.pgen> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        print("* sample indexes must be in increasing order.")
        print(
            "* The Python side of this is really slow if you don't explicitly specify sample indexes."
        )
        return

    sample_subset = None
    specified_sample_ct = None
    sample_ct = None
    if arg_ct > 2:
        # A leading raw_sample_ct=<val> token shifts the start of the
        # sample-index list from argv[2] to argv[3].
        offset = 2
        if sys.argv[2].startswith("raw_sample_ct="):
            specified_sample_ct = int(sys.argv[2][14:])
            offset = 3
        if arg_ct > offset:
            sample_ct = arg_ct - offset
            sample_subset = np.empty(sample_ct, np.uint32)
            for idx, token in enumerate(sys.argv[offset:]):
                sample_subset[idx] = int(token)

    with pgenlib.PgenReader(bytes(sys.argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as pf:
        raw_sample_ct = pf.get_raw_sample_ct()
        if sample_ct is None:
            sample_ct = raw_sample_ct
        variant_ct = pf.get_variant_ct()
        tot_hets = np.zeros((sample_ct, ), dtype=np.uint32)
        buf = np.empty(sample_ct, np.int8)
        for vidx in range(variant_ct):
            pf.read(vidx, buf)
            # Vectorized update: the boolean mask adds 1 to every sample
            # whose genotype value is 1, replacing a per-sample Python loop.
            tot_hets += (buf == 1)

        # Same output as writing each count and a trailing space one by one.
        sys.stdout.write(
            ''.join(str(count) + ' ' for count in tot_hets.tolist()) + '\n')
def main():
    """Write the allele codes of a .bed/.pgen file as a text file of ``a|b`` pairs.

    Command line (read from ``sys.argv``)::

        extract_haps.py <.bed/.pgen> <output filename>
                        [raw_sample_ct=<val>] [sample idx(s)...]

    With fewer than two positional arguments the usage text is printed and
    nothing else happens.  Otherwise all variants are read in one call for
    the selected samples (all samples when no index is given) and the
    output file receives one line per variant, holding one
    ``<allele 1>|<allele 2>`` entry per sample separated by single spaces.

    Returns None.

    Raises:
        ValueError: if the raw_sample_ct value or a sample index is not an
            integer literal.
    """
    arg_ct = len(sys.argv)
    if arg_ct < 3:
        print(
            "Usage: python3 extract_haps.py <.bed/.pgen> <output filename> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        return

    sample_subset = None
    specified_sample_ct = None
    sample_ct = None
    if arg_ct > 3:
        offset = 3
        if sys.argv[3].startswith("raw_sample_ct="):
            specified_sample_ct = int(sys.argv[3][14:])
            # Skip the consumed raw_sample_ct token; without this it would
            # be parsed again as a sample index and int() would fail.
            offset = 4
        if arg_ct > offset:
            sample_ct = arg_ct - offset
            sample_subset = np.empty(sample_ct, np.uint32)
            for idx, token in enumerate(sys.argv[offset:]):
                sample_subset[idx] = int(token)

    with pgenlib.PgenReader(bytes(sys.argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as infile:
        raw_sample_ct = infile.get_raw_sample_ct()
        if sample_ct is None:
            sample_ct = raw_sample_ct
        variant_ct = infile.get_variant_ct()

        # One row per allele (two per sample), one column per variant; the
        # trailing 1 is passed through to read_alleles_range unchanged.
        allele_code_buf = np.empty([sample_ct * 2, variant_ct],
                                   dtype=np.int32)
        infile.read_alleles_range(0, variant_ct, allele_code_buf, 1)

        with open(sys.argv[2], 'w') as outfile:
            for vidx in range(variant_ct):
                codes = allele_code_buf[:, vidx].tolist()
                # Even positions hold each sample's first allele, odd
                # positions the second.
                outfile.write(' '.join(
                    str(first) + "|" + str(second)
                    for first, second in zip(codes[0::2], codes[1::2])))
                outfile.write('\n')
vec_rr = 1.0 / np.sqrt(vec_rr) mat_covar = np.transpose(np.transpose(mat_covar) * vec_rr) # qr decomposition of covariate matrix Q = linalg.qr(mat_covar, mode="economic")[0] del mat_covar print("Completed QR decomposition of covariate matrix.") ''' Read .pvar file. Read pgen/bed file and do single-variant assoc tests. Write stats to output file on the fly. ''' print("Starting assoc tests...") print("Writing stats into", args.out) outfp = open(args.out, "w") pf = pgenlib.PgenReader(pgen.encode(), raw_sample_ct=raw_sample_ct, sample_subset=sample_subset) with open(pvar) as fp: var_idx = -1 int8vec_x = np.empty(sample_ct, dtype=np.int8) vec_x = np.empty(sample_ct, dtype=np.float32) for line in fp: if line[0] == '#': continue if line.strip() is not None: var_idx += 1 if var_idx > args.end: break if var_idx % 1000 == 0: print(" Scanning variant", var_idx) if var_idx >= args.begin:
def load(self, chromosome):
    """Open a PgenReader on ``self.filename`` and store it for *chromosome*.

    The reader is saved in ``self.pgen`` under the key returned by
    ``self.uniq_key(chromosome)``.  The same ``self.filename`` is opened
    whatever chromosome is requested.  Returns None.
    """
    reader = pg.PgenReader(self.filename)
    readers = self.pgen
    key = self.uniq_key(chromosome)
    readers[key] = reader