Пример #1
0
def main():
    """Print the Hamming distance between two samples of a .bed/.pgen file.

    Command line::

        hamming_distance.py <.bed/.pgen> <sample idx 1> <sample idx 2> [raw_sample_ct=<val>]

    For every variant the two samples' genotype codes are read and the
    absolute difference of the codes is added to the total.  Variants where
    either code is -9 are skipped.  The result is written to stdout as
    ``Hamming distance: <n>``.

    Returns None.  When too few arguments are given, or the optional fourth
    argument is not of the form ``raw_sample_ct=<val>``, a message is printed
    and the function returns without touching the input file.
    """
    raw_ct_prefix = "raw_sample_ct="
    arg_ct = len(sys.argv)
    if arg_ct < 4:
        print("Usage: python hamming_distance.py [.bed/.pgen] [sample index 1] [sample index 2] {raw_sample_ct=[val]}")
        print("* raw_sample_ct is required for .bed files.")
        return
    specified_sample_ct = None
    sample_ct = 2
    sample_subset = np.empty(sample_ct, np.uint32)
    sample_subset[0] = int(sys.argv[2])
    sample_subset[1] = int(sys.argv[3])
    if arg_ct > 4:
        if not sys.argv[4].startswith(raw_ct_prefix):
            print("Error: Invalid raw_sample_ct parameter.")
            return
        specified_sample_ct = int(sys.argv[4][len(raw_ct_prefix):])
    # The filename is handed over as bytes, matching the other Python 3
    # pgenlib example scripts.
    with pgenlib.PgenReader(bytes(sys.argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as pf:
        variant_ct = pf.get_variant_ct()
        hamming_distance = 0
        buf = np.empty(sample_ct, np.int8)
        for vidx in range(variant_ct):
            pf.read(vidx, buf)
            # Convert to Python ints before doing arithmetic so the running
            # total can never be narrowed to (and overflow) an 8-bit scalar.
            geno0 = int(buf[0])
            geno1 = int(buf[1])
            # -9 is treated as "no call"; such variants do not contribute.
            if geno0 != -9 and geno1 != -9:
                hamming_distance += abs(geno0 - geno1)
        sys.stdout.write("Hamming distance: " + str(hamming_distance))
        sys.stdout.write('\n')
Пример #2
0
def main():
    """Copy a .bed/.pgen file, optionally sample-subsetted, into a new .pgen.

    Command line::

        pgen_subset_and_compress.py <input> <output> [raw_sample_ct=<val>] [sample idx(s)...]

    With fewer than two file arguments the usage text is printed and nothing
    else happens.  Otherwise every variant is read from the input reader and
    appended to the output writer: through the plain genotype path when the
    input reports no hardcall phase information, and through the
    alleles-plus-phase path when it does.  Returns None.
    """
    argv = sys.argv
    argc = len(argv)
    if argc < 3:
        print(
            "Usage: python3 pgen_subset_and_compress.py <input .bed/.pgen> <output .pgen name> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        print("* sample indexes must be in increasing order.")
        print("* This currently assumes that A2 alleles are always reference.")
        return

    sample_subset = None
    specified_sample_ct = None
    sample_ct = None
    if argc > 3:
        # Sample indexes start at position 3, or at 4 when the optional
        # raw_sample_ct=<val> argument occupies position 3.
        first_idx_pos = 3
        if argv[3].startswith("raw_sample_ct="):
            specified_sample_ct = int(argv[3][14:])
            first_idx_pos = 4
        if argc > first_idx_pos:
            sample_ct = argc - first_idx_pos
            sample_subset = np.empty(sample_ct, np.uint32)
            for slot, token in enumerate(argv[first_idx_pos:]):
                sample_subset[slot] = int(token)

    with pgenlib.PgenReader(bytes(argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as infile:
        raw_sample_ct = infile.get_raw_sample_ct()
        # No explicit subset means every sample in the file is copied.
        if sample_ct is None:
            sample_ct = raw_sample_ct
        variant_ct = infile.get_variant_ct()
        has_phase = infile.hardcall_phase_present()

        # Only the buffers needed by the chosen copy path are allocated.
        geno_buf = None
        allele_code_buf = None
        phasepresent_buf = None
        if has_phase:
            allele_code_buf = np.empty(sample_ct * 2, np.int32)
            phasepresent_buf = np.empty(sample_ct, np.bool_)
        else:
            geno_buf = np.empty(sample_ct, np.int8)

        with pgenlib.PgenWriter(
                bytes(argv[2], 'utf8'),
                sample_ct,
                variant_ct,
                False,
                hardcall_phase_present=has_phase) as outfile:
            if has_phase:
                for variant_idx in range(variant_ct):
                    infile.read_alleles_and_phasepresent(
                        variant_idx, allele_code_buf, phasepresent_buf)
                    outfile.append_partially_phased(allele_code_buf,
                                                    phasepresent_buf)
            else:
                for variant_idx in range(variant_ct):
                    infile.read(variant_idx, geno_buf)
                    outfile.append_biallelic(geno_buf)
Пример #3
0
def read_alleles_block(pgen_f, bins_df, block_id):
    """Read the allele codes of all variants belonging to one LD block.

    Thin wrapper around ``pgenlib.PgenReader.read_alleles_range``.

    Args:
        pgen_f: value passed straight to ``pg.PgenReader`` to open the file.
        bins_df: object whose ``bimIdStart`` and ``bimIdEnd`` members are
            indexed by ``block_id`` to obtain the block's variant range.
        block_id: key selecting the block in ``bins_df``.

    Returns:
        An int32 array with ``bimIdEnd - bimIdStart`` rows and
        ``2 * raw_sample_ct`` columns, filled by ``read_alleles_range``.
    """
    variant_start = bins_df.bimIdStart[block_id]
    variant_end = bins_df.bimIdEnd[block_id]

    with pg.PgenReader(pgen_f) as reader:
        # One row per variant in the block, two allele columns per sample.
        shape = (variant_end - variant_start, reader.get_raw_sample_ct() * 2)
        alleles = np.zeros(shape, dtype=np.int32)
        reader.read_alleles_range(variant_start, variant_end, alleles)

    return alleles
Пример #4
0
def main():
    """Print one variant's allele codes, first subsetted and then for all samples.

    Command line::

        single_variant_test.py <.bed/.pgen> <variant idx> [raw_sample_ct=<val>] [sample idx(s)...]

    With too few arguments the usage text is printed and nothing else
    happens.  Otherwise two lines are written to stdout.  The first covers
    the requested sample subset (or all samples when none is given).  The
    second is produced after ``change_sample_subset()`` is called with no
    arguments and covers ``raw_sample_ct`` samples.  Each sample is printed
    as ``<a>|<b> `` when its phase-present flag is set and ``<a>/<b> ``
    otherwise.  Returns None.
    """
    argv = sys.argv
    argc = len(argv)
    if argc < 3:
        print(
            "Usage: python single_variant_test.py <.bed/.pgen> <variant idx> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        print("* sample indexes must be in increasing order.")
        return

    sample_subset = None
    specified_sample_ct = None
    sample_ct = None
    if argc > 3:
        # Sample indexes start at position 3, or at 4 when the optional
        # raw_sample_ct=<val> argument occupies position 3.
        first_idx_pos = 3
        if argv[3].startswith("raw_sample_ct="):
            specified_sample_ct = int(argv[3][14:])
            first_idx_pos = 4
        if argc > first_idx_pos:
            sample_ct = argc - first_idx_pos
            sample_subset = np.empty(sample_ct, np.uint32)
            for slot, token in enumerate(argv[first_idx_pos:]):
                sample_subset[slot] = int(token)
    vidx = int(argv[2])

    with pgenlib.PgenReader(bytes(argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as pf:
        raw_sample_ct = pf.get_raw_sample_ct()
        if sample_ct is None:
            sample_ct = raw_sample_ct
        # Buffers are sized for the full sample set because the second read
        # below is done after the subset has been cleared.
        allele_codes = np.empty(raw_sample_ct * 2, np.int32)
        phase_flags = np.empty(raw_sample_ct, np.bool_)

        def write_genotype_line(count):
            # Emit the first `count` samples currently held in the buffers.
            out = sys.stdout
            for i in range(count):
                separator = '|' if phase_flags[i] else '/'
                out.write(str(allele_codes[2 * i]) + separator +
                          str(allele_codes[2 * i + 1]) + ' ')
            out.write('\n')

        pf.read_alleles_and_phasepresent(vidx, allele_codes, phase_flags)
        write_genotype_line(sample_ct)

        pf.change_sample_subset()
        pf.read_alleles_and_phasepresent(vidx, allele_codes, phase_flags)
        write_genotype_line(raw_sample_ct)
Пример #5
0
def main():
    """Print, per sample, the number of variants whose genotype code is 1.

    Command line::

        het_count.py <.bed/.pgen> [raw_sample_ct=<val>] [sample idx(s)...]

    With no file argument the usage text is printed and nothing else
    happens.  Otherwise every variant is read into an int8 buffer and each
    sample's counter is incremented when its entry equals 1.  The counters
    are written to stdout on one line, each followed by a single space.
    Returns None.
    """
    raw_ct_prefix = "raw_sample_ct="
    arg_ct = len(sys.argv)
    if arg_ct < 2:
        print(
            "Usage: python3 het_count.py <.bed/.pgen> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        print("* sample indexes must be in increasing order.")
        # Usage text kept verbatim for existing users; the counting loop
        # below is now vectorized, so the Python-side cost is much lower.
        print(
            "* The Python side of this is really slow if you don't explicitly specify sample indexes."
        )
        return
    sample_subset = None
    specified_sample_ct = None
    sample_ct = None
    if arg_ct > 2:
        # Sample indexes start at position 2, or at 3 when the optional
        # raw_sample_ct=<val> argument occupies position 2.
        offset = 2
        if sys.argv[2].startswith(raw_ct_prefix):
            specified_sample_ct = int(sys.argv[2][len(raw_ct_prefix):])
            offset = 3
        if arg_ct > offset:
            sample_ct = arg_ct - offset
            sample_subset = np.empty(sample_ct, np.uint32)
            for idx, token in enumerate(sys.argv[offset:]):
                sample_subset[idx] = int(token)
    with pgenlib.PgenReader(bytes(sys.argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as pf:
        raw_sample_ct = pf.get_raw_sample_ct()
        if sample_ct is None:
            sample_ct = raw_sample_ct
        variant_ct = pf.get_variant_ct()
        tot_hets = np.zeros((sample_ct, ), dtype=np.uint32)
        buf = np.empty(sample_ct, np.int8)
        for vidx in range(variant_ct):
            pf.read(vidx, buf)
            # Vectorized replacement for the per-sample Python loop: the
            # boolean mask adds 1 exactly where the genotype code is 1.
            tot_hets += (buf == 1)
        # Same output format as before: every count is followed by a space.
        sys.stdout.write(''.join(str(count) + ' ' for count in tot_hets))
        sys.stdout.write('\n')
Пример #6
0
def main():
    """Write the haplotypes of a .bed/.pgen file to a text file.

    Command line::

        extract_haps.py <.bed/.pgen> <output filename> [raw_sample_ct=<val>] [sample idx(s)...]

    With too few arguments the usage text is printed and nothing else
    happens.  Otherwise all variants are read in one
    ``read_alleles_range`` call into an int32 buffer of shape
    ``(2 * sample_ct, variant_ct)``.  The output file gets one line per
    variant, holding space-separated ``<a>|<b>`` entries, one per sample.
    Returns None.
    """
    raw_ct_prefix = "raw_sample_ct="
    arg_ct = len(sys.argv)
    if arg_ct < 3:
        print(
            "Usage: python3 extract_haps.py <.bed/.pgen> <output filename> [raw_sample_ct=<val>] [sample idx(s)...]"
        )
        print("* raw_sample_ct is required for .bed files.")
        return
    sample_subset = None
    specified_sample_ct = None
    sample_ct = None
    if arg_ct > 3:
        offset = 3
        if sys.argv[3].startswith(raw_ct_prefix):
            specified_sample_ct = int(sys.argv[3][len(raw_ct_prefix):])
            # Skip past the raw_sample_ct argument; otherwise it would be
            # counted and parsed as a sample index.
            offset = 4
        if arg_ct > offset:
            sample_ct = arg_ct - offset
            sample_subset = np.empty(sample_ct, np.uint32)
            for idx, token in enumerate(sys.argv[offset:]):
                sample_subset[idx] = int(token)
    with pgenlib.PgenReader(bytes(sys.argv[1], 'utf8'),
                            raw_sample_ct=specified_sample_ct,
                            sample_subset=sample_subset) as infile:
        raw_sample_ct = infile.get_raw_sample_ct()
        if sample_ct is None:
            sample_ct = raw_sample_ct
        variant_ct = infile.get_variant_ct()
        # Rows are haplotypes (two per sample), columns are variants; the
        # trailing 1 is passed as read_alleles_range's fourth argument.
        allele_code_buf = np.empty([sample_ct * 2, variant_ct], dtype=np.int32)
        infile.read_alleles_range(0, variant_ct, allele_code_buf, 1)
        with open(sys.argv[2], 'w') as outfile:
            for vidx in range(variant_ct):
                variant_codes = allele_code_buf[:, vidx]
                outfile.write(' '.join(
                    str(variant_codes[2 * sidx]) + "|" +
                    str(variant_codes[2 * sidx + 1])
                    for sidx in range(sample_ct)))
                outfile.write('\n')
Пример #7
0
    vec_rr = 1.0 / np.sqrt(vec_rr)
    mat_covar = np.transpose(np.transpose(mat_covar) * vec_rr)
    # qr decomposition of covariate matrix
    Q = linalg.qr(mat_covar, mode="economic")[0]
    del mat_covar
    print("Completed QR decomposition of covariate matrix.")
    '''
	Read .pvar file.
	Read pgen/bed file and do single-variant assoc tests.
	Write stats to output file on the fly.
	'''
    print("Starting assoc tests...")
    print("Writing stats into", args.out)
    outfp = open(args.out, "w")
    pf = pgenlib.PgenReader(pgen.encode(),
                            raw_sample_ct=raw_sample_ct,
                            sample_subset=sample_subset)
    with open(pvar) as fp:
        var_idx = -1
        int8vec_x = np.empty(sample_ct, dtype=np.int8)
        vec_x = np.empty(sample_ct, dtype=np.float32)
        for line in fp:
            if line[0] == '#':
                continue
            if line.strip() is not None:
                var_idx += 1
                if var_idx > args.end:
                    break
                if var_idx % 1000 == 0:
                    print("    Scanning variant", var_idx)
                if var_idx >= args.begin:
Пример #8
0
 def load(self, chromosome):
     """Open a PgenReader on ``self.filename`` and store it in ``self.pgen``.

     The reader is stored under the key returned by
     ``self.uniq_key(chromosome)``; it is not closed here.
     """
     reader = pg.PgenReader(self.filename)
     self.pgen[self.uniq_key(chromosome)] = reader