示例#1
0
    def test_ibd_segments_sib_pair(self):
        '''Test calculating distant IBD segments between every hap pair within a pair of
        phased sibs (samples 3 and 2). Compare with IBD segments based on nuclear family info.

        The same computation is run twice - once with default arguments (serial) and once
        with num_processes=3 (parallel) - and both runs must match the same expected segments.'''
        # Segments should contain segments obtained by the genotype-genotype HMM to validate
        # phasing:
        #        [((1412, 3218), (32992389, 51156934, 18.165, 1), ((3, 0), (2, 0))),
        #         ((241 , 451), (19643555, 23817486, 4.174, 1), ((2, 0), (3, 1))),
        #         ((0   , 454), (16484792, 23834889, 7.350, 1), ((3, 1), (2, 1))),
        #         ((2650, 3218), (45892433, 51156934, 5.265, 1), ((3, 1), (2, 1)))],
        expected_segments = [
            ((1412, 3217), (32992389, 51156933, 18.165, 0), ((3, 0), (2, 0))),
            ((238, 453), (19581946, 23826675, 4.245, 0), ((2, 0), (3, 1))),
            ((0, 629), (16484792, 25608548, 9.124, 0), ((3, 1), (2, 1))),
            ((2661, 3217), (45972017, 51156933, 5.185, 0), ((3, 1), (2, 1)))
        ]

        # First entry: serial test (no extra arguments). Second entry: parallel test.
        for extra_args in ({}, {'num_processes': 3}):
            # A fresh parameter object is built for each run
            params = im.PhaseParam(kinship_file=im.itu.KINSHIP_FILE, debug=False)
            segment_set = im.ih.between_samples_segments(
                self.problem, [3], [2], params, **extra_args)
            im.itu.assert_segments_almost_equal(
                segment_set,
                expected_segments,
                full_data=True,
                decimal=3,
                err_msg='Wrong IBD segments')
示例#2
0
def write_id_coefs_legacy(p, out_file, params=im.PhaseParam()):
    '''Given a legacy Problem object p, look up the ID coefficients of every ordered pair
    (x, y) of its genotyped sample IDs through params.id_coefs() and save them to out_file.

    Each output row has 12 space-delimited columns: x, y, the first element returned by
    id_coefs(x, y), and then the entries of its second element (an array).'''
    sample_ids = p.pedigree.genotyped_sample_id()
    rows = []
    # All ordered pairs, including (x, x), in itertools.product order
    for x, y in itertools.product(sample_ids, sample_ids):
        coefs = params.id_coefs(x, y)
        rows.append((x, y, coefs[0]) + tuple(coefs[1].tolist()))
    np.savetxt(out_file, np.array(rows), fmt='%d %d %e %e %e %e %e %e %e %e %e %e')
示例#3
0
def problem_ibd_segments(problem, i, ai, j, bj, **kwargs):
    '''Return the IBD segments between haplotypes of samples i and j in a problem object.

    ai and bj select the allele (haplotype) of i and j, respectively; passing None for
    either one means all alleles in im.constants.ALLELES are considered for that sample.
    kwargs are forwarded to the im.PhaseParam constructor.'''
    alleles_i = im.constants.ALLELES if ai is None else [ai]
    alleles_j = im.constants.ALLELES if bj is None else [bj]

    segments = im.segment.SegmentSet()
    for a in alleles_i:
        for b in alleles_j:
            # A new parameter object is built for each haplotype pair
            param = im.PhaseParam(**kwargs)
            segments += im.ih.hap_segments_from_problem(
                problem, (i, a), (j, b), param)
    return segments
示例#4
0
def debug_distant_phasing(sample):
    '''Debug distant-phasing of the sample with index ''sample''.

    Loads a hard-coded stage-4 imputed data set, runs a phaser chain consisting of a
    single distant phaser restricted to that sample, and prints the sample's haplotype
    fill fraction before and after the run.'''
    p = im.io.read_npz(
        '/home/oren/ober/out/kids/cytosnp/chr22/cytosnp.imputed.stage4.npz')
    phaser = im.phase_core.new_phaser_chain(
        [im.phase_distant.distant_phaser(single_sample=sample)])
    h = p.haplotype
    # Report the fill fraction of the sample being phased. This used to be hard-coded to
    # sample 0, which is uninformative whenever another sample is debugged.
    print(h.fill_fraction(sample=sample))
    phaser.run(p, im.PhaseParam())
    print(h.fill_fraction(sample=sample))
示例#5
0
def plot_lambda_std(problem, id_coef=im.PhaseParam().id_coef_file):
    '''Scatter-plot the standard deviation of lambda against the mean lambda of the
    children of every family in the problem object ''problem''. One point is drawn per
    family.

    Lambda of a sample x is the first element returned by id_coefs(x, x) of the ID
    coefficient DAO opened on id_coef.'''
    dao = __idcoef_dao(id_coef)
    # Sample ID -> lambda
    lam_of = dict((x, dao.id_coefs(x, x)[0]) for x in dao.index)

    stats = []
    for family in problem.families():
        child_ids = problem.pedigree.sample_id[np.array(list(family.children))]
        # Samples missing from the DAO index map to None, as dict.get does
        child_lam = [lam_of.get(child) for child in child_ids]
        stats.append((np.mean(child_lam), np.std(child_lam)))
    c = np.array(stats)

    P.clf()
    P.hold(True)
    P.scatter(c[:, 0], c[:, 1])
    # Raw strings keep the TeX backslash literal
    P.xlabel(r'Sibs Mean $\lambda$')
    P.ylabel(r'Sibs Stddev $\lambda$')
示例#6
0
def lambda_vs_f(id_coef=im.PhaseParam().id_coef_file):
    '''Return lambda as a discrete function of the inbreeding coefficient f for all samples
    in the index of the ID coefficient DAO opened on id_coef.

    Outputs a 2-column array [f, lam], restricted to rows with f >= 1e-5.'''

    def inbreeding(d):
        '''f from a row d = [lambda, coefficients...] (presumably the condensed identity
        coefficients Delta1..Delta9 - TODO confirm): f = 2*kinship - 1.'''
        return 2 * (d[1] + 0.5 * (d[3] + d[5] + d[7]) + 0.25 * d[8]) - 1

    # Read lambda and the self-coefficients of every sample; one row per sample with
    # lambda in column 0
    dao = __idcoef_dao(id_coef)
    rows = []
    for x in dao.index:
        coefs = dao.id_coefs(x, x)
        rows.append(np.concatenate(([coefs[0]], coefs[1])))
    table = np.array(rows)

    # Calculate f from the coefficients
    result = np.array([(inbreeding(row), row[0]) for row in table])

    # Remove outliers with f~0
    keep = result[:, 0] >= 1e-5
    return result[keep, :]
示例#7
0
    def test_global_ibd_segments(self):
        '''Debug an error that appeared after stage 2: a child had the wrong hap color at
        SNP 2960 in the middle of a long IBD segment. The error does not appear in this
        isolated test; it depends on the ordering of processing samples and segments.'''
        # SNP window around SNP 2960 that is inspected before and after phasing
        window = slice(2955, 2965)
        expected_segments = [
            ((0, 176), (16484792, 18545634, 2.061, 0), ((2, 0), (0, 0))),
            ((187, 2687), (18895227, 46242359, 27.347, 1), ((0, 1), (2, 0))),
            ((2687, 3216), (46336181, 51103692, 4.768, 0), ((2, 0), (0, 0))),
            ((0, 3218), (16484792, 51156933, 34.672, 0), ((0, 1), (3, 0))),
            ((0, 2470), (16484792, 44456692, 27.972, 0), ((0, 0), (4, 0))),
            ((2473, 3216), (44515806, 51103692, 6.588, 1), ((0, 1), (4, 0))),
            ((0, 2985), (16484792, 48709188, 32.224, 0), ((0, 1), (5, 0))),
            ((2993, 3216), (48742097, 51103692, 2.362, 0), ((0, 0), (5, 0))),
            ((0, 3218), (16484792, 51156933, 34.672, 0), ((1, 1), (2, 1))),
            ((4, 36), (17087656, 17434521, 0.347, 0), ((3, 1), (1, 1))),
            ((42, 2035), (17587680, 37662436, 20.075, 1), ((1, 0), (3, 1))),
            ((2047, 3217), (37902926, 51140316, 13.237, 0), ((3, 1), (1, 1))),
            ((11, 804), (17285049, 27091750, 9.807, 0), ((4, 1), (1, 1))),
            ((823, 3217), (27200942, 51140316, 23.939, 0), ((1, 0), (4, 1))),
            ((11, 14), (17285049, 17307742, 0.023, 0), ((5, 1), (1, 0))),
            ((31, 3217), (17415572, 51140316, 33.725, 1), ((5, 1), (1, 1))),
        ]

        # Before stage 2: no IBD segments should exist
        problem = im.io.read_npz(im.itu.FAMILY4_STAGE1)
        im.itu.assert_problem_stats(problem, 38616, 35586, 22)
        # The same ibd object is checked again after phasing
        ibd = problem.info.ibd
        assert_equal(ibd.length, 0,
                     'Before stage 2: no IBD segments should exist')
        assert_equal(problem.h[window, 1, 0], [2] * 9 + [0],
                     'Wrong mother haplotype')
        assert_equal(problem.h[window, 4, 1], [0] * 9 + [1],
                     'Wrong child haplotype')

        phaser = im.phase_family.family_phaser()
        phaser.run(problem, im.PhaseParam())

        # After: check IBD segment; (1, 0), (4, 1) should be IBD around SNP 2960
        assert_segments_almost_equal(ibd,
                                     expected_segments,
                                     decimal=3,
                                     err_msg='Wrong IBD segments')
        im.itu.assert_problem_stats(problem, 38616, 38576, 30)
        # Both haplotypes must now agree over the whole window
        filled_window = [2] * 9 + [1]
        assert_equal(problem.h[window, 1, 0], filled_window,
                     'Wrong mother haplotype')
        assert_equal(problem.h[window, 4, 1], filled_window,
                     'Wrong child haplotype')
示例#8
0
 def test_entire_pipeline(self):
     '''Run the entire phasing pipeline (with imputation and fill) on the test problem and
     check the resulting problem statistics. Uses a small # of surrogate parents, for speed.'''
     problem = self.problem
     # Grab the genotype object up front; its counters are checked after the run
     g = problem.genotype

     # Inject a mock DAO so that we don't need the real ID coef file, which is huge here
     pedigree = problem.pedigree
     pedigree._idcoef_dao = mock_dao.IdCoefDao(pedigree.num_genotyped)

     options = util.Struct(impute=im.phase.IMPUTE_OPTION.IMPUTE_AND_FILL,
                           debug=False,
                           print_times=False,
                           stage=0)
     phaser = im.phase.build_phasing_pipeline(options)
     params = im.PhaseParam(distant_phasing_params=[(0.9, 2, 0.95)])
     im.phase.run_phasing_chain(phaser, problem, params)

     im.itu.assert_problem_stats(problem, 22640, 20225, 144)
     assert_equal(g.num_filled, 22640,
                  'Incorrect number of imputed genotypes')
     assert_equal(
         g.num_missing, 0,
         'Incorrect number of missing genotypes; there should not be any after imputation'
     )
示例#9
0
Test IBD clique kinship comparison for parent-of-origin
determination.

Created on July 2, 2013
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im, time, numpy as np, os

'''
--------------------------------------------------
Main program
--------------------------------------------------
'''
chrom = 22
# Ad-hoc debugging calls, kept for reference:
# a = im.poo.Aligner(chrom)

# print a.debug_sample_snp(1101, 1500)
# print a.debug_sample(6)
# a.debug_sample_snp(6, 2000)
# a.debug_sample_snp(0, 500)

# Time POO vs. # processes used. m holds the result of the last run.
for num_processes in [1]:  # [2, 4]:
    start_time = time.time()
    params = im.PhaseParam(debug=True,
                           poo_snp_step_size=100,
                           num_processes=num_processes)
    m = im.poo.determine_poo(chrom, params=params)
    elapsed = time.time() - start_time
    print('#processes=%d, time %.2f' % (num_processes, elapsed))

# Save the result under the $OBER directory tree and plot it
out_file = os.environ['OBER'] + '/doc/poo/m-chr%d.txt' % (chrom,)
np.savetxt(out_file, m)
im.poo.plot_flip_measure_vs_sample(chrom, m)
示例#10
0
    items = [int(x) for x in line]
    return tuple(items[0:2]), im.segment.DisjointSegmentSet(
        zip(items[2::2], items[3::2]))


####################################################################################
if __name__ == '__main__':
    '''
    --------------------------------------------------
    Main program
    --------------------------------------------------
    '''
    options = parse_command_line_args()
    file_reader = lambda f: csv.reader(
        open(f, 'rb'), delimiter=' ', skipinitialspace=True)
    param = im.PhaseParam()
    try:
        reader1 = file_reader(options.file1)
        reader2 = file_reader(options.file2)
        for i, line in enumerate(reader1):
            key, A = parse_line(line)
            key2, B = parse_line(reader2.next())
            if key != key2:
                raise ValueError('Sample pairs are not the same in both files: (%d,%d), (%d,%d) at line %d' % \
                                 key + key2 + (i + 1,))
            else:
                # Output statistics. Only log pairs for which samples were found
                len_A, len_B = A.length, B.length
                if len_A != 0 or len_B != 0:
                    f = param.kinship(key[0], key[1])
                    sys.stdout.write('%d %d %f %d %d %d %d\n' %
示例#11
0
Created on August 2, 2013
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im, os, numpy as np, matplotlib.pyplot as P, sys
from impute.kids.hutt_kids import get_sib_ids, read_chip_problem, chip_data_set
from impute.phasing.phase_trivial import trivial_phaser

####################################################################################
if __name__ == '__main__':

    path = os.environ['OBER_OUT'] + '/kids'
    chrom = 22
    debug = 1
    num_processes = 1  # 4
    params = im.PhaseParam()

    sibs = get_sib_ids(path)
    sib = 842  # Has high imputation call rate but high re-phasing error rate

    chip = 'cytosnp'
    p = im.io.read_npz('%s/%s/chr%d/%s.npz' %
                       (path, chip, chrom, chip_data_set(chip)))

    # Run stage 1 so that we can step through the code and find why SNP 1700
    # is phased to 2,1 instead of 1,2
    phaser = trivial_phaser()
    problem = phaser.run(
        p,
        im.PhaseParam(selected_samples=np.array([842]),
                      debug=True,
示例#12
0
#!/usr/bin/env python
'''
============================================================
Plot the nbhrs1298 pedigree.

Created on August 16, 2012
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im

p = im.hutt('hutt.phased.npz')

s = 1159  # Unphased sample index

# Alternative: restrict the distant phaser to the single sample s
#phaser = im.phase_core.new_phaser_chain([im.phase_distant.distant_phaser(single_sample=s)])
distant = im.phase_distant.distant_phaser(phased_fill=0.92,
                                          target_fill=0.95,
                                          max_path_length=7)
phaser = im.phase_core.new_phaser_chain([distant])

# Print the fill fraction of sample s before and after running the phaser
h = p.haplotype
print(h.fill_fraction(sample=s))
phaser.run(p, im.PhaseParam(debug=True))
print(h.fill_fraction(sample=s))

# Draw the pedigree around sample s into a PNG file
out = '/home/oren/ped-%d.png' % (s,)
im.pt.draw_member_neighbor_genotyped_pedigree(p, s, 5, out, identifier='index')
示例#13
0
                      dest='debug',
                      default=0,
                      help='Debug Level (0=quiet; 1=summary; 2=full debug)')
    parser.add_option(
        '-i',
        '--input',
        type='str',
        dest='input_file',
        default=None,
        help=
        'Input file. If specified, reads from this file instead of from stdin')
    parser.add_option('-u',
                      '--len-unit',
                      type='str',
                      dest='len_unit',
                      default=im.PhaseParam().len_unit,
                      help='segment length to look for [cm|mbp]')
    parser.add_option('-l',
                      '--min-len',
                      type='str',
                      dest='min_len',
                      default=im.PhaseParam().min_len,
                      help='Minimum segment length to look for')

    options, args = parser.parse_args(sys.argv[1:])
    if len(args) != 2:
        print usage
        sys.exit(1)
    phased_data_file, kinship_file = args

    try:
示例#14
0
#!/usr/bin/env python
'''
============================================================
Test GERMLINE IBD on 507's ungenotyped family. 

Created on September 15, 2012
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im
import numpy as np

# Sample index to run the phaser on
i = 507

# Load the same data set twice: p stays untouched as the reference, q is phased
data_file = 'hutt.stage3.npz'
p = im.hutt(data_file)
q = im.hutt(data_file)

phaser = im.phase_distant.family_sib_comparison_phaser()
phaser.run(q, im.PhaseParam(single_member=i, debug=True))

# Print the indices at which sample i's haplotype entries differ between p and q
changed = p.haplotype.data[:, i, :] != q.haplotype.data[:, i, :]
print(np.where(changed))
示例#15
0
'''
============================================================
Generate the family 7 identity coefficient file.

Created on January 21, 2013
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im, numpy as np, db_gene, itertools

# Temporarily comment out the frames field in im.io functions for this command to work
p = im.io.read_npz(im.itu.FAMILY7 + '.npz')

i = p.pedigree.genotyped_sample_id()
k = db_gene.snp.file_dao.IdCoefDao(im.PhaseParam().id_coef_file)

# One row per ordered pair (x, y) of genotyped sample IDs: x, y, the first element of
# id_coefs(x, y), followed by the entries of its second element
rows = []
for x, y in itertools.product(i, i):
    m = k.id_coefs(x, y)
    rows.append((x, y, m[0]) + tuple(m[1].tolist()))
a = np.array(rows)

np.savetxt(im.itu.FAMILY7 + '.id', a, fmt='%d %d %e %e %e %e %e %e %e %e %e %e')
示例#16
0
文件: run_poo.py 项目: orenlivne/ober
                      type='int',
                      dest='num_processes',
                      default=1,
                      help='Number of processes to spawn')
    parser.add_option('-s',
                      '--snp-step-size',
                      type='int',
                      dest='poo_snp_step_size',
                      default=1,
                      help='POO SNP downsampling step size')
    options, args = parser.parse_args(argv[1:])
    if len(args) != 2:
        print usage
        sys.exit(1)
    options.input = args[0]
    options.output = args[1]
    return args, options


#---------------------------------------------
# Main Program
#---------------------------------------------
if __name__ == '__main__':
    # Parse the command line; the parser stores the two positional arguments as
    # options.input and options.output
    args, options = __parse_command_line_args(sys.argv)
    # Start from default phasing parameters, then update them from the parsed options
    # (presumably copies fields such as num_processes and poo_snp_step_size - TODO confirm)
    params = im.PhaseParam()
    params.update_from_struct(options)

    # Load the input problem, run POO alignment on it and save it to the output file.
    # NOTE(review): the return value of poo_alignment is not used, so it presumably
    # updates problem in place - confirm.
    problem = im.io.read_npz(options.input)
    poo_alignment(problem, params)
    im.io.write_npz(problem, options.output)
示例#17
0
Created on January 11, 2012
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im, itertools, matplotlib.pyplot as P, sys
from impute.color.hap_color_grouping import plot_hap_coloring

print(sys.argv)
# Plots are generated unless the first command-line argument is an integer equal to 0
generate_plots = len(sys.argv) < 2 or bool(int(sys.argv[1]))

p = im.io.read_npz(im.itu.FAMILY_TOO_ZEROED_STAGE4)
family = p.first_family

# (sample, allele) pairs, with allele in {0, 1}, for all genotyped members and for the
# genotyped children only
members = im.gt.genotyped_members(p, family)
haps = list(itertools.product(members, range(2)))
children = im.gt.genotyped_children(p, family)
child_haps = list(itertools.product(children, range(2)))

params = im.PhaseParam()  #debug=True)

#-------------------------------------------------------
# Nuclear family phasing
#-------------------------------------------------------
ibd = p.info.ibd
print ibd
if generate_plots:
    # Including parents
    P.figure(1)
    plot_hap_coloring(ibd,
                      haps,
                      pair_gap=10,
                      linewidth=6,
                      title='Family IBD Segments: Nuclear Family Phasing')
    #P.savefig(os.environ['OBER'] + '/doc/ibd/hmm/family_ibd_nuclear.png')