Пример #1
0
def generate_trios(pedfile, f1=True):
    """
    Yield ``(kid_id, mom_id, dad_id)`` trios from a PED file.

    Given a PED file, specify whether you want to output trios w/r/t
    the F1 or F2 generation (i.e., whether the kid in each trio is an
    F1 or F2).

    pedfile: path handed to ``peddy.Ped``.
    f1: when true (the default) a kid qualifies if its mother has no
        recorded mother; when false a kid qualifies if its mother does
        have a recorded mother.

    Only kids with both a mother and a father recorded are yielded, so
    every trio is complete regardless of the requested generation.
    """
    from peddy import Ped
    ped = Ped(pedfile)
    for kid in ped.samples():
        # A trio needs both parents; this also protects the F2 branch from
        # dereferencing a missing father.
        if not (kid.mom and kid.dad):
            continue
        if f1:
            wanted = kid.mom.mom is None
        else:
            wanted = bool(kid.mom.mom)
        if wanted:
            yield (kid.sample_id, kid.mom.sample_id, kid.dad.sample_id)
Пример #2
0
    def create_samples(self):
        """Create and populate the ``samples`` table from the PED file.

        Only PED samples whose fixed name also appears among the fixed
        ``self.vcf.samples`` names are stored; the ids of the others are
        reported on stderr. PED columns beyond the first six become extra
        table columns.

        Side effects: drops (if present) and re-creates the ``samples``
        table, inserts one row per retained sample, and sets
        ``self.sample_idxs`` to the position of each retained sample in the
        VCF sample list.

        Returns the fixed names of the retained samples, in the order
        ``ped.samples()`` produced them.
        """
        ped = Ped(self.ped_path)
        cols = [
            'sample_id', 'family_id', 'name', 'paternal_id', 'maternal_id',
            'sex', 'phenotype'
        ]
        if ped.header is None:
            # 'name' exists only in the table, not in the PED file itself.
            ped.header = [x for x in cols if x != 'name']
        cols.extend(ped.header[6:])

        # Map each fixed VCF sample name to its first position so every PED
        # sample is resolved with one dict lookup instead of a list scan.
        vcf_pos = {}
        for pos, raw_name in enumerate(self.vcf.samples):
            vcf_pos.setdefault(fix_sample_name(raw_name), pos)

        idxs, rows, not_in_vcf = [], [], []
        # Row ids are assigned only to retained samples, so they stay dense.
        sample_id = 1
        for s in ped.samples():
            name = fix_sample_name(s.sample_id)
            pos = vcf_pos.get(name)
            if pos is None:
                not_in_vcf.append(s.sample_id)
                continue
            idxs.append(pos)
            rows.append([
                sample_id,
                s.family_id,
                name,
                fix_sample_name(str(s.paternal_id)),
                fix_sample_name(str(s.maternal_id)),
                '1' if s.sex == 'male' else '2' if s.sex == 'female' else '-9',
                '2' if s.affected is True else
                '1' if s.affected is False else '-9',
            ] + s.attrs)
            sample_id += 1

        if not_in_vcf:
            print("not in VCF: %s" % ",".join(not_in_vcf), file=sys.stderr)
        scols = [sql.Column('sample_id', sql.Integer, primary_key=True)]
        for i, col in enumerate(cols[1:], start=1):
            vals = None
            try:
                vals = [r[i] for r in rows]
                # Size each text column to the longest value it must hold.
                width = max(len(v) for v in vals)
                scols.append(sql.Column(col, Unicode(width)))
            except Exception:
                # Show which column/values failed before propagating.
                print(col, vals, file=sys.stderr)
                raise

        t = sql.Table('samples', self.metadata, *scols)
        t.drop(checkfirst=True)
        t.create()

        self.engine.execute(t.insert(), [dict(zip(cols, r)) for r in rows])

        # track the order to pull from the genotype fields.
        self.sample_idxs = np.array(idxs)
        return [r[2] for r in rows]
Пример #3
0
    def create_samples(self):
        """Create and populate the ``samples`` table from the PED file.

        Only PED samples whose fixed name also appears among the fixed
        ``self.vcf.samples`` names are stored; the ids of the others are
        reported on stderr. PED columns beyond the first six become extra
        table columns.

        Side effects: drops (if present) and re-creates the ``samples``
        table, inserts one row per retained sample, and sets
        ``self.sample_idxs`` to the position of each retained sample in the
        VCF sample list.

        Returns the fixed names of the retained samples, in the order
        ``ped.samples()`` produced them.
        """
        ped = Ped(self.ped_path)
        cols = ["sample_id", "family_id", "name", "paternal_id", "maternal_id", "sex", "phenotype"]
        if ped.header is None:
            # "name" exists only in the table, not in the PED file itself.
            ped.header = [x for x in cols if x != "name"]
        cols.extend(ped.header[6:])

        # Map each fixed VCF sample name to its first position so every PED
        # sample is resolved with one dict lookup instead of a list scan.
        vcf_pos = {}
        for pos, raw_name in enumerate(self.vcf.samples):
            vcf_pos.setdefault(fix_sample_name(raw_name), pos)

        idxs, rows, not_in_vcf = [], [], []
        # Row ids are assigned only to retained samples, so they stay dense.
        sample_id = 1
        for s in ped.samples():
            name = fix_sample_name(s.sample_id)
            pos = vcf_pos.get(name)
            if pos is None:
                not_in_vcf.append(s.sample_id)
                continue
            idxs.append(pos)
            rows.append(
                [
                    sample_id,
                    s.family_id,
                    name,
                    fix_sample_name(str(s.paternal_id)),
                    fix_sample_name(str(s.maternal_id)),
                    "1" if s.sex == "male" else "2" if s.sex == "female" else "-9",
                    "2" if s.affected is True else "1" if s.affected is False else "-9",
                ]
                + s.attrs
            )
            sample_id += 1

        if not_in_vcf:
            print("not in VCF: %s" % ",".join(not_in_vcf), file=sys.stderr)
        scols = [sql.Column("sample_id", sql.Integer, primary_key=True)]
        for i, col in enumerate(cols[1:], start=1):
            vals = None
            try:
                vals = [r[i] for r in rows]
                # Size each text column to the longest value it must hold.
                width = max(len(v) for v in vals)
                scols.append(sql.Column(col, Unicode(width)))
            except Exception:
                # Show which column/values failed before propagating.
                print(col, vals, file=sys.stderr)
                raise

        t = sql.Table("samples", self.metadata, *scols)
        t.drop(checkfirst=True)
        t.create()

        self.engine.execute(t.insert(), [dict(zip(cols, r)) for r in rows])

        # track the order to pull from the genotype fields.
        self.sample_idxs = np.array(idxs)
        return [r[2] for r in rows]
Пример #4
0
def test_6():
    """a6.ped yields 14 samples, each with a family_id starting with "fam"."""
    ped_path = op.join(HERE, 'peddy/tests/a6.ped')
    ped = Ped(ped_path)
    sample_count = len(list(ped.samples()))
    assert sample_count == 14
    # Second pass over the samples: every family id carries the "fam" prefix.
    assert all(sample.family_id[:3] == "fam" for sample in ped.samples())
Пример #5
0
def test_getattr():
    """Filtering ``samples()`` by the ethnicity keyword gives 5 matching samples."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))
    matches = list(ped.samples(ethnicity='caucasianNEuropean'))
    assert len(matches) == 5
    # Every returned sample must carry the attribute value that was asked for.
    assert all(m.ethnicity == 'caucasianNEuropean' for m in matches)
Пример #6
0
def test_ped():
    """a.ped parses into 4 families and 14 samples."""
    ped_path = op.join(HERE, 'peddy/tests/a.ped')
    ped = Ped(ped_path)

    family_count = len(ped.families)
    assert family_count == 4

    sample_count = sum(1 for _ in ped.samples())
    assert sample_count == 14
Пример #7
0
def run(args):
    """Scan a VCF for informative sites and record crossover evidence per family.

    Attributes read from ``args`` here: ``ped``, ``vcf``, ``exclude``,
    ``region``, ``families`` and ``prefix``; ``args`` is also handed to
    ``get_family_dict`` and ``passes_quality_control``.

    For every variant that survives the site filters, each family is
    checked either through ``phased_check`` (all members phased) or through
    the unphased path, which needs exactly one het parent and writes one
    tab-separated line per informative parent to that family's
    ``fh-dad`` / ``fh-mom`` handle.

    Outputs:
      * ``<prefix>.sites``: one ``CHROM:POS<TAB>nsites`` line per retained
        variant, where ``nsites`` counts the families it was informative for.
      * progress and a final summary on stderr.
      * ``call_all`` is invoked on the non-empty families with
        ``min_sites=20``.
    """
    ped = Ped(args.ped)
    # Opened once just to learn which samples the VCF contains.
    vcf = VCF(args.vcf, gts012=True)

    ped_samples = [s.sample_id for s in ped.samples()]
    vcf_samples = set(vcf.samples)

    samples = [s for s in ped_samples if s in vcf_samples]

    exclude = read_exclude(args.exclude)

    # Re-opened restricted to the samples present in both PED and VCF.
    vcf = VCF(args.vcf, samples=samples, gts012=True)
    if args.region:
        vcf_iter = vcf(args.region)
    else:
        vcf_iter = vcf

    # Only referenced by the disabled depth-percentile cut-offs below.
    pctile1 = 10
    # build a dict of sample_id to sample index
    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    # get the Ped objects for the family of interest
    if args.families is None:
        fams = ped.families.values()
    else:
        fams = [ped.families[f] for f in args.families.split(",")]
    if len(fams) == 0:
        sys.exit('Families %s not found in ped file' % args.families)

    # create a simple dictionary of info for each family member
    # (no `del fam` afterwards: on Python 3 the comprehension variable never
    # leaks into this scope, so deleting it raised UnboundLocalError.)
    fs = [get_family_dict(fam, smp2idx, args) for fam in fams]

    # Probe for an "AB" entry; the result is passed on to get_allele_balance.
    try:
        vcf["AB"]
        has_abs = True
    except KeyError:
        has_abs = False

    nused, i, report_at, t0 = 0, 0, 20000, time.time()
    v = None  # stays None when the iterator yields no variants at all
    # The context manager closes the sites file even if a variant raises.
    with open("%s.sites" % args.prefix, "w") as fsites:
        for i, v in enumerate(vcf_iter, start=1):
            if i % report_at == 0:
                persec = i / float(time.time() - t0)
                print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
                      (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
                      file=sys.stderr)
                # Report less often as the run progresses.
                if i == 20000:
                    report_at = 100000
                if i == 100000:
                    report_at = 200000
                    for f in fs:
                        for k in f:
                            if k.startswith('fh'): f[k].flush()
                sys.stderr.flush()
            if v.var_type != 'snp': continue  ### no indels
            if len(v.ALT) > 1: continue
            #if len(v.REF) > 3 or len(v.ALT) > 1 or len(v.ALT[0]) > 3:
            #    continue
            #if v.call_rate < 0.95: continue
            if v.call_rate < 0.90: continue
            if v.FILTER is not None: continue
            if int(v.INFO.get('AC')) == 1: continue

            if exclude is not None and 0 != len(exclude[v.CHROM].search(
                    v.start, v.end)):
                continue

            # expensive to get gt_bases and we only need it at the crossover.
            gt_bases = None
            gt_types, gt_quals, gt_depths = v.gt_types, v.gt_quals, v.gt_depths

            ### added by tom 2019-11-13, as 1st percentile of depths on chr17
            ### were often negative...
            #gt_depths[gt_depths < 0] = 0

            gt_phases = v.gt_phases
            ipctiles, pctiles = None, None
            sample_abs = None

            nsites = 0  # track the number of families that had this as an informative site.
            for f in fs:
                #if ipctiles is not None and ipctiles[0] < pctile1:
                #    break

                # is_informative only needs gt_types, so we check that first...
                add_genotype_info(f, gt_types=gt_types, gt_phases=gt_phases)

                # ############## PHASED ####################
                if all(f['gt_phase']):
                    if gt_bases is None:
                        gt_bases = v.gt_bases

                    nsites += phased_check(f, v, gt_bases)
                    continue
                # ############ END PHASED ####################

                # need exactly 1 het parent for unphased checks.
                if 1 != ((f['gt_type'][0] == HET) + (f['gt_type'][1] == HET)):
                    continue

                if not is_informative(f):
                    continue
                # now we need to add quality and depth.
                add_genotype_info(f, gt_quals=gt_quals, gt_depths=gt_depths)

                if not passes_quality_control(f, args):
                    continue

                # detect crossovers.
                for parent, (p1, p2) in [("dad", (0, 1)), ("mom", (1, 0))]:

                    if not (f['gt_type'][p1] == HET
                            and f['gt_type'][p2] == HOM_REF):
                        continue

                    if gt_bases is None:
                        gt_bases = v.gt_bases
                    if sample_abs is None:
                        sample_abs = get_allele_balance(v, has_abs)
                        if sample_abs is None: break

                    fam_abs = sample_abs[f['idxs']]
                    if all(np.isnan(fam_abs)): continue
                    off = 0.31  # require that  off <= alt/(ref+alt) <= 1-off
                    if ((fam_abs[p1] >= 1 - off) | (fam_abs[p1] <= off)): continue
                    if np.any((1 - off < fam_abs[2:]) | (fam_abs[2:] <= off)):
                        continue

                    fam_bases = "\t".join(gt_bases[f['idxs']])

                    fam_abs = "|".join("%.2f" % val for val in fam_abs)

                    # calculate on first use. we found that having a low 1st pctile
                    # was a good indicator of increased chance of spurious XO even
                    # in families with decent depth.
                    #print (np.mean(gt_depths), np.median(gt_depths))
                    if pctiles is None:
                        ipctiles = np.percentile(gt_depths, (1, 5, 10, 50, 90))
                        pctiles = "|".join("%.0f" % de for de in ipctiles)
                    #if ipctiles[0] < pctile1:
                    #    break
                    fam_depths = "|".join(map(str, gt_depths[f['idxs']]))
                    nsites += 1
                    val = 1 if f['gt_type'][2] == f['gt_type'][3] else 0
                    f['fh-%s' % parent].write('\t'.join(
                        str(s) for s in [
                            v.CHROM, v.POS - 1, v.POS, f['ids'][p1],
                            f['family_id'], val, fam_bases, fam_depths,
                            "%.2f" % v.call_rate, pctiles, fam_abs
                        ]) + '\n')

            fsites.write("%s:%d\t%d\n" % (v.CHROM, v.POS, nsites))
            if nsites > 0:
                nused += 1

    if v is None:
        # Nothing was iterated: there is no last position to report and the
        # percentage below would divide by zero.
        print("finished: no variants read from %s" % args.vcf,
              file=sys.stderr)
    else:
        persec = i / float(time.time() - t0)
        print("finished at %s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
              (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
              file=sys.stderr)
    kept = _remove_empty(fs)
    call_all(kept, args.prefix, min_sites=20)
Пример #8
0
def test_ped():
    """The PED file at the relative path has 4 families and 14 samples."""
    ped = Ped('peddy/tests/a.ped')

    families = ped.families
    assert len(families) == 4

    all_samples = list(ped.samples())
    assert len(all_samples) == 14
Пример #9
0
def test_6():
    """Check the sample count of a6.ped and the "fam" prefix of each family id."""
    ped = Ped(op.join(HERE, 'peddy/tests/a6.ped'))
    loaded = list(ped.samples())
    assert len(loaded) == 14
    # Iterate the pedigree again rather than reusing the list above.
    for member in ped.samples():
        prefix = member.family_id[:3]
        assert prefix == "fam"
Пример #10
0
def test_getattr():
    """``samples(ethnicity=...)`` returns exactly the 5 samples with that value."""
    ped_path = op.join(HERE, 'peddy/tests/a.ped')
    selected = list(Ped(ped_path).samples(ethnicity='caucasianNEuropean'))
    assert len(selected) == 5
    for sample in selected:
        observed = sample.ethnicity
        assert observed == 'caucasianNEuropean'
Пример #11
0
def test_ped():
    """Loading a.ped gives 4 families and 14 samples in total."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))

    # Families are checked before the samples are walked.
    assert len(ped.families) == 4

    collected = [sample for sample in ped.samples()]
    assert len(collected) == 14
Пример #12
0
def run(args):
    print ('\t'.join(['chrom', 'start', 'end', 'sample_id', 'parent_id', 'n_vars', 'hap_start']))
    vcf = VCF(args.vcf)
    ped = Ped(args.ped)

    samples = [s for s in ped.samples()]
    kids = [s for s in samples if s.mom is not None and s.dad is not None]
    fams = set([s.family_id for s in samples])

    smp2ped = dict(zip([s.sample_id for s in samples], samples))

    exclude = read_exclude(args.exclude)

    #kid = smp2ped[args.kid].sample_id

    #dad, mom = smp2ped[kid].paternal_id, smp2ped[kid].maternal_id

    #samples_in_ped = [s.sample_id for s in samples]
    #samples_in_fam = [s.sample_id for s in samples if s.family_id == smp2ped[kid].family_id]

    # restrict VCF to the samples in the current family
    #vcf.set_samples(samples_in_ped)

    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    bad_positions = []
    v_feats = []

    haps = defaultdict()
    inf_positions = defaultdict(list)

    fam_dict = defaultdict(lambda: defaultdict())

    
    for fam in fams:
        mom = [s for s in samples if s.family_id == fam and s.mom is None and s.dad is None and s.sex == 'female'][0]
        dad = [s for s in samples if s.family_id == fam and s.mom is None and s.dad is None and s.sex == 'male'][0]
        sibs = [s.sample_id for s in kids if s.dad == dad and s.mom == mom]

        fam_dict[fam]['mom'] = mom
        fam_dict[fam]['dad'] = dad
        fam_dict[fam]['sibs'] = ','.join(sibs) 

    nused, i, report_at, t0 = 0, 0, 1000, time.time()
    for i,v in enumerate(vcf(args.chrom)):
        if i != 0 and i % report_at == 0:
            persec = i / float(time.time() - t0)
            print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" % (v.CHROM, v.POS,
                  persec, 100.0 * nused/i, nused, i), file=sys.stderr)

        if args.exclude and len(exclude[v.CHROM].search(v.start, v.end)) > 0: continue
        if v.var_type != 'snp': continue
        if v.FILTER not in ('PASS', None): continue
        if v.call_rate < 0.90: continue 
        if len(v.ALT) > 1: continue
        if len(v.ALT[0]) > 1: continue
        
        gts = v.gt_types
        quals = v.gt_quals
        rd, ad = v.gt_ref_depths, v.gt_alt_depths

        for fam in fam_dict:

            mom = fam_dict[fam]['mom']
            dad = fam_dict[fam]['dad']
            sibs = fam_dict[fam]['sibs'].split(',')

            if args.inf_parent and args.inf_parent not in (mom, dad): continue

            try:
                mi, di = smp2idx[mom.sample_id], smp2idx[dad.sample_id]
            except KeyError: sys.exit()

            # ensure we're at an informative site

            if gts[mi] != HOM_REF and gts[di] != HOM_REF: continue
            if gts[mi] == HOM_REF and gts[di] == HOM_REF: continue
            
            sib_gts = [gts[smp2idx[k]] for k in sibs]

            inf_parent = None
            
            if (gts[mi] == HET and gts[di] == HOM_REF): inf_parent = mom.sample_id
            elif (gts[di] == HET and gts[mi] == HOM_REF): inf_parent = dad.sample_id
            else: continue
            
            # check that both parents are "high-quality"
            if not is_good_site(mi, quals, gts, ad, rd, het_ab=args.ab): continue
            if not is_good_site(di, quals, gts, ad, rd, het_ab=args.ab): continue

            if args.inf_parent and inf_parent != args.inf_parent: continue

            # catalog the "states" of each child w/r/t the informative parent
            # if kids are HETs, their state w/r/t to the INF parent is 0
            states = []
            sib_pass = []
            for i,k in enumerate(sibs):
                k_idx = smp2idx[k]
                k_pass = is_good_site(k_idx, quals, gts, ad, rd, het_ab=args.ab)
                sib_pass.append(k_pass)
                k_state = -1
                if v.CHROM in ('chrX', 'X') and k.sex == "male":
                    k_state = 0 if gts[k_idx] == HOM_ALT else 1
                else:
                    k_state = 0 if gts[k_idx] == HET else 1

                states.append(k_state)

            if not all([x is True for x in sib_pass]): continue

            if sum([s < 0 for s in states]) > 0: continue
            
            if inf_parent not in haps:
                haps[inf_parent] = np.array(states)
            else:
                haps[inf_parent] = np.vstack((haps[inf_parent], np.array(states)))

            inf_positions[inf_parent].append(v.start)

        nused += 1

    persec = i / float(time.time() - t0)
    xos = get_xo(args, haps, inf_positions)