示例#1
0
def add_mock_probands(persons, required, prefix, suffix, phenotype, study):
    """ pad a cohort with mock probands for individuals lacking de novos

    Args:
        persons: set of unique persons in the cohort. This set is modified in
            place.
        required: total number of probands with the given phenotype that the
            cohort should contain
        prefix: prefix for mock sample IDs
        suffix: suffix for mock sample IDs
        phenotype: phenotype of probands (some studies include affected and
            unaffected).
        study: study value passed through to each mock Person

    Returns:
        the same persons set, after adding the mock probands
    """
    # seeding from the smallest person in the cohort keeps the mock IDs and
    # sexes identical from one run to the next
    random.seed(str(min(persons)))

    matching = [person for person in persons if person.phenotype == phenotype]
    n_male = len([person for person in matching if person.sex == 'male'])
    # sample sexes at the observed male fraction, so the ratio is not shifted
    male_ratio = n_male / len(matching)

    n_missing = required - len(matching)
    for _ in range(n_missing):
        # build the ID before drawing the sex, to keep the random sequence
        mock_id = f'{prefix}_{random_id()}|{suffix}'
        if random.random() < male_ratio:
            mock_sex = 'male'
        else:
            mock_sex = 'female'
        persons.add(Person(mock_id, mock_sex, phenotype, study))

    return persons
示例#2
0
def open_oroak_cohort():
    """ load the probands from the O'Roak et al autism exome study

    O'Roak et al. (2012) Nature 485:246-250
    doi: 10.1038/nature10989
    Supplementary table 1

    Returns:
        set of Person objects, for every table row that is not a sibling
    """
    table = pandas.read_excel(url, sheet_name='Supplementary Table 1',
        skipfooter=1, engine='xlrd')

    study = ['10.1038/nature10989']
    persons = set()
    for _, entry in table.iterrows():
        # the second dot-separated field of the sample ID gives the person
        # type. Siblings are skipped, since they don't have any de novos
        # recorded, so don't contribute to the exome-sequence populations
        role = entry.child.split('.')[1]
        if role.startswith('s'):
            continue

        phenotypes = ['HP:0000717']
        if entry['non-verbal_IQ'] < 70:
            phenotypes.append('HP:0001249')

        persons.add(Person(entry.child + '|asd_cohorts', entry.sex, phenotypes, study))

    return persons
示例#3
0
def open_sanders_neuron_cohort():
    """ load probands and siblings from the Sanders et al Neuron 2015 cohort

    Supplementary Table 1 from:
    Sanders et al. (2015) Neuron 87:1215-1233
    doi: 10.1016/j.neuron.2015.09.016

    Returns:
        set of Person objects for the probands and siblings in retained rows
    """
    table = pandas.read_excel(url, sheet_name='Sheet1')

    sex_names = {'F': 'female', 'female': 'female', 'M': 'male', 'male': 'male',
        'U': 'unknown'}
    study = ['10.1016/j.neuron.2015.09.016']

    persons = set()
    for _, entry in table.iterrows():
        # skip rows where either parent is given as '.', and rows from the
        # 'SSC_Removed' cohort
        if entry.Father == '.' or entry.Mother == '.' or entry.Cohort == 'SSC_Removed':
            continue

        for role in ('Proband', 'Sibling'):
            sample_id = entry[role]
            if sample_id == '.':
                # no child of this type in the row
                continue

            sex = sex_names[entry[f'{role}Sex']]
            if role == 'Sibling':
                phenotype = ['unaffected']
            else:
                phenotype = ['HP:0000717']

            persons.add(Person(sample_id + '|asd_cohorts', sex, phenotype, study))

    return persons
示例#4
0
def open_de_rubeis_cohort():
    """ load the individuals from the De Rubeis et al autism exome study

    De Rubeis et al. (2014) Nature 515:209-215
    doi: 10.1038/nature13772
    Supplementary Table 3, with some additional proband details sourced from
    Supplementary table S5 from Sanders et al. (2015) Neuron 87:1215-1233.

    Returns:
        set of Person objects, padded with mock probands so that the affected
        group reaches 1445 individuals
    """
    data = pandas.read_excel(url, sheet_name='De Novo', skipfooter=1)

    # clean up a couple of columns
    data['person_id'] = data.Child_ID
    data['sex'] = data.Child_Sex.map({1: 'male', 2: 'female'})
    data['phenotype'] = data['Child_AffectedStatus'].map({1: ['unaffected'], 2: ['HP:0000717']})
    data = data[['person_id', 'sex', 'phenotype']]

    # DataFrame.append was removed in pandas 2.0, so the tables are joined
    # with pandas.concat instead.
    # NOTE(review): assumes open_additional() returns a DataFrame - confirm
    additional = open_additional()
    data = pandas.concat([data, additional], ignore_index=True)
    data['person_id'] = data.person_id.astype(str)
    data['person_id'] += '|asd_cohorts'
    study = ['10.1038/nature13772']

    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, row.phenotype, study)
        persons.add(person)

    # account for the affected individuals that are not listed in the tables
    persons = add_mock_probands(persons, 1445, 'asd', 'asd_cohorts', ['HP:0000717'], study)

    return persons
def open_jin_nature_genetics_cohort():
    """ load individual level data for Jin et al congenital heart disease

    Supplementary Table 1 from:
    Jin et al. Nature Genetics 49: 1593-1601, doi: 10.1038/ng.3970

    Sex is not read from the table. It is drawn at random for each row at the
    cohort-wide male fraction, after fixing the random seed so that runs are
    repeatable.

    Returns:
        set of Person objects
    """
    random.seed(1)
    table = pandas.read_excel(url, 'S1', skiprows=1)
    table['person_id'] = table['Blinded ID'].astype(str) + '|jin'

    # get male fraction in trios from cohort sex counts in supplemental table 2
    n_male, n_female = 1691, 1180
    male_fraction = n_male / (n_male + n_female)
    study = ['10.1038/ng.3970']

    persons = set()
    for _, entry in table.iterrows():
        # one random draw per row, taken in row order
        if random.random() < male_fraction:
            sex = 'male'
        else:
            sex = 'female'

        phenotypes = ['HP:0001627']
        if entry['NDD'] == 'Yes':
            phenotypes.append('HP:0001263')

        persons.add(Person(entry.person_id, sex, phenotypes, study))

    return persons
示例#6
0
def open_iossifov_neuron_cohort():
    """ load the children sequenced in Iossifov et al., Neuron 2012

    Iossifov et al. (2012) Neuron 74:285-299
    doi: 10.1016/j.neuron.2012.04.009
    Data from supplementary tables S1, S2 and S3.

    Returns:
        set of Person objects, for each proband or sibling named in the
        inChild column of any of the three tables
    """
    s1 = pandas.read_excel(supp_s1_url, sheet_name='SNV.v4.1-normlized')
    s2 = pandas.read_excel(supp_s2_url, sheet_name='suppLGKTable')
    s3 = pandas.read_excel(supp_s3_url, sheet_name='ID.v4.1-normlized')

    tables = (s1, s2, s3)
    fam_ids = [fam for table in tables for fam in table.quadId]
    members = [child for table in tables for child in table.inChild]

    # every status/sex pairing that can show up in an inChild value, such as
    # 'autM' or 'sibF'
    combos = tuple(itertools.product(['aut', 'sib'], ['M', 'F']))
    study = ['10.1016/j.neuron.2012.04.009']

    persons = set()
    for fam, children in zip(fam_ids, members):
        for status_code, sex_code in combos:
            if f'{status_code}{sex_code}' not in children:
                continue

            if status_code == 'aut':
                status, member = ['HP:0000717'], 'p1'
            else:
                status, member = ['unaffected'], 's1'
            sex = 'male' if sex_code != 'F' else 'female'

            persons.add(Person(f'{fam}.{member}|asd_cohorts', sex, status, study))

    return persons
def open_halldorsson_science_cohort():
    """ get the cohort for Halldorsson et al Science 2019

    Supplementary Data 5 (revised) from:
    Halldorsson et al. Science 343: eaau1043, doi: 10.1126/science.aau1043

    Sex is not read from the table. It is drawn at random for each row, after
    fixing the random seed so that runs are repeatable.

    Returns:
        set of Person objects, all with the 'unaffected' phenotype
    """
    random.seed(1)

    with tempfile.NamedTemporaryFile() as temp:
        # the url redirects, so the file is fetched via download_file (which
        # uses the requests package, going by the original note) and then
        # read back from disk
        download_file(url, temp.name)
        table = pandas.read_table(temp.name, comment='#')

    table['person_id'] = table['Proband_id'].astype(str)
    table['person_id'] += '|halldorsson'

    phenotype = ['unaffected']
    study = ['10.1126/science.aau1043']
    # assumption from the fraction from their earlier Jonsson et al publication
    female_fraction = 0.5

    persons = set()
    for entry in table.itertuples():
        if random.random() < female_fraction:
            sex = 'female'
        else:
            sex = 'male'
        persons.add(Person(entry.person_id, sex, phenotype, study))

    return persons
示例#8
0
def open_an_science_cohort():
    """ load individual level data for the An et al Autism dataset

    Table S1 from:
    An et al. Science 362: eaat6576, doi: 10.1126/science.aat6576

    Returns:
        set of Person objects for the non-parental samples
    """
    with warnings.catch_warnings():
        # suppress warning about unknown extension that doesn't affect loading data
        warnings.simplefilter("ignore")
        table = pandas.read_excel(url,
                                  sheet_name='Table S1 Sample information',
                                  skiprows=1,
                                  engine='openpyxl')
    table = table[['SampleID', 'FamilyID', 'Sex', 'Pheno', 'NVIQ']]
    study = ['10.1126/science.aat6576']

    persons = set()
    for _, entry in table.iterrows():
        # ignore parental samples
        if entry.SampleID.endswith(('fa', 'mo')):
            continue

        if entry.Pheno == 'control':
            status = ['unaffected']
        else:
            status = ['HP:0000717']

        # only integer NVIQ values are checked against the threshold
        if isinstance(entry.NVIQ, int) and entry.NVIQ < 70:
            status.append('HP:0001249')

        persons.add(Person(entry.SampleID + '|asd_cohorts', entry.Sex, status, study))
    return persons
示例#9
0
def open_cohort(path=None):
    """ load a cohort from a gzipped, tab-separated file

    Args:
        path: path to the gzipped cohort file. Falls back to COHORT_PATH when
            not given (or otherwise falsy).

    Returns:
        list of Person objects, one per line after the header line
    """
    def parse(line):
        # fields are: person ID, sex, comma-separated phenotypes, and
        # comma-separated studies
        person_id, sex, phenotypes, studies = line.strip('\n').split('\t')
        return Person(person_id, sex, phenotypes.split(','), studies.split(','))

    with gzip.open(path or COHORT_PATH, 'rt') as handle:
        handle.readline()  # skip the header line
        return [parse(line) for line in handle]
示例#10
0
def open_jonsson_nature_cohort():
    """ get cohort for Jonsson et al Nature 2017

    Supplementary Table 4 from:
    Jonsson et al. Nature 549: 519-522, doi: 10.1038/nature24018

    Sex is not read directly from the table, so it is inferred: anyone with a
    chrX de novo call is treated as female, and each remaining individual is
    randomly assigned female at a small probability (see the comments below).
    The random seed is fixed, so the assignments are repeatable.

    Returns:
        set of Person objects, all with the 'unaffected' phenotype
    """
    random.seed(1)

    # open the zipfile, then open a tarfile inside the zip, then extract and
    # read from file inside the tar. Context managers release the temporary
    # download and every archive handle once the table is in memory.
    path = 'nature24018-s2/Aging_Oocytes_Supplementary_Table_DNMs.tar.gz'
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as archive, \
                archive.open(path) as tar_handle, \
                tarfile.open(fileobj=tar_handle) as tar:
            member = tar.getmember('decode_DNMs/decode_DNMs.tsv')
            data = pandas.read_table(tar.extractfile(member))

    data['person_id'] = data['Proband_nr'].astype(str)
    data['person_id'] += '|jonsson'
    data['chrom'] = data['Chr'].astype('str')

    # remove individuals who were children of other probands
    child_ids = set(data.person_id[data.Phase_source == 'three_generation'])
    data = data[~data.person_id.isin(child_ids)]

    # we need to know which individuals are female. From the de novo table:
    #   - 99% of chrX dnms have alt fractions between 0.3-0.75
    #   - we expect 5% of DNMs to occur on chrX, but only have half that at 2%
    #   - only half (818 of 1548) of individuals have a chrX de novo call
    # These imply only females have chrX de novo calls. ChrX is 5% of the genome,
    # so we expect ~3.5 de novo calls on chrX per person. The chance of a person
    # having 0 chrX de novo calls is 3%, so the number of females should be ~3%
    # higher ((818 / (1 - 0.03)) - 818 = 25). Of the remaining individuals,
    # each is 3.4% likely to be female (25 / (1548 - 818) = 0.0342)
    females = set(data.person_id[data.chrom == 'chrX'])
    # the expected count must be subtracted FROM, not the other way round,
    # otherwise the probability is negative and nobody is ever sampled female
    missing_n = (len(females) / (1 - 0.0301)) - len(females)
    remainder_n = len(set(data.person_id)) - len(females)
    # with nobody left over the probability is never used, so avoid dividing
    # by zero
    female_remainder = missing_n / remainder_n if remainder_n else 0.0

    phenotype = ['unaffected']
    study = ['10.1038/nature24018']

    persons = set()
    # the table appears to hold one row per de novo call, so walk the unique
    # IDs. This gives each individual exactly one sex assignment.
    for person_id in data.person_id.unique():
        # individuals have two chances to be female, 1) if their sample ID is
        # in the female group, or 2) 3.4% of the remainder are female.
        if person_id in females or random.random() < female_remainder:
            sex = 'female'
        else:
            sex = 'male'
        persons.add(Person(person_id, sex, phenotype, study))

    return persons
示例#11
0
def subcohort(rows, counts, prefix, suffix, study):
    ''' build the persons for one subcohort, padded with mock probands

    Sexes are drawn at random for each row, at the male fraction given by the
    counts.

    Args:
        rows: table with a 'person_id' column, iterated with iterrows()
        counts: dict of individuals per sex. Needs a 'male' entry, and the
            values are summed to get the subcohort size.
        prefix: prefix for mock sample IDs
        suffix: suffix for mock sample IDs
        study: study value passed through to each Person

    Returns:
        set of Person objects, all with the HP:0001249 phenotype
    '''
    phenotype = ['HP:0001249']
    cohort_size = sum(counts.values())
    male_fraction = counts['male'] / cohort_size

    persons = set()
    for _, entry in rows.iterrows():
        if random.random() < male_fraction:
            sex = 'male'
        else:
            sex = 'female'
        persons.add(Person(entry['person_id'], sex, phenotype, study))

    # account for individuals without exomic de novo mutations
    persons = add_mock_probands(persons, cohort_size, prefix, suffix, phenotype, study)
    return persons
示例#12
0
def open_homsy_science_cohort():
    """ gets individual level data for Homsy et al congenital heart disease

    Supplementary Database 1 from:
    Homsy et al. Science 350: 1262-1266, doi: 10.1126/science.aac9396

    Sex is not read from the table. It is drawn at random for each row, after
    fixing the random seed so that runs are repeatable.

    Returns:
        set of Person objects
    """
    random.seed(1)

    # context managers make sure the temporary download and the handle for
    # the zip member are closed once the spreadsheet has been read
    with tempfile.NamedTemporaryFile() as zipf:
        download_file(url, zipf.name)
        with ZipFile(zipf.name) as zipped, \
                zipped.open('homsy_database_S01.xlsx') as handle:
            data = pandas.read_excel(handle, 'Database S1', skiprows=1)

    # drop the first row, then name the columns which lack usable headers
    data = data.drop(0, axis=0)
    data = data.rename(
        columns={
            'NDD determination if PCGC cohort': 'Developmental Delay',
            'Unnamed: 6': 'Learning Disability',
            'Unnamed: 7': 'Mental Retardation',
            'Unnamed: 8': 'Autism Spectrum'
        })

    data['person_id'] = data['Blinded ID']
    data['person_id'] += '|homsy'
    study = ['10.1126/science.aac9396']

    # estimate male fraction from proportion in Zaidi et al 2013, since the
    # sex isn't provided for individuals, nor the count of people per sex.
    male_fraction = 220 / (220 + 142)

    persons = set()
    for _, row in data.iterrows():
        status = ['HP:0001627']
        sex = 'male' if random.random() < male_fraction else 'female'
        if row['Developmental Delay'] == 'Yes':
            status.append('HP:0001263')
        if row['Mental Retardation'] == 'Yes':
            status.append('HP:0001249')
        if row['Autism Spectrum'] == 'Yes':
            status.append('HP:0000717')

        person = Person(row.person_id, sex, status, study)
        persons.add(person)

    return persons
示例#13
0
def open_mcrae_nature_cohort():
    """ load the proband details for McRae et al., Nature 2017

    McRae et al Nature 2017 542:433-438
    doi: 10.1038/nature21062
    Supplementary table S1.

    Returns:
        set of Person objects, padded with mock probands so that the cohort
        reaches 4293 individuals
    """
    table = pandas.read_excel(url, sheet_name='Supplementary Table 1')
    table['Individual ID'] += '|DDD'

    phenotype = ['HP:0001249']
    study = ['10.1038/nature21062']

    persons = set()
    for _, entry in table.iterrows():
        persons.add(Person(entry['Individual ID'], entry.Sex, phenotype, study))

    # fill in the individuals who are not listed in the table
    return add_mock_probands(persons, 4293, 'ddd', 'DDD', phenotype, study)
示例#14
0
def open_de_ligt_cohort():
    """ get individuals from De Ligt et al., 2012

    De Ligt et al., (2012) N Engl J Med 367:1921-1929
    doi:10.1056/NEJMoa1206524
    Proband details sourced from 'Clinical description of patients' section in
    supplementary material.

    Returns:
        set of Person objects, all with the HP:0001249 phenotype
    """
    # the context manager closes (and so removes) the temporary download as
    # soon as the table has been extracted.
    # NOTE(review): assumes extract_table() returns a fully loaded table that
    # no longer needs the file - confirm
    with tempfile.NamedTemporaryFile() as temp:
        download_with_cookies(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|de_ligt'

    status = ['HP:0001249']
    study = ['10.1056/NEJMoa1206524']
    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons
示例#15
0
def open_epi4k_ajhg_cohort():
    """ gets individual level data for Epi4K cohort

    Supplementary Table 6 from:
    Epi4K AJHG 95: 360-370, doi: 10.1016/j.ajhg.2014.08.013

    Returns:
        set of Person objects, all with the HP:0001250 phenotype
    """
    # the context manager closes (and so removes) the temporary download as
    # soon as the table has been extracted.
    # NOTE(review): assumes extract_table() returns a fully loaded table that
    # no longer needs the file - confirm
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|epi4k'
    status = ['HP:0001250']
    study = ['10.1016/j.ajhg.2014.08.013']
    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons
示例#16
0
def open_rauch_cohort():
    """ get person data for Rauch et al. intellectual disability exome study

    Rauch et al. (2012) Lancet 380:1674-1682
    doi: 10.1016/S0140-6736(12)61480-9
    Supplementary table 1

    Returns:
        set of Person objects, all with the HP:0001249 phenotype
    """
    # the context manager closes (and so removes) the temporary download as
    # soon as the table has been extracted.
    # NOTE(review): assumes extract_table() returns a fully loaded table that
    # no longer needs the file - confirm
    with tempfile.NamedTemporaryFile() as temp:
        download_file(url, temp.name)
        data = extract_table(temp)

    data['person_id'] += '|rauch'

    status = ['HP:0001249']
    study = ['10.1016/S0140-6736(12)61480-9']
    persons = set()
    for _, row in data.iterrows():
        person = Person(row.person_id, row.sex, status, study)
        persons.add(person)

    return persons
示例#17
0
def open_iossifov_nature_cohort():
    """ get proband details from Iossifov et al., Nature 2014

    Nature (2014) 515: 216-221, doi:10.1038/nature13908
    Supplementary table S1.

    Returns:
        set of Person objects for the family members given by get_members()
        for each row of the families table
    """
    # the context manager removes the download and the extracted files once
    # the table has been read into memory
    with tempfile.TemporaryDirectory() as tempdir:
        zipf = os.path.join(tempdir, 'temp.zip')
        download_file(url, zipf)

        with ZipFile(zipf) as zipped:
            zipped.extractall(tempdir)

        path = os.path.join(tempdir, 'nature13908-s2',
                            'Supplementary Table 1.xlsx')
        data = pandas.read_excel(path, 'Supplement-T1-familiesTable')

    study = ['10.1038/nature13908']

    persons = set()
    for _, row in data.iterrows():
        fam = row.familyId
        for member in get_members(row):
            # member codes starting with 'p' are probands, anything else is
            # handled as an unaffected sibling
            is_proband = member[0] == 'p'
            sex = row['probandGender'] if is_proband else row['siblingGender']

            status = ['HP:0000717'] if is_proband else ['unaffected']
            if is_proband and (row.probandVIQ < 70 or row.probandNVIQ < 70):
                status.append('HP:0001249')
            # any code other than 'M' is recorded as female
            sex = 'male' if sex == 'M' else 'female'
            person_id = f'{fam}.{member}|asd_cohorts'

            person = Person(person_id, sex, status, study)
            persons.add(person)

    return persons
示例#18
0
def open_lelieveld_cohort():
    """ load the proband details for Lelieveld et al., 2016

    Lelieveld et al. (2016) Nature Neuroscience 19:1194-1196
    doi: 10.1038/nn.4352
    Supplementary table S2.

    One person is made for every integer from 1 up to the largest
    'Patient key' in the table. Sexes are drawn at random, after fixing the
    random seed so that runs are repeatable.

    Returns:
        set of Person objects, all with the HP:0001249 phenotype
    """
    random.seed(1)
    table = pandas.read_excel(url, sheet_name='Supplementary Table 2')

    phenotype = ['HP:0001249']
    study = ['10.1038/nn.4352']

    largest_key = max(table['Patient key'])
    ids = [f'{key}|lelieveld' for key in range(1, largest_key + 1)]
    n_male, n_female = 461, 359
    male_fraction = n_male / (n_male + n_female)

    persons = set()
    for person_id in ids:
        if random.random() < male_fraction:
            sex = 'male'
        else:
            sex = 'female'
        persons.add(Person(person_id, sex, phenotype, study))

    return persons
示例#19
0
def open_sanders_nature_cohort():
    """ load individuals from the Sanders et al Nature 2012 cohort

    Sanders et al. (2012) Nature 485:237-241
    doi: 10.1038/nature10945
    Supplementary table S1

    Returns:
        set of Person objects for the non-parental samples
    """
    table = pandas.read_excel(url, sheet_name='Sheet1', engine='xlrd')

    study = ['10.1038/nature10945']
    persons = set()
    for _, entry in table.iterrows():
        # ignore parental samples
        if entry.Sample.endswith(('fa', 'mo')):
            continue

        if entry.Role == 'Unaffected_Sibling':
            status = ['unaffected']
        else:
            status = ['HP:0000717']

        persons.add(Person(entry.Sample + '|asd_cohorts', entry.Gender.lower(), status, study))

    return persons