def get_pop_curse_flag(self, mode, hpo):
    '''
    Find the populations that dominate the variants of HPO-positive
    patients but not those of HPO-negative patients ("pop curse").

    mode: 'r' (uses the 'gnomad_hom_f' variant field) or
          'd' (uses the 'gnomad_af' variant field).
    hpo:  key into self.genons[mode]; also tested for membership in
          self.patient_info[patient] to label a patient 'pos' or 'neg'.

    Returns a list of populations, or None when there are fewer than
    self.pop_flags[1] 'pos' variants or when no population qualifies.
    Raises ValueError if mode is neither 'r' nor 'd'.
    '''
    # Validate mode before it is used as a key, so that a bad mode gives
    # the explicit ValueError rather than a KeyError from self.genons.
    if mode == 'r':
        tp = 'gnomad_hom_f'
    elif mode == 'd':
        tp = 'gnomad_af'
    else:
        msg = 'mode has to be either r or d'
        raise ValueError(msg)

    genon = self.genons[mode][hpo]
    # get inds with small p (first column of genon)
    s_p_inds = np.where(genon[:, 0] <= self.pop_check_p)[0]

    # get patients, then variants
    variants = {'pos': [], 'neg': []}
    variant_info = self.patients_variants['variants']
    gnomad_cut = self.gnomad_step
    for ind in s_p_inds:
        patients = self.patient_map['patient_map'][mode]["{},0".format(
            ind)][0]
        # half-open cadd window [low, high) belonging to this index
        cadd_low = self.cadd_step * ind
        cadd_high = self.cadd_step * (ind + 1)
        for p in patients:
            curse = 'pos' if hpo in self.patient_info[p] else 'neg'
            for v in self.patients_variants['patients'][p]:
                info = variant_info[v]
                rare_enough = info[tp] < gnomad_cut
                in_cadd_bin = cadd_low <= info['cadd'] < cadd_high
                if rare_enough and in_cadd_bin:
                    variants[curse].append(v)

    if len(variants['pos']) < self.pop_flags[1]:
        # number of variants are too few
        return None

    # annotate variants using gnomad_utils, and find pop curse
    # if pos and neg find same most freq pop, return None
    gnomad_freqs = gnomad_utils.overall_freqs(
        variants['pos'] + variants['neg'], self.gnomad_path)
    pop_curse = {'pos': set(), 'neg': set()}
    for k, v in variants.items():
        C = Counter()
        for vv in v:
            C.update(gnomad_freqs[vv]['most_freq_pops'])
        if not C:
            continue
        # ties: every population sharing the highest count is kept
        top_count = max(C.values())
        if top_count / len(v) >= self.pop_flags[0]:
            pop_curse[k] = {
                pop for pop, count in C.items() if count == top_count
            }
    return list(pop_curse['pos'] - pop_curse['neg']) or None
Пример #2
0
 def gnomad(self):
     # Lazily build and cache the gnomad annotation for every variant in
     # self._v. The local database is consulted first; anything it lacks
     # is annotated through gnomad_utils and written back to the database.
     if getattr(self, '_gnomad', None) is not None:
         return self._gnomad
     if not self.path_to_gnomad:
         raise ValueError(
             'Required to provide a path to gnomad for annotation')

     # collect whatever the database already holds
     cursor = self.db_conn.cursor()
     rows = batch_query(cursor, 'variants', list(self._v.values()))
     stored = {}
     for row in rows:
         entry = dict_factory(cursor, row)
         if entry['gnomad']:
             stored[entry['id']] = json.loads(entry['gnomad'])

     # split variants into already-annotated and still-to-query
     annotated = {}
     missing = {}
     for key, variant_id in self._v.items():
         if stored.get(variant_id) is not None:
             annotated[key] = stored[variant_id]
         else:
             missing[key] = variant_id

     if missing:
         print('querying gnomad')
         # variants are queried in the groups yielded by get_chrom_vars
         fetched = {}
         for chrom_vars in get_chrom_vars(missing.values()):
             chrom_freqs = gnomad_utils.overall_freqs(
                 chrom_vars, self.path_to_gnomad)
             fetched.update(chrom_freqs)
         # persist the fresh annotations
         payload = {
             variant_id: [json.dumps(freqs)]
             for variant_id, freqs in fetched.items()
         }
         update_db(self.db_conn, 'variants', ['gnomad'], payload)
         # populate gnomad for the variants that were missing
         for key, variant_id in missing.items():
             annotated[key] = fetched.get(variant_id, None)

     self._gnomad = annotated
     return self._gnomad
Пример #3
0
def main(params):
    '''
    Collect every variant hitting params.gene from the per-sample CSVs in
    PATH_TO_CSVS and write it out.

    Without params.cadd, a minimal five-column VCF '<gene>.vcf' is written.
    With params.cadd (path to a tab-separated CADD result file), a text
    report '<gene>.txt' is written that combines the CSV annotations,
    gnomad frequencies, CADD scores and the patient info from PATIENT_CSV.
    '''
    # genotype dict: number of '1' alleles in the GT field -> label
    genotype_dict = {1: 'het', 2: 'hom'}

    def _position(variant_id):
        # variant ids look like chrom-pos-ref-alt; order by position
        return int(variant_id.split('-')[1])

    if not params.cadd:
        # no cadd provided, write to vcf
        outfile = '{}.vcf'.format(params.gene)
    else:
        outfile = '{}.txt'.format(params.gene)

    # read patient info, keyed by the first column
    patient = {}
    patient_header = []
    # newline='' is what the csv module expects of its input files
    with open(PATIENT_CSV, 'rt', encoding='utf-8-sig', newline='') as inf:
        for row in csv.reader(inf):
            if not row:
                # blank line: nothing to parse
                continue
            row = row[:11]
            if not patient_header:
                patient_header = row
                continue
            record = dict(zip(patient_header, row))
            del record['IRDC ID']
            patient[row[0]] = record

    variants = set()
    report = {}
    # sorted so that the report does not depend on directory order
    for csvfile in sorted(os.listdir(PATH_TO_CSVS)):
        if not csvfile.endswith('.csv'):
            continue
        # the sample id doubles as the genotype column name
        sample = csvfile.split('.csv')[0]
        header = []
        with open(os.path.join(PATH_TO_CSVS, csvfile), 'rt',
                  newline='') as inf:
            for row in csv.reader(inf):
                if not row:
                    continue
                if not header:
                    header = row
                    continue
                record = dict(zip(header, row))
                # has gene?
                genes = record['HUGO.no.splice.info'].split(',')
                if params.gene not in genes:
                    continue
                variant = CommonFuncs.find_leftmost_synonymous_variant(
                    CommonFuncs.clean_variant(
                        record['clean.signature'].replace('_', '-')))
                variants.add(variant)
                genotype = genotype_dict.get(
                    Counter(record[sample].split(':')[0])['1'], 'unknown')
                if variant not in report:
                    # the first record seen supplies the annotations
                    report[variant] = record
                    report[variant]['samples'] = []
                report[variant]['samples'].append({
                    'id': sample,
                    'genotype': genotype
                })

    ordered_variants = sorted(variants, key=_position)

    if not params.cadd:
        # sort and write vcf
        with open(outfile, 'wt') as outf:
            # write vcf header; the VCF spec requires a fileformat line
            outf.write('##fileformat=VCFv4.1\n')
            outf.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT']) + '\n')
            for variant in ordered_variants:
                chrom, pos, ref, alt = variant.split('-')
                row = [chrom, pos, '.', ref, alt]
                outf.write('\t'.join(row) + '\n')
        return

    # get gnomads
    gnomads = gnomad_utils.overall_freqs(list(variants), PATH_TO_GNOMAD)
    # get cadd: first four columns form the variant id, last is the score
    cadds = {}
    with open(params.cadd, 'rt') as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            row = line.rstrip().split('\t')
            cadds['-'.join(row[:4])] = row[-1]
    # write report
    with open(outfile, 'wt') as outf:
        for variant in ordered_variants:
            entry = report[variant]
            outf.write(variant + ':\n')
            outf.write('\tFilter: {}\n'.format(entry['FILTER']))
            outf.write('\t{}\n'.format(entry['AAChange']))
            outf.write(
                '\tPolyphen: {}, SIFT: {}, MutationTaster: {}\n'.format(
                    entry['LJB_PolyPhen2_Pred'],
                    entry['LJB_SIFT_Pred'],
                    entry['LJB_MutationTaster_Pred']))
            outf.write('\tgnomad_af:{}, gnomad_hom_f:{}, cadd:{}\n'.format(
                gnomads[variant]['gnomad_af'],
                gnomads[variant]['gnomad_hom_f'], cadds[variant]))
            for sample in entry['samples']:
                outf.write('\t{} ({}):\n'.format(sample['id'],
                                                 sample['genotype']))
                # the study number is the 4th '_'-separated field of the
                # sample id; ids without it simply get no patient info
                id_parts = sample['id'].split('_')
                study_number = id_parts[3] if len(id_parts) > 3 else None
                if study_number in patient:
                    for h in patient_header:
                        if h in patient[study_number]:
                            outf.write('\t\t{}: {}\n'.format(
                                h, patient[study_number][h]))
                outf.write('\n')
Пример #4
0
def get_vcf_df(**kwargs):
    '''
    use tabix/bcftools to subset variants and patients. then according to
    p/v_cutoff to get bad_vs, bad_ps to remove

    All arguments are keyword arguments; see compulsory_keys below.

    Returns None when the subset VCF is empty, otherwise a tuple
    (genotype_df, cover_df, gnomad_freqs) where genotype_df has the bad
    variants (rows) and bad patients (columns) dropped. cover_df and
    gnomad_freqs are returned unfiltered.
    '''
    compulsory_keys = {
        'vcf_file',
        'chrom',
        'start',
        'stop',
        'unrelated_file',
        'human_fasta_ref',
        'v_cutoff',
        'gnomad_cutoff',
        'p_cutoff',
        'patient_mini',
        # read below when annotating with gnomad, so it is required too
        'gnomad_path',
    }
    # check args
    helper.check_args(compulsory_keys, kwargs, 'get_vcf_df')
    position = '{chrom}:{start}-{stop}'.format(**kwargs)
    ps1 = subprocess.Popen(('tabix', '-h', kwargs['vcf_file'], position),
                           stdout=subprocess.PIPE)
    # subset on unrelated samples, and normalise
    ps2 = subprocess.Popen(('bcftools', 'view', '-Ou', '-S',
                            kwargs['unrelated_file'], '-f', 'PASS'),
                           stdin=ps1.stdout,
                           stdout=subprocess.PIPE)
    # close our copy of each pipe once the next stage holds it, so an
    # upstream stage gets SIGPIPE if a downstream one exits early
    ps1.stdout.close()
    ps3 = subprocess.Popen(('bcftools', 'norm', '-Ou', '-m', '-any'),
                           stdin=ps2.stdout,
                           stdout=subprocess.PIPE)
    ps2.stdout.close()
    try:
        normed_vcf = subprocess.check_output(
            ['bcftools', 'norm', '-Ov', '-f', kwargs['human_fasta_ref']],
            stdin=ps3.stdout)
    finally:
        ps3.stdout.close()
        # reap the pipeline stages so they do not linger as zombies
        for proc in (ps1, ps2, ps3):
            proc.wait()
    # get vcf df. genotype -1 = missing, 0 = wildtype, 1 = het, 2 = hom
    genotype_df = read_vcf(normed_vcf)
    # empty vcf? early return
    if genotype_df.empty:
        return None
    # get poorly covered variants and individuals
    # change df to cover_df: 1 = genotype called, 0 = missing
    cover_df = genotype_df.copy()
    cover_df[cover_df >= 0] = 1
    cover_df[cover_df == -1] = 0
    pm = cover_df.mean()

    # rid of patients not in patient_mini
    bad_ps = set(pm[pm < kwargs['p_cutoff']].index)
    bad_ps.update(set(pm.index) - set(kwargs['patient_mini'].keys()))
    vm = cover_df.T.mean()
    bad_vs = set(vm[vm < kwargs['v_cutoff']].index)
    # annotate vs with gnomad (a list, as the other callers pass)
    vs = [i for i in vm.index if i not in bad_vs]
    gnomad_freqs = gnomad_utils.overall_freqs(vs, kwargs['gnomad_path'])

    # remove variants with 'SEGDUP' filter. This gives a lot of noise for recessive
    # analysis. For example IGHV3-38 - ENST00000390618, 14-106866588-T-C
    bad_vs.update(
        i for i, v in gnomad_freqs.items()
        if any(v['filters'][source] is not None
               and 'SEGDUP' in v['filters'][source]
               for source in ('exome', 'genome')))
    # in fact, many variants have very high af, but 0 hom_f, such as
    # 6-32548641-A-T, which has no 'SEGDUP' filter. Remove those
    # hard filtering for the time being. There might be better ways
    # (gnomad_af may be None; comparing None with a float raises on Python 3)
    bad_vs.update(
        i for i, v in gnomad_freqs.items()
        if v['gnomad_af'] is not None and v['gnomad_af'] > 0.01
        and v['gnomad_hom_f'] == 0.0)

    # add to bad_vs gnomad_hom_af >= gnomad_cutoff,
    #  and those not covered by gnomad_path
    # Note that if gnomad_hom_af >= gnomad_cutoff, then gnomad_af >= gnomad_cutoff
    #  but not vice versa
    # A missing gnomad_hom_f is treated like a missing gnomad_af.
    bad_vs.update(
        i for i, v in gnomad_freqs.items()
        if v['gnomad_af'] is None or v['gnomad_hom_f'] is None
        or v['gnomad_hom_f'] >= kwargs['gnomad_cutoff'])
    # per-variant sum of the positive genotype values
    vs_count = np.sum(genotype_df[genotype_df > 0], axis=1)
    bad_vs.update(
        i for i in gnomad_freqs
        if vs_count[i] > 3 and gnomad_freqs[i]['pop_filter'])
    # then drop bad_ps and bad_vs
    genotype_df.drop(bad_vs, inplace=True)
    genotype_df.drop(bad_ps, inplace=True, axis=1)
    return (genotype_df, cover_df, gnomad_freqs)
Пример #5
0
def pop_annotate(line_cache, variant_cache, header, fields, outf, options):
    '''
    Annotate cached variants with population frequencies and write the
    cached VCF lines to outf with a POPF entry appended to INFO.

    Which sources are used depends on the keys present in
    options['pop_freqs']: 'gnomad_path', 'bravo_vcf', 'kaviar_vcf'.
    variant_cache is updated in place; frequencies that are unavailable
    are stored as ''. For every ALT allele of a line, POPF carries
    'alt|<field values>' using the names listed in fields.
    '''
    import gnomad_utils
    import bravo_utils
    import kaviar_utils

    pop_freqs = options['pop_freqs']

    # gnomad
    if 'gnomad_path' in pop_freqs:
        gnomads = gnomad_utils.overall_freqs(list(variant_cache),
                                             pop_freqs['gnomad_path'])
        for variant, cached in variant_cache.items():
            freqs = gnomads[variant]
            allele_freq = freqs['gnomad_af']
            hom_freq = freqs['gnomad_hom_f']
            cached['gnomad_af'] = '' if allele_freq is None else allele_freq
            cached['gnomad_hom_f'] = '' if hom_freq is None else hom_freq

    # bravo
    if 'bravo_vcf' in pop_freqs:
        bravos = bravo_utils.bravo(list(variant_cache),
                                   pop_freqs['bravo_vcf'])
        for variant, cached in variant_cache.items():
            if variant in bravos:
                hit = bravos[variant]
                cached['bravo_af'] = hit['af']
                cached['bravo_hom_f'] = hit['Hom'] * 2 / hit['an']
            else:
                cached['bravo_af'] = ''
                cached['bravo_hom_f'] = ''

    # kaviar
    if 'kaviar_vcf' in pop_freqs:
        kaviars = kaviar_utils.kaviar(list(variant_cache),
                                      pop_freqs['kaviar_vcf'])
        for variant, cached in variant_cache.items():
            cached['kaviar_af'] = (kaviars[variant]['af']
                                   if variant in kaviars else '')

    # rewrite every cached line with the POPF annotation added to INFO
    for line in line_cache:
        values = line.rstrip().split('\t')
        record = dict(zip(header, values))
        original_info = record['INFO']
        per_alt = []
        for alt in record['ALT'].split(','):
            raw_id = '-'.join(
                (record['CHROM'], record['POS'], record['REF'], alt))
            v_id = clean_variant(raw_id,
                                 human_ref_pysam=options['human_ref_pysam'])
            annotations = [str(variant_cache[v_id][f]) for f in fields]
            per_alt.append('|'.join([alt] + annotations))
        record['INFO'] = ';'.join(
            [original_info, 'POPF=' + ','.join(per_alt)])
        outf.write('\t'.join([record[h] for h in header]) + '\n')