示例#1
0
def get_ann_from_output_snpeff(temp_out_name):
    """Collect the SnpEff ANN allele and annotation columns of a VCF.

    The VCF at ``temp_out_name`` is read with scikit-allel's ``ANNTransformer``,
    requesting ``num_ann_max`` ANN entries per variant (``num_ann_max`` is a
    name resolved outside this function).

    Returns:
        pandas.DataFrame: the ``ANN_Allele`` columns followed by the
        ``ANN_Annotation`` columns, with column labels renumbered 0..N-1.
    """
    ann_data = allel.read_vcf(
        temp_out_name,
        fields='ANN',
        transformers=allel.ANNTransformer(),
        numbers={'ANN': num_ann_max},
    )

    frames = [
        pd.DataFrame(data=ann_data[key])
        for key in ('variants/ANN_Allele', 'variants/ANN_Annotation')
    ]
    combined = pd.concat(frames, axis=1)

    # Both frames carry their own 0-based labels; renumber to avoid duplicates.
    combined.columns = range(0, combined.shape[1])
    return combined
示例#2
0
 def __get_variants_from_vcf(cls, vcf: str) -> Optional[Dict[str, Any]]:
     """Read the fields listed in ``cls.FIELD_NAMES`` from a VCF file.

     Args:
         vcf: Path of the VCF file to read.

     Returns:
         The result of ``allel.read_vcf`` with the ANN transformer applied.

     Raises:
         FileNotFoundError: If reading the file raises ``IOError``; the
             original error is attached as the cause.
     """
     # variants is None precisely when filtered vcf file has no variants
     try:
         variants = allel.read_vcf(vcf,
                                   fields=cls.FIELD_NAMES,
                                   transformers=allel.ANNTransformer())
     except IOError as err:
         # Chain the underlying error so the real reason (missing file,
         # permissions, ...) stays visible in the traceback.
         raise FileNotFoundError("File " + vcf +
                                 " not found or cannot be opened.") from err
     return variants
示例#3
0
def save_mutect2(*args):
    """Build a per-variant table from a patient's annotated Mutect2 VCF.

    Args:
        *args: ``args[0]`` is the patient identifier and ``args[1]`` the sample
            identifier; both are interpolated into the input directory path.

    Returns:
        pandas.DataFrame: one row per variant with the selected core and ANN
        columns, plus one column per entry of ``alt_cols`` holding the
        comma-joined per-ALT values (missing values rendered as ``"-"``).
    """
    patient = args[0]
    sample = args[1]

    path_to_mutect2 = f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/s{sample}"
    mutect2_name = "output_transcr_new_predicted_dbsnp.vcf"
    vcf_path = os.path.join(path_to_mutect2, mutect2_name)

    # First pass: find the largest number of ALT alleles on any record so the
    # later reads can request exactly that many per-ALT columns.
    numalt_df = allel.vcf_to_dataframe(vcf_path,
                                       fields=["numalt"],
                                       alt_number=1)
    alt_len = max(numalt_df["numalt"])

    base_cols = [
        "CHROM", "POS", "ID", "REF", "DP", "FILTER_PASS", "ANN_Annotation",
        "ANN_Annotation_Impact", "ANN_Gene_Name", "ANN_Gene_ID", "ANN_HGVS_c",
        "ANN_HGVS_p", "ANN_AA_pos"
    ]
    alt_cols = [
        "ALT", "dbNSFP_Polyphen2_HVAR_score", "dbNSFP_SIFT_score",
        "dbNSFP_MetaLR_score", "dbNSFP_Polyphen2_HDIV_score",
        "dbNSFP_Uniprot_acc", "dbNSFP_CADD_phred",
        "dbNSFP_Polyphen2_HDIV_pred", "dbNSFP_MutationTaster_score",
        "dbNSFP_SIFT_pred", "dbNSFP_MutationTaster_pred",
        "dbNSFP_Polyphen2_HVAR_pred", "dbNSFP_MetaLR_pred"
    ]
    # Suffixed per-ALT column names, e.g. ALT_1, ALT_2, ...
    # NOTE(review): with alt_len == 1 scikit-allel may emit unsuffixed columns
    # ("ALT" rather than "ALT_1") - confirm before relying on that case.
    per_alt_cols = [
        f"{col}_{k}" for col in alt_cols for k in range(1, alt_len + 1)
    ]
    col_list = base_cols + per_alt_cols

    mutect2_wo_ann = allel.vcf_to_dataframe(vcf_path,
                                            fields="*",
                                            alt_number=alt_len,
                                            exclude_fields="ANN")
    mutect2_w_ann = allel.vcf_to_dataframe(vcf_path,
                                           fields="ANN",
                                           alt_number=alt_len,
                                           transformers=allel.ANNTransformer())

    # Work on an explicit copy so the fill/assign steps below never operate on
    # a view of the concatenated frame.
    mutect2 = pd.concat([mutect2_wo_ann, mutect2_w_ann], axis=1)[col_list].copy()
    mutect2 = mutect2.fillna("-")

    # Collapse every group of per-ALT columns into one comma-separated column.
    # The number of joined columns follows alt_len instead of assuming three.
    for col in alt_cols:
        pieces = [
            mutect2[f"{col}_{k}"].map(str) for k in range(1, alt_len + 1)
        ]
        combined = pieces[0]
        for piece in pieces[1:]:
            combined = combined + "," + piece
        mutect2[col] = combined

    mutect2 = mutect2.drop(columns=per_alt_cols)
    return mutect2
    # NOTE(review): the statements below are unreachable because of the return
    # above, so nothing is annotated with pfam or pickled despite the function
    # name. Left as found - confirm whether the early return is intentional.
    mutect2 = pfam_annotate(mutect2)
    mutect2.to_pickle(os.path.join(path_to_mutect2, "mutect2.pkl"))
示例#4
0
def parse_vcf(vcf, rs_ids, bed_file, outputdir, sampleId, vcftools):
    """Slice a VCF to the regions of a BED file and collect the variants of interest.

    Args:
        vcf: Path of the gzipped input VCF passed to vcftools via ``--gzvcf``.
        rs_ids: Mapping that is iterated for rsIDs (its keys) and whose values
            are compared against ``"<chrom>:<pos>"`` location strings.
        bed_file: BED file passed to vcftools via ``--bed``.
        outputdir: Directory in which the sliced VCF is written.
        sampleId: Sample identifier used as prefix of the sliced VCF name.
        vcftools: Path or name of the vcftools executable.

    Returns:
        tuple: ``(ids_found_in_patient, temp_vcf)`` where the first item is a
        pandas.DataFrame with one row per matched variant and the second is
        the path of the sliced VCF.

    Raises:
        IOError: If the sliced VCF already exists.
        ValueError: If a genotype call of the first sample is not 0, 1 or 2.
        SystemExit: If the sliced VCF cannot be read.
    """
    match_on_rsid = 0
    match_on_location = 0
    result_columns = ['position_GRCh37', 'ref_GRCh37', 'alt_GRCh37', 'rsid',
                      'variant_annotation', 'gene', 'filter']

    # Slice VCF on bed file
    temp_vcf_prefix = outputdir + '/' + sampleId + '_PGx'
    temp_vcf = temp_vcf_prefix + '.recode.vcf'

    # Check if output vcf does not already exist
    if os.path.exists(temp_vcf):
        # temp_vcf already carries the ".recode.vcf" suffix; do not repeat it.
        raise IOError("Temporary VCF file " + temp_vcf + " already exists. Exiting.")
    subprocess.run([vcftools, '--gzvcf', vcf, '--bed', bed_file, '--out',
                    temp_vcf_prefix, '--recode', '--recode-INFO-all'])
    print("[INFO] Subprocess completed.")

    # Read in VCF file
    try:
        variants = allel.read_vcf(temp_vcf, fields=['samples', 'calldata/GT', 'variants/ALT', 'variants/CHROM',
                                                    'variants/FILTER', 'variants/ID', 'variants/POS',
                                                    'variants/QUAL', 'variants/REF', 'variants/ANN'],
                                  transformers=allel.ANNTransformer())
    except IOError:
        sys.exit("[ERROR] File " + temp_vcf + " not found or cannot be opened.")

    rows = []
    # read_vcf yields None when the sliced file holds no variant records;
    # in that case there is nothing to match and the result stays empty.
    if variants is not None:
        for i, rs_number in enumerate(variants['variants/ID']):
            chrom = variants['variants/CHROM'][i]
            pos = variants['variants/POS'][i]
            location = str(chrom) + ":" + str(pos)

            # An ID field may hold several ';'-separated ids; keep the rsIDs.
            if ";" in rs_number:
                rs_id_filt = [rs for rs in rs_number.split(";") if rs.startswith("rs")]
            else:
                rs_id_filt = [rs_number]

            rsid_match = any(rs in rs_id_filt for rs in rs_ids)
            if not rsid_match and location not in rs_ids.values():
                continue
            if rsid_match:
                match_on_rsid += 1
            else:
                match_on_location += 1

            filter_status = "PASS" if variants['variants/FILTER_PASS'][i] else "FILTERED"
            alt = variants['variants/ALT'][i]
            ref = variants['variants/REF'][i]

            # Translate the first sample's allele indices into allele strings.
            # Alleles are kept as separate items so multi-base REF/ALT values
            # are reported whole rather than as single characters.
            alleles = []
            for geno in variants['calldata/GT'][i][0]:
                if geno == 0:
                    alleles.append(ref)
                elif geno == 1:
                    alleles.append(alt[0])
                elif geno == 2:
                    alleles.append(alt[1])
                else:
                    print(geno)
                    raise ValueError("Genotype looks weird")

            rows.append({
                'position_GRCh37': location,
                'rsid': ";".join(rs_id_filt),
                'ref_GRCh37': alleles[0],
                'alt_GRCh37': alleles[1],
                'variant_annotation': variants['variants/ANN_HGVS_c'][i],
                'filter': filter_status,
                'gene': variants['variants/ANN_Gene_Name'][i],
            })

    # Build the frame once instead of appending row by row.
    ids_found_in_patient = pd.DataFrame(rows, columns=result_columns)

    print("[INFO] Matches on RS id: " + str(match_on_rsid))
    print("[INFO] Matches on location: " + str(match_on_location))
    return ids_found_in_patient, temp_vcf
示例#5
0
# Notebook-export cells: report, save and reload the variants held in
# `enh_mut_tlx` (defined in an earlier cell, outside this excerpt).
print('Variants from TLX3 WGS in active enhacers = ', len(enh_mut_tlx))

# In[36]:

# Save only if new analysis done
# NOTE(review): this runs unconditionally and overwrites the file under WGS;
# `saveas` presumably comes from a pybedtools BedTool - confirm.
enh_mut_tlx.saveas(join(WGS, 'Ehn_RAG_Active_TLX3_mut.vcf'))

# In[37]:

# Load table
# Re-read the VCF written above as a DataFrame: all fields, up to 4 ALT
# alleles per record, and the SnpEff ANN field split into ANN_* columns.
import allel
enh_mut_tlx_tb = allel.vcf_to_dataframe(join(WGS,
                                             'Ehn_RAG_Active_TLX3_mut.vcf'),
                                        fields='*',
                                        numbers={'ALT': 4},
                                        transformers=allel.ANNTransformer())

# In[38]:

# Keep only the records whose FILTER_PASS column is True.
enh_mut_tlx_tb_fltr = enh_mut_tlx_tb[enh_mut_tlx_tb['FILTER_PASS'] == True]

# In[39]:

# Report the row count of the reloaded table and of its PASS-only subset.
print('Variants from TLX3 WGS in active enhacers = ', len(enh_mut_tlx_tb))
print('Variants from TLX3 WGS in active enhacers PASS filter = ',
      len(enh_mut_tlx_tb_fltr))

# In[40]:

# Disabled exploratory snippets kept from the notebook:
#enh_mut_tlx_tb[['FILTER_map','FILTER_PASS','FILTER_mrd20','FILTER_LowQual','FILTER_mrd10','FILTER_mrd30']].head(40)
#tnm = plt.hist(enh_mut_tlx_tb['REF'].apply(len), bins=40)
示例#6
0
def vcf_to_tables(vcf_file, genotype_file, variants_tsv, itd_vcf_file=False):
    """Parse a SnpEff-annotated VCF into a variant table and genotype layers.

    Args:
        vcf_file: Path of the VCF to parse.
        genotype_file: Path of the HDF5 file that receives the genotype layers
            (GT, GQ, DP, AD, RD), the variant names and the sample names.
        variants_tsv: Path of the tab-separated variant annotation table.
        itd_vcf_file: Optional path of a FLT3-ITD VCF. When given and it holds
            at least one record, its variants are appended to the table and
            to every layer.

    Returns:
        None. Results are written to ``variants_tsv`` and ``genotype_file``.
    """
    # load vcf file into numpy array
    # include annotation info from snpeff
    vcf = allel.read_vcf(vcf_file,
                         transformers=allel.ANNTransformer(),
                         fields=[
                             'variants/*', 'calldata/GT', 'calldata/AD',
                             'calldata/GQ', 'calldata/DP', 'samples', 'ANN'
                         ])
    # layers to extract:
    # GT: genotype (0: WT, 1: HET, 2: HOM, 3: no call)
    # DP: total read depth
    # GQ: genotype quality
    # AD: alt allele depth
    # RD: ref allele depth

    # Summing the allele indices over the ploidy axis gives 0/1/2; a fully
    # missing call (-1, -1) sums to -2 and is recoded as 3.
    GT = np.sum(vcf['calldata/GT'], axis=2)
    GT[GT == -2] = 3
    DP = np.stack(vcf['calldata/DP'], axis=0)
    GQ = np.stack(vcf['calldata/GQ'], axis=0)
    AD = np.stack(vcf['calldata/AD'][:, :, 1], axis=0)
    RD = np.stack(vcf['calldata/AD'][:, :, 0], axis=0)

    # create variant names: gene:chrom:pos:ref/first-alt
    names = [
        gene + ':' + chrom + ':' + str(pos) + ':' + ref + '/' + alt
        for gene, chrom, pos, ref, alt in zip(
            vcf['variants/ANN_Gene_Name'], vcf['variants/CHROM'],
            vcf['variants/POS'], vcf['variants/REF'],
            vcf['variants/ALT'][:, 0])
    ]

    # assemble and save variant annotations to file
    variants_table = pd.DataFrame(data=names, columns=['Name'])

    # cosmic id
    variants_table['COSMIC_ID'] = vcf['variants/ID']

    # snpeff columns
    for ann in [c for c in list(vcf) if '/ANN' in c]:
        variants_table['SnpEff_' + ann.split('/ANN_')[1]] = vcf[ann]

    # clinvar columns
    for cln in [c for c in list(vcf) if '/CLN' in c]:
        variants_table['ClinVar_' + cln.split('/')[1]] = vcf[cln]

    # optional: add flt3-itd variants to table
    if itd_vcf_file:

        # make sure flt3 vcf is not empty (has at least one non-header line)
        empty = True
        with open(itd_vcf_file, 'r') as f:
            for line in f:
                if line[0] != '#':
                    empty = False
                    break

        if not empty:

            itd_vcf = allel.read_vcf(itd_vcf_file, fields=['*'])

            # create itd variant names (one per record, duplicates possible)
            itd_names = [
                'FLT3-ITD' + ':' + chrom + ':' + str(pos) + ':' + ref + '/' + alt
                for chrom, pos, ref, alt in zip(
                    itd_vcf['variants/CHROM'], itd_vcf['variants/POS'],
                    itd_vcf['variants/REF'], itd_vcf['variants/ALT'][:, 0])
            ]

            # De-duplicate once so the table rows and the appended names are
            # guaranteed to be in the same order.
            unique_itd_names = list(set(itd_names))

            # add itd variant rows to variants table
            itd_table = pd.DataFrame(data=unique_itd_names, columns=['Name'])
            names += unique_itd_names
            variants_table = pd.concat([variants_table, itd_table], sort=True)

            # add itd variants to other layers
            # new rows default to GT = 'no call' (3) and 0 for all other
            # layers; cells with an itd call are filled in below
            n_new = itd_table.shape[0]
            GT = np.concatenate((GT, 3 * np.ones((n_new, GT.shape[1]))),
                                axis=0)
            GQ = np.concatenate((GQ, np.zeros((n_new, GQ.shape[1]))), axis=0)
            DP = np.concatenate((DP, np.zeros((n_new, DP.shape[1]))), axis=0)
            AD = np.concatenate((AD, np.zeros((n_new, AD.shape[1]))), axis=0)
            RD = np.concatenate((RD, np.zeros((n_new, RD.shape[1]))), axis=0)

            # indices for adding entries to arrays
            var_ind = {name: idx for idx, name in enumerate(names)}
            bar_ind = {
                barcode: idx for idx, barcode in enumerate(vcf['samples'])
            }

            # for each itd record, add entry to genotyping arrays; the record
            # ID is used as cell barcode and QUAL as alt allele depth
            for itd_name, cell_barcode, alt_depth, vaf in zip(
                    itd_names, itd_vcf['variants/ID'],
                    itd_vcf['variants/QUAL'], itd_vcf['variants/VAF']):
                # Function-call form prints the same text on Python 2 and 3.
                print(vaf)
                total_depth = int(round(np.true_divide(alt_depth, vaf)))

                # set GT according to vaf: het below 0.9, hom otherwise
                geno = 1 if vaf < 0.9 else 2

                row = var_ind[itd_name]
                col = bar_ind[cell_barcode]

                # store entries in genotyping array
                GT[row, col] = geno
                GQ[row, col] = 100
                DP[row, col] = total_depth
                # NOTE(review): RD receives the total depth, not
                # total - alt depth; confirm this is intended.
                RD[row, col] = total_depth
                AD[row, col] = alt_depth

    # save variants to file
    variants_table.to_csv(path_or_buf=variants_tsv, sep='\t', index=False)

    # encode variant names and cell barcodes
    names = [n.encode('utf8') for n in names]
    barcodes = [b.encode('utf8') for b in vcf['samples']]

    # save genotyping information to compressed hdf5 file
    with h5py.File(genotype_file, 'w') as f:
        f.create_dataset('GT', data=GT, dtype='i1', compression='gzip')
        f.create_dataset('GQ', data=GQ, dtype='i1', compression='gzip')
        f.create_dataset('DP', data=DP, dtype='i2', compression='gzip')
        f.create_dataset('AD', data=AD, dtype='i2', compression='gzip')
        f.create_dataset('RD', data=RD, dtype='i2', compression='gzip')
        f.create_dataset('VARIANTS', data=names, compression='gzip')
        f.create_dataset('CELL_BARCODES', data=barcodes, compression='gzip')
示例#7
0
    def _parse_sv_vcfs(self, vcf_paths, ann_fields=None):
        '''
            Merge all SV interval data from multiple vcf's in to a single BedTool instance

            Implementation:
                Use Panda's dataframe for some easy preprocessing, then create a BedTool from a tuple containing each row

            Args:
                vcf_paths: iterable of paths to single-sample VCF files
                ann_fields: optional list of extra VCF field names to collect in the annotation table

            Returns:
                (BedTool built from every interval row, annotation DataFrame indexed by the
                renamed index columns, list of sample names in input order)

            Raises:
                AssertionError: if a VCF does not contain exactly one sample
        '''
        def split_Ensembl_ids(id_list):
            # Split fused gene ids on '-' (checked first) or '&'; other ids pass through.
            new_list = []
            for gene_id in id_list:
                if '-' in gene_id:
                    new_list.extend(gene_id.split('-'))
                elif '&' in gene_id:
                    new_list.extend(gene_id.split('&'))
                else:
                    new_list.append(gene_id)
            return new_list

        # None replaces the shared mutable default list.
        ann_fields = [] if ann_fields is None else ann_fields

        intervals = []
        sample_names = []
        ann_dfs = []

        index_fields = list(
            self.index_cols.keys()
        )  #CHR POS STOP needs to be first 3 columns for creation of BedTool instance
        sample_sv_fields = index_fields + [
            'calldata/GT', 'variants/ANN_Gene_ID', 'samples'
        ]
        parse_fields = set(sample_sv_fields + ann_fields)

        for vcf_path in vcf_paths:
            vcf_dict = allel.read_vcf(
                vcf_path, ['*'],
                numbers={'ANN': 1000},
                transformers=allel.ANNTransformer()
            )  #use read_vcf because genotype field is not picked up with vcf_to_dataframe

            assert len(vcf_dict['samples']
                       ) == 1, "%s contains 0 or more than 1 sample: %s" % (
                           vcf_path, str(vcf_dict['samples']))
            name = vcf_dict.pop('samples')[0]
            sample_names.append(name)

            # Remove a leading 'chr' from CHROM. str.strip('chr') would strip
            # the characters c/h/r from both ends, so slice the prefix instead.
            vcf_dict['variants/CHROM'] = [
                chrom[3:] if chrom.startswith('chr') else chrom
                for chrom in vcf_dict['variants/CHROM']
            ]

            # drop un-needed fields from vcf, cannot pass in parse_fields to read_vcf() because ANN_gene_id is unknown until ANNTransformer runs
            for key in list(vcf_dict.keys()):
                if key not in parse_fields:
                    vcf_dict.pop(key)

            # remove empty strings, split on delimited characters, then join using comma
            # (specifying numbers=1000 creates 1000 elements, with most being empty).
            # Ids are de-duplicated and sorted so identical gene sets always
            # produce the same string, which drop_duplicates() relies on.
            vcf_dict['variants/ANN_Gene_ID'] = [
                ','.join(sorted(set(split_Ensembl_ids(list(filter(None, ann))))))
                for ann in vcf_dict['variants/ANN_Gene_ID']
            ]

            # 'HET' when the call holds both allele 0 and allele 1, else 'HOM'
            vcf_dict['calldata/GT'] = np.array([
                'HET' if 0 in gt and 1 in gt else 'HOM'
                for gt in vcf_dict.pop('calldata/GT')
            ])

            df = pd.DataFrame(vcf_dict)
            df['samples'] = name

            # workaround for START > END so BedTool doesn't freak out using MalformedBedError
            # START > END is the case for TRV, INV
            s = df['variants/END'] < df['variants/POS']
            df.loc[s, ['variants/END', 'variants/POS']] = df.loc[
                s, ['variants/POS', 'variants/END']].values
            df['variants/POS'] = df['variants/POS'].astype(int)
            df['variants/END'] = df['variants/END'].astype(int)
            df = df.drop_duplicates()

            intervals.extend(df[sample_sv_fields].itertuples(index=False))

            if ann_fields:
                ann_dfs.append(df[index_fields + ann_fields])

        if ann_fields:
            ann_df = pd.concat(ann_dfs).astype(str).rename(
                columns=self.index_cols).set_index(
                    list(self.index_cols.values()))
        else:
            ann_df = pd.DataFrame()
        #annotations for the same SV in a vcf can have slighly differing fields (ex. SVSCORE_MEAN)
        ann_df = ann_df[~ann_df.index.duplicated(keep='first')]

        # Debug output retained from the original implementation.
        for i in intervals:
            print(i)

        return BedTool(intervals), ann_df, sample_names