示例#1
0
文件: util.py 项目: xuzhougeng/xpclr
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Load a text-format SNP map plus two genotype matrices.

    Args:
        mapfn: Path to the SNP map. It is read without a header as six
            columns (ID, CHROM, GDist, POS, REF, ALT), first assuming tab
            separation and falling back to single-space separation.
        pop_a_fn: Path to the whitespace-delimited genotype matrix of the
            first population (one row per SNP, two columns per sample).
        pop_b_fn: Same as ``pop_a_fn`` for the second population.

    Returns:
        A 4-tuple ``(geno1, geno2, pos, gdist)``: two
        ``allel.GenotypeChunkedArray`` objects, an ``allel.SortedIndex`` of
        the POS column, and the GDist column.

    Notes:
        If the map is not sorted, it is sorted by (CHROM, POS) and the same
        row permutation is applied to both genotype matrices so that
        genotype rows stay aligned with the returned positions.
    """
    columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]

    tbl = pd.read_csv(mapfn, sep="\t", header=None, engine="c")

    try:
        tbl.columns = columns
    except ValueError:
        # Column count mismatch: the file was not split on tabs as expected.
        logger.info("File not tab delimited as expected- trying with spaces")
        tbl = pd.read_csv(mapfn,
                          sep=" ",
                          header=None,
                          engine="c",
                          names=columns)

    # Positional permutation applied to the map rows; None means "unchanged".
    row_order = None
    try:
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")
    except ValueError:
        # Record the permutation positionally so the genotype rows, which are
        # in the original file order, can be reordered identically below.
        row_order = tbl.reset_index(drop=True).sort_values(
            ["CHROM", "POS"]).index.values
        tbl = tbl.iloc[row_order]
        logger.warning(
            "Possible SNPs file is not sorted. Attempting to sort. This is likely to be inefficient"
        )
        vartbl = allel.VariantChunkedTable(tbl.to_records(), index="POS")

    # ndmin=2 keeps a single-SNP file two-dimensional so the reshape is valid.
    d1 = np.loadtxt(pop_a_fn, dtype="int8", ndmin=2)
    if row_order is not None:
        d1 = d1[row_order]
    geno1 = allel.GenotypeChunkedArray(d1.reshape((d1.shape[0], -1, 2)))

    d2 = np.loadtxt(pop_b_fn, dtype="int8", ndmin=2)
    if row_order is not None:
        d2 = d2[row_order]
    geno2 = allel.GenotypeChunkedArray(d2.reshape((d2.shape[0], -1, 2)))

    pos = allel.SortedIndex(vartbl.POS[:])
    assert np.isnan(pos).sum() == 0, "nans values are not supported"

    # Return the index that was just validated instead of building it again.
    return geno1, geno2, pos, vartbl.GDist[:]
示例#2
0
文件: gene_snp.py 项目: cyinbox/NGS
def getVariants(vcfFileName):
    """Open the HDF5 conversion of a VCF and summarise its SNPs.

    The VCF is first converted with ``convertVCFToH5``; the HDF5 file name is
    the text before the first '.' in ``vcfFileName`` plus '.h5'.

    Returns:
        ``(callset, variants, variantSNPs)``: the open ``h5py.File``, the
        ``allel.VariantChunkedTable`` over its 'variants' group, and a dict
        mapping each POS value to a ``'REF->ALT'`` string built from the
        first ALT column.
    """
    # VariantChunkedTable is fed from the HDF5 file, so convert the VCF first.
    convertVCFToH5(vcfFileName)
    h5FileName = vcfFileName.split('.')[0] + '.h5'
    callset = h5py.File(h5FileName, mode='r')

    variants = allel.VariantChunkedTable(callset['variants'], index='POS')
    poss = variants['POS']
    refs = variants['REF']
    alts = variants['ALT'][:, 0]

    # Re-key the variants as position -> "A->T" style substitutions.
    variantSNPs = {}
    for row, (ref, alt) in enumerate(zip(refs, alts)):
        variantSNPs[poss[row]] = ref + '->' + alt

    return callset, variants, variantSNPs
示例#3
0
def load_text_format_data(mapfn, pop_a_fn, pop_b_fn):
    """Read a space-delimited SNP map and two genotype text matrices.

    Returns:
        ``(geno1, geno2, positions, gdist)``: a genotype chunked array per
        population, a ``SortedIndex`` over the map's POS column and the map's
        GDist column.
    """
    map_columns = ["ID", "CHROM", "GDist", "POS", "REF", "ALT"]
    snp_map = pd.read_csv(mapfn, sep=" ", names=map_columns)
    vartbl = allel.VariantChunkedTable(snp_map.to_records(), index="POS")

    def read_genotypes(path):
        # Each row holds allele calls laid out as two columns per sample.
        calls = np.loadtxt(path, dtype="int8")
        return allel.GenotypeChunkedArray(
            calls.reshape((calls.shape[0], -1, 2)))

    geno1 = read_genotypes(pop_a_fn)
    geno2 = read_genotypes(pop_b_fn)

    positions = allel.SortedIndex(vartbl.POS[:])
    return geno1, geno2, positions, vartbl.GDist[:]
示例#4
0
    def test_ld(self):
        '''Unit test for ldshrink.

        Computes ``lddask.ld.ldshrink`` on the first four columns of the
        transposed alt-allele-count matrix (with the first four MAP values)
        and compares it with the top-left 4x4 corner of the reference matrix
        stored in ``test_data/reference_ld.txt``.
        '''

        input_hdf = "/home/nwknoblauch/Dropbox/Repos/LD_dask/test_data/reference_genotype.h5"
        test_R_file = "test_data/reference_ld.txt"
        m = 85
        Ne = 11490.672741
        cutoff = 0.001

        # Materialise everything needed from the HDF5 file inside the context
        # manager so the handle is closed even if the test fails later.
        with h5.File(input_hdf, mode='r') as callset:
            ref_geno = allel.GenotypeDaskArray(callset['calldata/GT'])
            vt = allel.VariantChunkedTable(callset['variants'])
            geno_ac = ref_geno.to_n_alt().T.compute()
            sub_map = vt['MAP'][:4]

        sub_X = geno_ac[:, :4]
        est_r = lddask.ld.ldshrink(sub_X, sub_map, m, Ne, cutoff)

        true_r = np.loadtxt(test_R_file, delimiter="\t")
        sub_true_r = true_r[:4, :4]
        assert np.allclose(sub_true_r, est_r)
def process_SNP_data(selected_chromosome):
    """Write a windowed SNP-density track into row 1 of the score array.

    Reads row 0 of ``root['<chrom>/score']`` only for its length, marks every
    SNP position of the chromosome in a 0/1 vector of that length, smooths it
    with a centred rolling window of ``WINDOW_SIZE`` (fraction of marked
    positions per window, edge windows filled with 0) and stores the result
    in row 1 of the same array.

    Args:
        selected_chromosome: Chromosome key used both in the ``root`` store
            and in the Ag1000G HDF5 file name.
    """
    z = root['{}/score'.format(selected_chromosome)][0, :]

    # Load Ag1000g variation data; the context manager closes the HDF5 file
    # as soon as the positions have been copied into memory.
    with h5py.File(
            'ag1000g/variation/ag1000g.phase2.ar1.pass.{}.h5'.format(
                selected_chromosome),
            mode='r') as data_ag1000g:
        variants = allel.VariantChunkedTable(
            data_ag1000g[selected_chromosome]['variants'],
            names=['POS'],
            index='POS')

        # SNP data is 1-based
        snp_positions = variants['POS'][:] - 1

    pos_array = np.zeros(len(z))
    pos_array[snp_positions] = 1
    pos_roll = pd.DataFrame(pos_array).rolling(WINDOW_SIZE, center=True).apply(
        lambda x: np.sum(x) / WINDOW_SIZE, raw=True).fillna(0)

    root['{}/score'.format(selected_chromosome)][1, :] = np.array(pos_roll)
示例#6
0
文件: methods.py 项目: rrlove/ingenos
def import_data(filepath,
                chrom_name,
                names=('POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles')):
    '''Open an HDF5 callset and return its variants and genotypes.

    Args:
        filepath: Path to the HDF5 file.
        chrom_name: Top-level group holding 'variants' and
            'calldata/genotype' for the chromosome of interest.
        names: Variant columns to expose; ``None`` is passed through to
            ``allel.VariantChunkedTable`` unchanged.

    Returns:
        ``(variants, genotypes)``: an ``allel.VariantChunkedTable`` indexed
        on POS and an ``allel.GenotypeChunkedArray``. Both are backed by the
        HDF5 file, which therefore stays open on success.

    Raises:
        ValueError: If the two objects differ in their number of rows.
    '''

    ##to-do: check that h5py file is well-formed

    callset = h5py.File(filepath, mode='r')

    # Hand a fresh list to allel so the shared default is never exposed.
    if names is not None:
        names = list(names)

    variants = allel.VariantChunkedTable(callset[chrom_name]['variants'],
                                         names=names,
                                         index='POS')

    genotypes = allel.GenotypeChunkedArray(
        callset[chrom_name]['calldata']['genotype'])

    if genotypes.shape[0] != variants.shape[0]:
        # Nothing is returned on this path, so release the file handle.
        callset.close()
        raise ValueError("Genotypes and variant table must contain the "
                         "same number of positions")

    return variants, genotypes
示例#7
0
    ax.set_title('Depth of Coverage per individual')
    ax.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=True)
    ax.set_xticklabels(ids.id, rotation= 40, ha= 'right', fontsize= 8)
    ax.errorbar(np.arange(len(dpMu)), y= dpMu, yerr=dpSd, fmt= 'none', ecolor= 'grey')
    plt.tight_layout()

# Draw the per-individual depth plot using the values computed earlier.
plotDP(dpMu, dpSd, ids)


## Create a VariantChunkedTable over the 'variants' entry of `subs`
## (no index column is set; the `index='CHROM'` option is left disabled).
variants = al.VariantChunkedTable(subs['variants']) #, index= 'CHROM')


## Count the number of variants per scaffold: np.unique returns the distinct
## CHROM values together with how many times each one occurs.
scafs, scaf_counts = np.unique(variants['CHROM'], return_counts= True)
scafdf = pd.DataFrame({'scaffold': scafs, 'nVariables': scaf_counts})


## Plot of the number of variants per scaffold.
## NOTE(review): this was labelled a barplot, but ax.plot is what is called
## here — switch to ax.bar if bars are wanted.
fig, ax = plt.subplots(figsize= (14,4))
# x is simply the scaffold's rank in `scafdf`, not its name.
ax.plot(np.arange(len(scafdf['nVariables'])), scafdf['nVariables'])
ax.set_xlabel('scaffolds')
ax.set_ylabel('count')
ax.set_title('Number of variants per scaffold')

示例#8
0
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling. NOTE(review): two styles are set back to back; the second
# call presumably supersedes the first — confirm which one is intended.
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py
import allel
# Record the scikit-allel version in the output for reproducibility.
print('scikit-allel', allel.__version__)

# Open the HDF5 callset read-only; the handle is kept open because the
# chunked table below reads from it.
callset_fn = '9Moose_joint_Filter_B_NC_037355.1.h5'
callset = h5py.File(callset_fn, mode='r')
print(callset)

# Table over the top-level 'variants' group, restricted to the listed
# columns and indexed on POS.
variants = allel.VariantChunkedTable(
    callset['variants'],
    names=['POS', 'REF', 'ALT', 'FILTER', 'DP', 'MQ', 'QD'],
    index='POS')
print(variants)

# `[:]` reads the whole POS column into memory.
pos = variants['POS'][:]
print(pos)


def plot_windowed_variant_density(pos, window_size, title=None):

    # setup windows
    bins = np.arange(0, pos.max(), window_size)

    # use window midpoints as x coordinate
    x = (bins[1:] + bins[:-1]) / 2
示例#9
0
# Plot styling. NOTE(review): two styles are set back to back; the second
# call presumably supersedes the first — confirm which one is intended.
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py
import allel

# Record the scikit-allel version in the output for reproducibility.
print('scikit-allel', allel.__version__)

# Open the HDF5 callset read-only; the handle is kept open because the
# chunked table below reads from it.
callset_fn = '/u/home/c/ckyriazi/kirk-bigdata/Ag1000G_data_hdf5/ag1000g.phase1.ar3.h5'
callset = h5py.File(callset_fn, mode='r')
# Bare expression: only displays something in an interactive/notebook session.
callset

# Key of the chromosome group inside the callset.
chrom = '3L'

# Table over that chromosome's 'variants' group, restricted to the listed
# columns and indexed on POS.
variants = allel.VariantChunkedTable(
    callset[chrom]['variants'],
    names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'num_alleles'],
    index='POS')
print(variants)

# `[:]` reads the whole POS column into memory.
pos = variants['POS'][:]
print(pos)


def plot_windowed_variant_density(pos, window_size, title=None):

    # setup windows
    bins = np.arange(0, pos.max(), window_size)

    # use window midpoints as x coordinate
    x = (bins[1:] + bins[:-1]) / 2
示例#10
0
def get_haplos(pops,
               chrom,
               p1,
               p2,
               samples,
               inaccessible=False,
               geno=False,
               biallelic=False,
               zarrpath=None):
    """Return a haplotype or genotype array for a region and set of populations.

    Args:
        pops: Population labels to keep (matched with ``Series.isin`` against
            ``samples.population``).
        chrom: Chromosome name; only used to build the default store path.
        p1: First position of the region (inclusive).
        p2: Last position of the region (inclusive).
        samples: Sample metadata with a ``population`` column, in the same
            order as the sample axis of the genotype array.
        inaccessible: Only consulted when ``zarrpath`` is not given; selects
            the second hard-coded store instead of the first.
        geno: If ``True`` return genotypes instead of haplotypes.
        biallelic: If ``True`` keep only sites that are biallelic (alleles 0
            and 1) in the selected populations.
        zarrpath: Directory holding ``calldata/GT`` and ``variants``. When
            ``None`` (the default) or ``False`` the hard-coded local stores
            are used.

    Returns:
        ``(array, pop_bool, sweep_region, positions)`` where ``array`` is the
        haplotype array (or the genotype array when ``geno`` is ``True``),
        ``pop_bool`` the sample mask, ``sweep_region`` the variant mask over
        the whole chromosome and ``positions`` the positions in the region.
    """

    print(
        '---------------------- retrieving haplotypes -----------------------')

    # Resolve the store location. The default None previously fell through to
    # the explicit-path branch and tried to open 'None/calldata/GT/'.
    if zarrpath is None or zarrpath is False:
        if inaccessible is False:
            base = f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}"
        else:
            base = f"/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1/{chrom}"
    else:
        # An explicit store is used as-is, whatever `inaccessible` says.
        base = zarrpath

    ############ Read zarrs #############
    Ag_array = zarr.open_array(f"{base}/calldata/GT/", mode='r')
    Ag_store = zarr.open_group(f"{base}/variants/", mode='r')

    # `[:]` loads the selected variant columns into memory.
    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ',
                         'QD'], index='POS')[:]

    positions = allel.SortedIndex(variants['POS'])
    positions = positions.intersect_range(p1, p2)
    # focus on haplotype region
    sweep_region = (variants['POS'] >= p1) & (variants['POS'] <= p2)

    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    print(
        f'------------------------------- {pops} ------------------------------------'
    )
    # Restrict genotypeArray to population and make HapArray
    pop_bool = samples.population.isin(pops)
    pop_geno = ag_geno.compress(pop_bool, axis=1)
    pop_haplo = pop_geno.to_haplotypes()
    print("HaplotypeArray constructed")

    if biallelic is True:
        ac = pop_geno.count_alleles()
        bi_al = ac.is_biallelic_01()
        pop_haplo = pop_haplo.compress(bi_al, axis=0)
        positions = positions[bi_al]
        # NOTE(review): pop_geno is not filtered here, so with geno=True and
        # biallelic=True the returned genotypes and positions differ in
        # length — confirm whether that is intended.

    if geno is True:
        return (pop_geno, pop_bool, sweep_region, positions)
    return (pop_haplo, pop_bool, sweep_region, positions)
示例#11
0
def _alleles_to_sequence(hap_calls, ref_bases, alt_bases):
    """Spell one haplotype as a string: ALT[0] for allele 1, ALT[1] for allele 2, REF otherwise."""
    bases = np.where(hap_calls == 1, alt_bases[:, 0], ref_bases)
    second_alt = hap_calls == 2
    # Only touch the second ALT column when it is actually needed.
    if second_alt.any():
        bases = np.where(second_alt, alt_bases[:, 1], bases)
    return ''.join(bases)


def multiple_alignment(pops, chrom, p1, p2, samples, hap_only=False):
    """Write a multiple sequence alignment FASTA for a region and populations.

    Every haplotype of every requested population is spelled out over the
    variant sites between ``p1`` and ``p2`` (both inclusive). Sites are not
    restricted to biallelic ones: allele 1 uses the first ALT, allele 2 the
    second ALT, and any other call (including missing) uses REF.

    Two files are written under ``haplotypes/<chrom>/``: a ``.fasta`` file
    (header and sequence on alternating lines) and a tab-separated
    ``.metadata`` file joining each haplotype to its sample metadata.

    Args:
        pops: Iterable of population labels, processed in order.
        chrom: Chromosome name, used for the store path and output names.
        p1: First position of the region (inclusive).
        p2: Last position of the region (inclusive).
        samples: Sample metadata with columns ``ox_code``, ``population``,
            ``country``, ``region`` and ``m_s``, in the same order as the
            sample axis of the genotype array. It is not modified.
        hap_only: Currently unused; kept for interface compatibility.

    Returns:
        ``(multi_fastas, all_haps)``: the haplotype/sequence table (``hap``
        without the leading '>') and the same table joined to the per
        haplotype sample metadata.
    """

    print(
        '---------------------- multiple sequence alignment -----------------------'
    )

    # Open Zarrs, genotype and variant data
    Ag_array = zarr.open_array(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/",
        mode='r')
    Ag_store = zarr.open_group(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/",
        mode='r')

    variants = allel.VariantChunkedTable(
        Ag_store, names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD'], index='POS')
    # focus on haplotype region
    all_pos = variants['POS'][:]
    sweep_region = (all_pos >= p1) & (all_pos <= p2)
    variants_in_region = variants.compress(sweep_region, axis=0)
    ag_geno = allel.GenotypeChunkedArray(Ag_array)
    print('Zarr arrays opened')
    ag_geno = ag_geno.compress(sweep_region, axis=0)

    # Read REF/ALT for the region once, as text, instead of hitting the
    # chunked store for every allele of every haplotype.
    ref_bases = np.asarray(variants_in_region['REF'][:]).astype(str)
    alt_bases = np.asarray(variants_in_region['ALT'][:]).astype(str)
    # Number of haplotypes each sample contributes.
    ploidy = ag_geno.shape[2]

    # clean metadata (on a copy, so the caller's frame is left untouched)
    species_map = {'M': 'coluzzii', 'S': 'gambiae'}
    samples = samples.assign(species=samples['m_s'].map(species_map))[[
        'ox_code', 'population', 'country', 'species', 'region'
    ]]

    fasta_frames = []
    sample_frames = []
    n_haplotypes = 0
    for pop in pops:
        print(
            f'------------------------------- {pop} ------------------------------------'
        )
        # Restrict genotypeArray to population and make HapArray
        pop_bool = samples.population == pop
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()
        print("HaplotypeArray constructed")
        hap_matrix = np.asarray(pop_haplo[:])

        print("Extracting variants and writing to Pandas Dataframe")
        sequences = [
            _alleles_to_sequence(hap_matrix[:, n], ref_bases, alt_bases)
            for n in range(hap_matrix.shape[1])
        ]

        # Haplotype columns come out sample by sample, so each sample's
        # label is repeated once per haplotype to stay aligned with them.
        pop_samples = samples[pop_bool]
        labels = '>' + pop_samples['population'].astype(
            str) + '_' + pop_samples['ox_code'].astype(str)
        fasta_frames.append(
            pd.DataFrame({
                "hap": np.repeat(labels.values, ploidy),
                "seq": sequences
            },
                         columns=["hap", "seq"]))
        sample_frames.append(pop_samples)

        n_haplotypes += len(sequences)
        print(n_haplotypes, "Haplotypes complete")

    # Join the dfs of different pops (DataFrame.append no longer exists in
    # pandas 2, and pd.concat rejects an empty list).
    if fasta_frames:
        multi_fastas = pd.concat(fasta_frames, ignore_index=True)
        all_samples = pd.concat(sample_frames)
    else:
        multi_fastas = pd.DataFrame(columns=["hap", "seq"])
        all_samples = samples.iloc[0:0]

    #write to csv with \n sep to make FASTA file
    multi_fastas.to_csv(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.fasta",
                        sep="\n",
                        index=False,
                        header=False)
    print('Multiple alignment FASTA written')

    #remove > and join with metadata for each pop, useful for plotting phylo trees
    multi_fastas['hap'] = multi_fastas['hap'].str.strip('>')
    all_haps = pd.DataFrame(np.repeat(all_samples.values, ploidy, axis=0),
                            columns=all_samples.columns)
    all_haps = pd.concat([multi_fastas.reset_index(drop=True), all_haps],
                         axis=1)

    all_haps.to_csv(f"haplotypes/{chrom}/{chrom}_{p1}_{p2}.metadata",
                    sep="\t",
                    index=False,
                    header=True)

    return (multi_fastas, all_haps)
# Report the size and population make-up of the phase2 sample table.
print("Data:")
print("* Samples     = ", p2_samples.shape[0])
print("* Populations = ", set(p2_samples[p2_popc]))
# NOTE(review): the population column is hard-coded here while the line above
# uses `p2_popc` — confirm both refer to the same column.
print(p2_samples.groupby(("population")).size())

# Variants and genotypes:

# In[5]:

# Open the phase2 callset (zarr) that holds the variant data.
p2_callset = zarr.open(p2_callset_fn)
# Variant table for the chromosome: only POS/REF/ALT, indexed on POS.
print("Variants...")
p2_callset_var = p2_callset[chrom]["variants"]
p2_genvars = allel.VariantChunkedTable(p2_callset_var,
                                       names=["POS", "REF", "ALT"],
                                       index="POS")
print(p2_genvars.shape)
# Genotype data for the same chromosome, restricted along the second axis
# (sel1) with the boolean mask `p2_samples_bool`.
print("Genotypes...")
p2_callset_gen = p2_callset[chrom]["calldata"]["genotype"]
p2_genotyp = allel.GenotypeChunkedArray(p2_callset_gen)
p2_genotyp = p2_genotyp.subset(sel1=p2_samples_bool)
print(p2_genotyp.shape)

# #### Outgroups
#
# Loads one outgroup, removes indels (duplicated variant positions) and subsets phase2 to include only variants present in this outgroup. Then loads the outgroup genotypes and subsets them to remove indels and match phase2. Then loads the second outgroup and repeats the same steps. Thus, at each iteration, fewer and fewer variants remain (hopefully not too many are lost; the worst offenders are `chri` and `epir`).

# In[6]:
    oc_popdict["all"] = oc_popdict["all"] + oc_popdict[popi]


# Report the size and population make-up of the sample table.
print("Data:")
print("* Samples     = ", oc_samples.shape[0])
print("* Populations = ", set(oc_samples[oc_popc]))
# NOTE(review): the population column is hard-coded here while the line above
# uses `oc_popc` — confirm both refer to the same column.
print(oc_samples.groupby(("population")).size())

# Phased variants and genotypes:
# Open the phased callset (HDF5).
# NOTE(review): no mode is passed, so the access mode is whatever the
# installed h5py defaults to — consider mode="r" since the file is only read
# in the lines shown here.
oc_hapcall   = h5py.File(oc_hapcall_fn)
# Variant table for the chromosome: only POS/REF/ALT, indexed on POS.
print("Variants phased...")
oc_hapcall_var = oc_hapcall[chrom]["variants"]
oc_hapvars = allel.VariantChunkedTable(oc_hapcall_var,names=["POS","REF","ALT"],index="POS")
print(oc_hapvars.shape)
# Genotype data for the same chromosome, restricted along the second axis
# (sel1) with the boolean mask `oc_samples_bool`.
print("Genotypes phased...")
oc_hapcall_hap = oc_hapcall[chrom]["calldata"]["genotype"]
oc_haploty     = allel.GenotypeChunkedArray(oc_hapcall_hap)
oc_haploty     = oc_haploty.subset(sel1=oc_samples_bool)
print(oc_haploty.shape)


# Effects: open the separate zarr store used for the effect annotations below.
oc_effcall     = zarr.open(oc_effcall_fn)
oc_effvars     = allel.VariantChunkedTable(oc_effcall["variants"],names=[
    "POS","REF","ALT","ANN_HGVS_p","ANN_HGVS_c",
    "ANN_Annotation","ANN_AA_pos","ANN_CDS_pos",
    "ANN_Feature_ID","ANN_Gene_ID","ANN_Gene_Name"
示例#14
0
### Save to hdf5
# One-off VCF -> HDF5 conversion, kept commented out for reference.
#import sys
#allel.vcf_to_hdf5(ftlx,'FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1_Shrt.h5',
#                  fields='*', alt_number=4,transformers=allel.ANNTransformer(),log=sys.stdout, vlen=False)

# In[18]:

### HDF5 from VCF database
ftlxh5 = 'tracks/WGS/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1_Shrt.h5'

# Read the HDF5 file read-only; the handle stays open because the chunked
# table below reads from it.
csh = h5py.File(ftlxh5, mode='r')
# Table over the 'variants' group, restricted to the listed core and ANN_*
# columns. No index column is set.
var_tb = allel.VariantChunkedTable(
    csh['variants'],
    names=[
        'CHROM', 'POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'ANN_AA_length',
        'ANN_Allele', 'ANN_Annotation', 'ANN_Annotation_Impact',
        'ANN_Feature_ID', 'ANN_Feature_Type', 'ANN_Gene_ID', 'ANN_Gene_Name',
        'ANN_Rank', 'ANN_Transcript_BioType', 'numalt'
    ])

# In[19]:

#a,b,c=plt.hist(var_tb['DP'][:], bins=10)
#csh['variants/REF']

# ## Now we can work with filters

# In[20]:

# Filter expression over the table's columns. The quality/depth variant is
# kept commented out; the active one compares ANN_Feature_Type with the bytes
# literal b'transcript'.
#fltr_expr = '(QD > 5) & (MQ > 40) & (DP > 1500) & (DP < 3000)'
fltr_expr = "ANN_Feature_Type==b'transcript'"
示例#15
0
                  action="store_false",
                  dest="verbose",
                  default=True,
                  help="don't print status messages to stdout")

# Parse the command line and pull out the options used below.
(options, args) = parser.parse_args()

zarr_path = options.zarr_filename
chromID = options.chromID
samples_fn = options.sample_metadata
output_fp = options.output_fp

# Load variants: open the zarr store read-only and build a table over the
# chromosome's 'variants' group, restricted to the listed columns and
# indexed on POS.
callset = zarr.open_group(zarr_path, mode='r')
variants = allel.VariantChunkedTable(
    callset[chromID]['variants'],
    names=['POS', 'REF', 'ALT', 'AN', 'AC', 'numalt'],
    index='POS')

# Select the variants whose AN column is at least 800; `[:]` materialises
# the evaluated selection in memory.
filter_expression = '(AN >= 800)'
variant_selection = variants.eval(filter_expression)[:]

# Positions of the variants that pass the filter.
pos = variants['POS'][:]
variants_pass_pos = pos.compress(variant_selection)

# Genotypes restricted to the same variants (first axis only).
genotypes = allel.GenotypeChunkedArray(callset[chromID + '/calldata/GT'])
genotypes_subset = genotypes.subset(variant_selection, )

# Sample metadata; a GroupID column is read from it below.
samples = pd.read_csv(samples_fn)

# Paired groups filtering. Despite the name this is not a list: it is a
# boolean Series indexed by GroupID, True where the group occurs exactly
# twice.
paired_groupIDs_list = samples.GroupID.value_counts() == 2  # D-R pairs