Exemplos de Bed.iid_to_index em Python, exemplos de pysnptools.snpreader.Bed.iid_to_index em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: cleaning.py Projeto: ioneliabuzatu/PhenotipicPredictionThesis

def cleaner():
    """Drop patients that have no matching individual in the SNP Bed file.

    The BMI table is read twice from the same CSV: once with its first
    column as the index (this is the frame that gets filtered and returned)
    and once without an index, to obtain the raw values of that first column.
    Each of those values is looked up in the Bed reader opened from
    ``args.snps`` as an ``[id, id]`` pair; rows whose lookup fails are
    removed from the returned frame.

    :return: the BMI table without the rows whose ID could not be looked up
    :rtype: pandas.DataFrame
    """
    bmi_path = '/Users/ioneliabuzatu/PycharmProjects/biobank/obesity/data/bmi_clean.csv'

    snps = Bed(args.snps, count_A1=False)
    patients = pd.read_csv(bmi_path, sep=' ', index_col=0)

    patients_id = pd.read_csv(bmi_path, sep=' ')
    pats = patients_id.iloc[:, 0]

    for p in pats:
        # The reader is queried with byte strings, hence the ASCII encoding.
        search = str(p).encode('ascii')
        try:
            snps.iid_to_index([[search, search]])
        except Exception:
            # Deliberately not a bare ``except:``, so Ctrl-C / SystemExit
            # still abort the loop.
            # NOTE(review): an unknown iid presumably surfaces as KeyError in
            # pysnptools - confirm and narrow this handler further.
            patients = patients.drop([p])
            print(patients.shape)

    return patients

Exemplo n.º 2

0

Exibir arquivo

#[[   5 4000 4000]
# [   5 4001 4001]
# [   5 4002 4002]
# ...,
# [   5 4997 4997]
# [   5 4998 4998]
# [   5 4999 4999]]

# In one line.
# NOTE(review): `snpreader` is used here before the assignment below; it is
# presumably created in an earlier part of the tutorial that is not shown.
chr5data = Bed("all.bed")[:, snpreader.pos[:, 0] == 5].read()

# You can turn iid or sid names into indexes
snpreader = Bed("all.bed")
# Each iid is given as a pair of strings.
iid0 = [['cid499P1', 'cid499P1'], ['cid489P1', 'cid489P1'],
        ['cid479P1', 'cid479P1']]
indexes0 = snpreader.iid_to_index(iid0)
print indexes0
#array([499, 489, 479])
# Index the reader with the looked-up row indexes (all columns kept).
snpreader0 = snpreader[indexes0, :]
print snpreader0.iid
#[['cid499P1' 'cid499P1']
# ['cid489P1' 'cid489P1']
# ['cid479P1' 'cid479P1']]

# More condensed: look up and index in a single expression
snpreader0 = snpreader[snpreader.iid_to_index(iid0), :]

# Both at once: select the three individuals and the rows of `pos` whose
# first column equals 5, then read
snpdata0chr5 = snpreader[snpreader.iid_to_index(iid0),
                         snpreader.pos[:, 0] == 5].read()
print np.mean(snpdata0chr5.val)

Exemplo n.º 3

0

Exibir arquivo

class Mapping():
    """Genome-wide IBD and ROH mapping over a PLINK bed data set.

    The constructor reads all genotypes through ``Bed`` and splits the
    individuals into cases (rows listed in ``case_file``) and controls
    (every other individual of the reader).  ``ibdmapping_gw`` and
    ``rohmapping_gw`` each write a text report and store the region summary
    (``self.ibd_regions`` / ``self.roh_case_regions``) that ``draw_diagram``
    plots, so both must have run before ``draw_diagram`` is called.
    """

    def __init__(self, prefix, case_file):
        """Load genotypes and derive the case/control bookkeeping.

        :param str prefix: path prefix; ``{prefix}.bed`` is opened
        :param case_file: file read with ``np.loadtxt``; its first two
            columns are matched against the rows of the reader's ``iid``
        """
        self.prefix = prefix
        self.case_file = case_file
        self.snpreader = Bed(f"{prefix}.bed", count_A1=False)
        if self.snpreader.pos.dtype != 'int64':
            # `replace` is defined elsewhere in the file.
            # NOTE(review): presumably maps chromosome labels to numbers - confirm.
            self.snpreader.pos[:,0] = np.vectorize(replace)(self.snpreader.pos[:,0])
        # Column 1 is overwritten with a single genome-wide coordinate:
        # chromosome * 1e11 + base-pair position (column 2).
        self.snpreader.pos[:,1] = self.snpreader.pos[:,0] * 100000000000 + self.snpreader.pos[:,2]
        self.snpdata = self.snpreader.read()
        print('SNP data loaded.')
        self.chr_list = list(set(self.snpreader.pos[:,0]))
        self.Chr = self.snpreader.pos[:,0]
        self.Position = self.snpreader.pos[:,1]
        self.bp = self.snpreader.pos[:,2]
        self.SNPID = self.snpreader.sid
        self.case = np.loadtxt(case_file, dtype=self.snpreader.iid.dtype)[:,:2]
        self.case_list = list(self.case)
        self.all_list = [tuple(x) for x in self.snpreader.iid]
        self.caseset = {tuple(x) for x in self.case}
        # Everyone in the reader who is not a case is treated as a control.
        self.control_list = [list(x) for x in self.all_list if x not in self.caseset]
        self.numSNP = self.snpreader.sid_count
        self.numSample = len(self.all_list)
        self.numCase = len(self.case_list)
        self.numControl = len(self.control_list)
        self.case_geno = self.snpdata.val[self.snpreader.iid_to_index(self.case)]
        # The second id column is stored as bytes and decoded for printing.
        self.case_list_print = '\n'.join(i[1].decode('utf-8') for i in self.case_list)
        print('Case individuals are: \n')
        print(self.case_list_print)
        print('\n')

    def _stretch_row(self, number, start, end):
        """Return one tab-separated report line for the stretch start..end.

        ``start`` and ``end`` are SNP indexes; the line holds the running
        number, chromosome, first/last SNP id, first/last bp position and
        the bp length, terminated by a newline.
        """
        fields = [str(number), str(self.Chr[start]), self.SNPID[start].decode('utf-8'),
                  self.SNPID[end].decode('utf-8'), str(self.bp[start]), str(self.bp[end]),
                  f"{(self.bp[end] - self.bp[start])}\n"]
        return '\t'.join(fields)

    def _collect_ibd_regions(self, StretchLong, StretchLongControl):
        """Sweep case and per-control stretches into ``self.ibd_regions``.

        Every stretch contributes an opening and a closing edge.  Slot 0 of
        the state vector counts open case-only stretches, slot ``i+1`` counts
        open stretches of "all cases + control i".  After the last edge at
        each SNP index, ``[snp_index, state_copy]`` is appended.

        An empty result is expected in the form ``[[]]`` (that is how the
        callers test for it), which is why ``x[0]`` is probed with ``len``.
        """
        case_regions = np.asarray(StretchLong)
        if self.numControl:
            ctrl_regions = [np.asarray(i) for i in StretchLongControl]
            # Fix: the original zeroed this count whenever the FIRST control
            # had no stretch, which silently dropped every other control.
            # Emptiness is handled per control in the loop below.
            num_ctrl_regions = len(ctrl_regions)
        else:
            num_ctrl_regions = 0
        num_case_regions = len(case_regions)
        if not len(case_regions[0]):
            num_case_regions = 0
        edges = []
        for i in range(num_case_regions):
            edges.append([case_regions[i][0], 0, 0])
            edges.append([case_regions[i][1], 0, 1])
        for i in range(num_ctrl_regions):
            num_ctrl_region = len(ctrl_regions[i])
            if not len(ctrl_regions[i][0]):
                num_ctrl_region = 0
            for j in range(num_ctrl_region):
                edges.append([ctrl_regions[i][j][0], i+1, 0])
                edges.append([ctrl_regions[i][j][1], i+1, 1])
        state = [0] * (num_ctrl_regions+1)
        edges = sorted(edges)
        self.ibd_regions = []
        for i in range(len(edges)):
            if not edges[i][2]:
                state[edges[i][1]] += 1
            else:
                state[edges[i][1]] -= 1
            # Emit one snapshot per distinct SNP index.
            if i+1 == len(edges) or edges[i][0] != edges[i+1][0]:
                self.ibd_regions.append([edges[i][0], copy.copy(state)])

    @staticmethod
    def _sweep_roh_regions(stretch_array):
        """Return ``[snp_index, per-individual open-ROH counts]`` snapshots.

        ``stretch_array`` holds one list of ``[start, end]`` pairs per
        individual; the sweep is the same edge walk as for IBD regions, with
        one state slot per individual.
        """
        regions = [np.asarray(i) for i in stretch_array]
        edges = []
        for i in range(len(regions)):
            for j in range(len(regions[i])):
                edges.append([regions[i][j][0], i, 0])
                edges.append([regions[i][j][1], i, 1])
        state = [0] * (len(regions))
        edges = sorted(edges)
        snapshots = []
        for i in range(len(edges)):
            if not edges[i][2]:
                state[edges[i][1]] += 1
            else:
                state[edges[i][1]] -= 1
            if (i+1 == len(edges) or edges[i][0] != edges[i+1][0]):
                snapshots.append([edges[i][0], copy.copy(state)])
        return snapshots

    def ibdmapping_gw(self, Windowkb, Stretchkb, numGapSNP, numMinSNP, WindowGap, out, point):
        '''Performs genome-wide IBD mapping based on SNP streak

        Writes the report to ``{out}.txt`` and stores the region summary in
        ``self.ibd_regions``.  ``Windowkb``, ``Stretchkb``, ``numGapSNP``,
        ``numMinSNP`` and ``WindowGap`` are passed through unchanged to
        ``LOCH_MappingTools.LOCHMappingAll``.  When ``point`` is truthy a
        per-SNP table of IBD status per control is written as well.
        '''
        print("********************\n"
              f"IBD mapping started.\ninput file prefix: {self.prefix}\nWindowkb: {Windowkb}\nStretchkb: "
              f"{Stretchkb}\nnumGapSNP: {numGapSNP}\nnumMinSNP: {numMinSNP}\nWindowGap: {WindowGap}\noutput file prefix: {out}")
        with open(f"{out}.txt", 'w') as f:
            # From here on `out` is reused as the text buffer for each write.
            out = (f"Log_for_NonparametricIBDmapping\n\nInput_Genotype_File:\t{self.prefix}.bim/fam/bed\nInput_Case_File:\t{self.case_file}"
                   f"\nNo.SNP:\t{self.numSNP}\nWindow_kb:\t{Windowkb}\nStretch_kb:\t{Stretchkb}\nWindowGap_kb:\t{WindowGap}"
                   f"\nNo.InconsistentSNP:\t{numGapSNP}\nNo.MinSNP:\t{numMinSNP}\n\nNo.Samples:\t{self.numSample}\nNo.Cases:\t{self.numCase}\n"
                   f"\nCase individuals are: \n{self.case_list_print}\n\n")
            f.write(out)

            # --- stretches shared by all cases ---------------------------
            StretchLong = LOCH_MappingTools.LOCHMappingAll(self.case_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
            if point:
                PointHitFlag = LOCH_MappingTools.PointHitonStretch(StretchLong, self.Position)
            # An empty first entry means "no stretch found".
            numStretchLong = len(StretchLong) if len(StretchLong[0]) else 0
            out = f'No.IBD stretch CaseOnly:\t{numStretchLong}'
            print(out)
            out = (f"\nNo.IBD_Stretch_in_All_Cases:\t{numStretchLong}\nIBD_stretch\tChr\tStart_SNP\tEnd_SNP"
                   f"\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
            f.write(out)
            for i in range(numStretchLong):
                f.write(self._stretch_row(i+1, StretchLong[i][0], StretchLong[i][1]))

            # --- stretches shared by all cases plus one control ----------
            if point:
                PointHitResult = [[0] * self.numSNP for i in range(self.numControl)]
            StretchLongControl = [None] * self.numControl
            for i in range(self.numControl):
                case_control_list = self.case_list + [self.control_list[i]]
                case_control_geno = self.snpdata.val[self.snpreader.iid_to_index(case_control_list)]
                StretchLongControl[i] = LOCH_MappingTools.LOCHMappingAll(case_control_geno, self.Position, Windowkb,
                                                                        numGapSNP, numMinSNP, Stretchkb, WindowGap)
                if point:
                    PointHitFlagControl = LOCH_MappingTools.PointHitonStretch(StretchLongControl[i], self.Position)
                    PointHitResult[i] = PointHitFlagControl
                numStretchLongControl = len(StretchLongControl[i]) if len(StretchLongControl[i][0]) else 0
                out = (f"\nNo.IBD_stretch_in_All_Cases_and_1_Control({self.control_list[i][1].decode('utf-8')}):\t{numStretchLongControl}"
                    f"\nIBD_stretch\tChr\tStart_SNP\tEnd_SNP\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
                f.write(out)
                for j in range(numStretchLongControl):
                    f.write(self._stretch_row(j+1, StretchLongControl[i][j][0], StretchLongControl[i][j][1]))
            print('IBD mapping for Cases and Controls finished!!')

            # --- optional per-SNP table -----------------------------------
            if point:
                PointHitSumControl = [0] * len(self.Position)
                for i in range(len(PointHitResult)):
                    for j in range(len(PointHitResult[0])):
                        PointHitSumControl[j] += PointHitResult[i][j]
                out = ("\n\nIndividual_Marker/Sample_IBDstatus_in_IBDregions\n\nSNP\tChr\tbp\tNo.Cases_in_IBD"
                      "\tNo.Controls_in_IBD\tAll_Cases_in_IBD(Yes:1/No:0)")
                f.write(out)
                L = []
                for i in range(len(self.control_list)):
                    L.append("\t{0}_in_IBD(Yes:1/No:0)".format(self.control_list[i][1].decode('utf-8')))
                L.append('\n')
                out = ''.join(L)
                f.write(out)
                # Only SNPs flagged for the case-only stretches are listed.
                for j in range(len(PointHitFlag)):
                    if PointHitFlag[j]:
                        L = [f"{self.SNPID[j].decode('utf-8')}\t{str(self.Chr[j])}\t{str(self.bp[j])}"
                        f"\t{str(len(self.case_list))}\t{str(PointHitSumControl[j])}\t{PointHitFlag[j]}"]
                        for i in range(len(PointHitResult)):
                            L.append(f'\t{PointHitResult[i][j]}')
                        L.append('\n')
                        out = ''.join(L)
                        f.write(out)

            # --- region summary --------------------------------------------
            self._collect_ibd_regions(StretchLong, StretchLongControl)
            L = ["\n\nChr\tStart\tEnd\tIBD_in_Controls\tIBD_Case_specificity"]
            for i in range(len(self.ibd_regions) - 1):
                # Only intervals inside a case-only stretch are reported.
                if self.ibd_regions[i][1][0]!=0:
                    # Specificity = share of controls that do NOT share the interval.
                    prop = round(1.0-sum(self.ibd_regions[i][1][1:])/self.numControl, 3) if self.numControl else 1.0
                    L.append(f"{self.Chr[self.ibd_regions[i][0]]}\t{self.bp[self.ibd_regions[i][0]]}"
                             f"\t{self.bp[self.ibd_regions[i+1][0]]}\t{sum(self.ibd_regions[i][1][1:])}\t{prop}")
            L.append('\nCalculation_finished_at:\t{}\n'.format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            out = '\n'.join(L)
            f.write(out)

    def rohmapping_gw(self, Windowkb, Stretchkb, numGapSNP, numMinSNP, WindowGap, out):
        '''Performs genome-wide runs of homozygosity mapping

        Writes the report to ``{out}.txt`` and stores the per-interval ROH
        counts in ``self.roh_case_regions`` (and ``self.roh_control_regions``
        when there are controls).  The window/stretch parameters are passed
        through unchanged to ``LOCH_MappingTools``.
        '''
        print("********************\n"
              f"ROH mapping started.\ninput file prefix: {self.prefix}\nWindowkb: {Windowkb}\nStretchkb: {Stretchkb}"
              f"\nnumGapSNP: {numGapSNP}\nnumMinSNP: {numMinSNP}\nWindowGap: {WindowGap}\noutput file prefix: {out}")
        with open(f"{out}.txt", 'w') as f:
            # From here on `out` is reused as the text buffer for each write.
            out = (f"Log_for_ROHmapping\n\nInput_File:\t{self.prefix}.ped/map/info/case\nNo.SNP:\t{self.numSNP}\nWindow_kb:\t{Windowkb}"
                   f"\nStretch_kb:\t{Stretchkb}\nWindowGap_kb:\t{WindowGap}\nNo.InconsistentSNP:\t{numGapSNP}"
                   f"\nNo.MinSNP:\t{numMinSNP}\n\nNo.Samples:\t{self.numSample}\nNo.Cases:\t{self.numCase}\n\n")
            f.write(out)

            ROHwin = LOCH_MappingTools.MakeROHonWindowMulti(self.case_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
            StretchLongArray = LOCH_MappingTools.DecideROHStretchMulti(ROHwin, self.Position, Stretchkb, WindowGap)

            # --- ROH per case -------------------------------------------
            for i in range(self.numCase):
                # A first stretch whose end index is 0 is treated as "no ROH".
                numStretchLongCase = len(StretchLongArray[i]) if (StretchLongArray[i][0][1]) else 0
                out = (f"\n\nNo.ROH_in_1_Case({self.case_list[i][1].decode('utf-8')}):\t{numStretchLongCase}"
                       f"\nROH\tChr\tStart_SNP\tEnd_SNP\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
                f.write(out)
                for j in range(len(StretchLongArray[i])):
                    start = StretchLongArray[i][j][0]
                    end = StretchLongArray[i][j][1]
                    if end == 0:
                        continue
                    f.write(self._stretch_row(j+1, start, end))

            # --- ROH per control ------------------------------------------
            if self.numControl:
                control_geno = self.snpdata.val[self.snpreader.iid_to_index(self.control_list)]
                ROHwinControl = LOCH_MappingTools.MakeROHonWindowMulti(control_geno, self.Position, Windowkb, numGapSNP, numMinSNP, Stretchkb, WindowGap)
                StretchLongControl = LOCH_MappingTools.DecideROHStretchMulti(ROHwinControl, self.Position, Stretchkb, WindowGap)

            for i in range(self.numControl):
                numStretchLongControl = len(StretchLongControl[i]) if StretchLongControl[i][0][1] else 0
                out = (f"\n\nNo.ROH_in_1_Control({self.control_list[i][1].decode('utf-8')}):\t{numStretchLongControl}"
                       f"\nROH\tChr\tStart_SNP\tEnd_SNP\tStart_Position(bp)\tEnd_Position(bp)\tLength(bp)\n")
                f.write(out)
                for j in range(len(StretchLongControl[i])):
                    start = StretchLongControl[i][j][0]
                    end = StretchLongControl[i][j][1]
                    if end == 0:
                        continue
                    f.write(self._stretch_row(j+1, start, end))
            print('ROH detection for Cases and Controls finished!!')

            # --- interval summary for cases ---------------------------------
            self.roh_case_regions = self._sweep_roh_regions(StretchLongArray)
            L = ["\n\nChr\tStart\tEnd\tNumber_of_ROH_in_Cases\n"]
            for i in range(len(self.roh_case_regions) - 1):
                if np.sum(self.roh_case_regions[i][1]):
                    L.append(f"{self.Chr[self.roh_case_regions[i][0]]}\t{self.bp[self.roh_case_regions[i][0]]}"
                             f"\t{self.bp[self.roh_case_regions[i+1][0]]}\t{np.sum(self.roh_case_regions[i][1])}\n")
            out = ''.join(L)
            f.write(out)

            # --- interval summary for controls ------------------------------
            if self.numControl:
                self.roh_control_regions = self._sweep_roh_regions(StretchLongControl)
                L = ["\n\nChr\tStart\tEnd\tNumber_of_ROH_in_Controls\n"]
                for i in range(len(self.roh_control_regions) - 1):
                    if np.sum(self.roh_control_regions[i][1]):
                        L.append(f"{self.Chr[self.roh_control_regions[i][0]]}\t{self.bp[self.roh_control_regions[i][0]]}"
                                f"\t{self.bp[self.roh_control_regions[i+1][0]]}\t{np.sum(self.roh_control_regions[i][1])}\n")
                out = ''.join(L)
                f.write(out)

            out = '\nCalculation_finished_at:\t{}\n'.format(datetime.now().strftime("%Y/%m/%d %H:%M:%S"))
            f.write(out)

    def draw_diagram(self, fig_name):
        '''Draws diagrams of mapping results

        Plots ``self.ibd_regions`` (top axis) and ``self.roh_case_regions``
        (bottom axis) along a concatenated 22-chromosome x-axis and saves
        the figure to ``fig_name``.
        '''
        # Hard-coded lengths for chromosomes 1-22.
        # NOTE(review): values look like the human GRCh37/hg19 assembly - confirm.
        chr_length = [249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566]
        chr_names = list(range(1,23))
        size = 22
        gap_length = 10 ** 7
        # Lower-triangular ones matrix: the dot product below is a cumulative
        # sum of (chromosome length + gap), i.e. each chromosome's x offset.
        array = np.triu(np.ones((size,size))).T
        gap = np.full(size, gap_length)
        global_len = np.dot(array, chr_length + gap)
        global_len = np.insert(global_len, 0, 0)
        global_len += gap_length
        fig = plt.figure(figsize=(15,5))
        cmap = plt.get_cmap("tab10")
        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        StChr, EdChr = 0, 22
        Stbp, Edbp = 0, global_len[EdChr]
        for axis in fig.get_axes():
            axis.set_xlim(Stbp, Edbp)

        # --- top panel: IBD case specificity -----------------------------
        ax1.set_ylim(-0.1, 1.0)
        ax1.set_xticklabels([])
        ax1.set_ylabel('IBD Case specificity')
        ax1.title.set_text('IBD mapping results based on SNP streak principle')
        ax1.spines['bottom'].set_visible(False)
        ax1.spines['top'].set_visible(False)
        ax1.tick_params(axis='both', which='both', length=0)
        ax1.axhline(0, color='k', linewidth=0.5)
        ax1.axhline(1, color='k', linewidth=0.5, ls='--')
        # Coloured bar per chromosome below the zero line.
        for i in range(size):
            ax1.add_patch(plt.Rectangle(xy=[global_len[i], -0.1], width=chr_length[i], height=0.05, color=cmap(i%10)))
        for i, reg in enumerate(self.ibd_regions):
            if reg[1][0]!=0:
                prop = 1 - sum(reg[1][1:])/self.numControl if self.numControl else 1.0
                ax1.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), 0],
                              width=self.bp[self.ibd_regions[i+1][0]] - self.bp[reg[0]],
                              height=prop, color=cmap((self.Chr[reg[0]] - 1) % 10)))
                # Red marker with a minimum width so short regions stay visible.
                ax1.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), -0.04],
                              width=max(self.bp[self.ibd_regions[i+1][0]] - self.bp[reg[0]], (Edbp - Stbp)*0.001),
                              height=0.03, color='r'))

        # --- bottom panel: number of cases in ROH ------------------------
        ax2.set_ylim(-self.numCase*0.1, self.numCase)
        ax2.set_xticks([(global_len[i] + global_len[i+1])/2 for i in range(StChr, EdChr)])
        ax2.set_xticklabels(chr_names)
        ax2.set_ylabel('ROH in Cases')
        ax2.title.set_text('Runs of homozygosity detection results')
        ax2.spines['bottom'].set_visible(False)
        ax2.spines['top'].set_visible(False)
        ax2.tick_params(axis='both', which='both', length=0)
        ax2.axhline(0, color='k', linewidth=0.5)
        ax2.axhline(self.numCase, color='k', linewidth=0.5, ls='--')
        for i in range(size):
            ax2.add_patch(plt.Rectangle(xy=[global_len[i], -self.numCase * 0.1], width=chr_length[i],
                                        height=self.numCase * 0.05, color=cmap(i%10)))
        for i, reg in enumerate(self.roh_case_regions):
            numROH = np.sum(reg[1])
            if numROH:
                ax2.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), 0],
                              width=self.bp[self.roh_case_regions[i+1][0]] - self.bp[reg[0]], height=numROH,
                              color=cmap((self.Chr[reg[0]] - 1) % 10)))
                # Highlight intervals where every case is in a ROH.
                if numROH == self.numCase:
                    ax2.add_patch(plt.Rectangle(xy=[(self.bp[reg[0]] + global_len[self.Chr[reg[0]] - 1]), -0.03 * numROH],
                                width=max(self.bp[self.roh_case_regions[i+1][0]] - self.bp[reg[0]], (Edbp - Stbp)*0.001),
                                height=0.02 * numROH, color='r'))
        fig.align_labels()
        plt.savefig(fig_name)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: construct_background_kernel.py Projeto: HealthML/seak

class GRMLoaderSnpReader:
    """Constructs a background kernel :math:`K_0` from given binary PLINK 1 genotype files using the leave-one-out-chromosome (LOCO) strategy.

    Initially no background kernel is constructed, only the instance attributes are initialized. The kernel gets
    constructed when calling the method :func:`compute_background_kernel` which should only be called after calling the
    :func:`update_individuals` method manually or the :func:`seak.data_loaders.intersect_and_update_datasets`.
    This way, individuals which are neither contained in the test nor in the background kernel data set are excluded.

    In full rank case, loads the SNPs in blocks to construct the kernel.
    In low rank case, loads all SNPs into memory at once.

    :param path_or_bed: path prefix to genotype PLINK files for background kernel construction, or an already opened ``SnpReader`` instance
    :param int blocksize: how many genotypes to load at once; should be chosen dependent on RAM available
    :param str/int LOCO_chrom_id: identifier of the chromosome/region that is used in the respective test set and should be excluded from the background kernel or None if all variants should be included
    :param bool forcelowrank: enforce low rank data loading behavior for testing purposes

    .. note:: The leave-one-chromosome-out (LOCO) strategy can be disabled with :attr:`LOCO_chrom_id`.
    """
    def __init__(self,
                 path_or_bed,
                 blocksize,
                 LOCO_chrom_id=None,
                 forcelowrank=False):
        """Constructor."""
        self.forcelowrank = forcelowrank  # only for testing purposes!

        if isinstance(path_or_bed, str):
            self.bed = Bed(path_or_bed, count_A1=True)
        else:
            assert isinstance(
                path_or_bed, SnpReader
            ), 'path_or_bed must either be a path to a bed-file, or an instance of SnpReader.'
            # Fix: the reader instance was validated but never stored, so
            # every later access to ``self.bed`` raised AttributeError.
            self.bed = path_or_bed

        # chromosome should be str, stored positions are 1-based
        # NOTE(review): writing the str view back into ``pos`` presumably casts
        # it to the array's existing dtype again, making this a no-op - confirm.
        self.bed.pos[:, 0] = self.bed.pos[:, 0].astype('str')
        # Table of (fid, iid) pairs, indexed by the individual id as str.
        self.iid_fid = pd.DataFrame(self.bed.iid,
                                    index=self.bed.iid[:, 1].astype(str),
                                    columns=['fid', 'iid'])

        self.variants_to_include = self._get_LOCO_SNV_indices(LOCO_chrom_id)

        self.blocksize = blocksize

        # Filled in by compute_background_kernel().
        self.nb_ind = None
        self.nb_SNVs_unf = None
        self.G0 = None
        self.K0 = None
        self.nb_SNVs_f = None
        self.samples_overlapped = False

    def _get_LOCO_SNV_indices(self, LOCO_chrom_id):
        """Returns list of indices that should be included in the GRM.

        :param str/int LOCO_chrom_id: identifier of the chromosome/region that is used in the respective test set and should be excluded from the background kernel or None if all variants should be included
        :return: numerical indices of the SNVs to include in the background kernel computation
        :rtype: numpy.ndarray or ndarray-like
        """

        if LOCO_chrom_id is None:
            return np.arange(self.bed.sid_count, dtype=int)
        # Keep every variant whose chromosome (compared as str) differs from
        # the left-out identifier.
        on_left_out_chrom = self.bed.pos[:, 0].astype(str) == LOCO_chrom_id
        return np.where(~on_left_out_chrom)[0]

    def update_individuals(self, iids):
        """Sets individuals to include into the background kernel data set based on individual ids (:attr:`iids`).

        Only ``self.bed`` is subset; ``self.iid_fid`` (and therefore
        :func:`get_iids`) keeps listing all individuals.

        :param iids: numpy.Series of individual ids that should be retained for background kernel computation
        """
        iid_fid = self.iid_fid.loc[iids]
        self.bed = self.bed[self.bed.iid_to_index(iid_fid.values), :]
        self.samples_overlapped = True

    def get_iids(self):
        """Returns all individual ids.

        :return: the index of the (fid, iid) table built in the constructor
        :rtype: numpy.ndarray
        """
        return self.iid_fid.index.values

    def _build_G0(self):
        """Low rank case: constructs :math:`G_0` from provided bed file (PLINK 1).

        :return: normalized genotypes :math:`G_0` and number of SNVs that where loaded
        :rtype: numpy.ndarray, int
        """

        temp_genotypes = self.bed[:, self.variants_to_include].read().standardize(Unit()).val

        # TODO: is invariant-filtering really necessary here?
        # A column is invariant when every row equals the first row.
        invariant = (temp_genotypes == temp_genotypes[0, :]).all(0)

        n_filtered = (~invariant).sum()
        temp_genotypes /= np.sqrt(n_filtered)

        return temp_genotypes[:, ~invariant], n_filtered

    def _build_K0_blocked(self):
        """Full rank case: Builds background kernel :math:`K_0` by loading blocks of SNPs from provided bed file (PLINK 1).

        :return: normalized background kernel :math:`K_0` and number of SNVs that where used to built the kernel
        :rtype: numpy.ndarray, int
        """

        # TODO: make use of PySnpTools KernelReader functionality

        K0 = np.zeros([self.nb_ind, self.nb_ind], dtype=np.float32)
        nb_SNVs_filtered = 0
        # Iterate over the variants that are actually included. The original
        # looped up to the unfiltered SNV count, which only added reads of
        # empty blocks once LOCO had removed a chromosome.
        stop = len(self.variants_to_include)

        for start in range(0, stop, self.blocksize):
            # Slicing clamps at the end, so the last (short) block needs no
            # special case.
            block = self.variants_to_include[start:start + self.blocksize]
            temp_genotypes = self.bed[:, block].read().standardize(Unit()).val

            # TODO: is invariant-filtering really necessary here?
            invariant = (temp_genotypes == temp_genotypes[0, :]).all(0)
            varying = temp_genotypes[:, ~invariant]

            K0 += np.matmul(varying, varying.T)
            nb_SNVs_filtered += (~invariant).sum()

        return K0 / nb_SNVs_filtered, nb_SNVs_filtered

    def compute_background_kernel(self):
        """Computes background kernel :math:`K_0` for given set of genotypes (binary PLINK 1 files).

        Overlap with data of set to be tested should have been carried out before, such that individuals in both data
        sets match.
        Does not return anything but sets instance attributes for either the background kernel :math:`K_0` or the
        background kernel genotype matrix :math:`G_0`.
        """
        if not self.samples_overlapped:
            logging.warning(
                'Data to construct background kernel was not overlapped with data of set to be tested.'
            )
        self.nb_ind = self.bed.iid_count
        self.nb_SNVs_unf = self.bed.sid_count
        print('# of individuals for background kernel: {}'.format(self.nb_ind))
        print('# of (unfiltered) SNVs for background kernel: {}'.format(
            self.nb_SNVs_unf))
        # low rank: more individuals than SNVs (or forced) -> keep G0 only
        if self.nb_ind > self.nb_SNVs_unf or self.forcelowrank:
            self.G0, self.nb_SNVs_f = self._build_G0()
            self.K0 = None
        # full rank: build K0 block by block
        else:
            self.G0 = None
            self.K0, self.nb_SNVs_f = self._build_K0_blocked()
        print('# of filtered SNVs for background kernel: {}'.format(
            self.nb_SNVs_f))

    def write_kernel(self, path, filetype='hdf5'):
        """Write constructed background kernel :math:`K_0` to file, using either pysnptools.kernelreader.KernelHdf5 or pysnptools.kernelreader.KernelNpz.

        :param str path: Path to the output file to be created.
        :param str filetype: Either 'hdf5' or 'npz'
        :raises ValueError: if no kernel has been computed (low rank case or :func:`compute_background_kernel` not called), or if :attr:`filetype` is unknown
        """
        if self.K0 is None:
            if self.G0 is not None:
                # Message fixed: the low rank branch is taken when there are
                # MORE individuals than variants (or forcelowrank is set).
                raise ValueError(
                    'G0 is initialized: Number of individuals > number of variants (or forcelowrank is set). In this case no kernel is constructed.'
                )
            raise ValueError(
                'K0 is not initialized, need to call compute_background_kernel() first'
            )

        # K0 is built from ``self.bed``, so its rows/columns follow
        # ``self.bed.iid``. ``self.iid_fid`` still lists all individuals after
        # update_individuals() and would no longer match K0 in that case.
        kernel_iid = self.bed.iid
        if filetype == 'hdf5':
            KernelHdf5.write(path, KernelData(kernel_iid, val=self.K0))
        elif filetype == 'npz':
            KernelNpz.write(path, KernelData(kernel_iid, val=self.K0))
        else:
            raise ValueError(
                'filetype has to be either "npz" or "hdf5", got {}'.format(
                    filetype))

Exemplo n.º 5

0

Exibir arquivo

Arquivo: tutorial.py Projeto: MicrosoftGenomics/PySnpTools

# [   5 4001 4001]
# [   5 4002 4002]
# ..., 
# [   5 4997 4997]
# [   5 4998 4998]
# [   5 4999 4999]]

# In one line.
# NOTE(review): `snpreader` is used here before the assignment below; it is
# presumably created in an earlier part of the tutorial that is not shown.
chr5data = Bed("all.bed")[:,snpreader.pos[:,0] == 5].read()

# You can turn iid or sid names into indexes
snpreader = Bed("all.bed")
# Each iid is given as a pair of strings.
iid0 =[['cid499P1','cid499P1'],
      ['cid489P1','cid489P1'],
      ['cid479P1','cid479P1']]
indexes0 = snpreader.iid_to_index(iid0)
print indexes0
#array([499, 489, 479])
# Index the reader with the looked-up row indexes (all columns kept).
snpreader0 = snpreader[indexes0,:]
print snpreader0.iid
#[['cid499P1' 'cid499P1']
# ['cid489P1' 'cid489P1']
# ['cid479P1' 'cid479P1']]

# More condensed: look up and index in a single expression
snpreader0 = snpreader[snpreader.iid_to_index(iid0),:]

# Both at once: select the three individuals and the rows of `pos` whose
# first column equals 5, then read
snpdata0chr5 = snpreader[snpreader.iid_to_index(iid0),snpreader.pos[:,0] == 5].read()
print np.mean(snpdata0chr5.val)
# 1.493