Example #1
 def test_multiphase_partition_coef(self):
     m = op.phases.MultiPhase(network=self.net,
                              phases=[self.water, self.air, self.oil])
     x, y, z = self.net["pore.coords"].T
     ps_water = self.net.Ps[(y <= 3) + (y >= 8)]
     ps_air = self.net.Ps[(y > 3) * (y < 6)]
     ps_oil = self.net.Ps[(y >= 6) * (y < 8)]
     # Phase arrangement (y-axis): W | A | O | W
     m.set_occupancy(phase=self.water, pores=ps_water)
     m.set_occupancy(phase=self.air, pores=ps_air)
     m.set_occupancy(phase=self.oil, pores=ps_oil)
     const = op.models.misc.constant
     K_air_water = 2.0
     K_air_oil = 1.8
     K_water_oil = 0.73
     m.set_binary_partition_coef(propname="throat.partition_coef",
                                 phases=[self.air, self.water],
                                 model=const,
                                 value=K_air_water)
     m.set_binary_partition_coef(propname="throat.partition_coef",
                                 phases=[self.air, self.oil],
                                 model=const,
                                 value=K_air_oil)
     m.set_binary_partition_coef(propname="throat.partition_coef",
                                 phases=[self.water, self.oil],
                                 model=const,
                                 value=K_water_oil)
     K_aw = m["throat.partition_coef.air:water"]
     K_ao = m["throat.partition_coef.air:oil"]
     K_wo = m["throat.partition_coef.water:oil"]
     K_global = m["throat.partition_coef.all"]
     assert sp.isclose(K_aw.mean(), K_air_water)
     assert sp.isclose(K_ao.mean(), K_air_oil)
     assert sp.isclose(K_wo.mean(), K_water_oil)
     # Get water-air interface throats
     tmp1 = self.net.find_neighbor_throats(ps_water, mode="xor")
     tmp2 = self.net.find_neighbor_throats(ps_air, mode="xor")
     Ts_water_air_interface = sp.intersect1d(tmp1, tmp2)
     # Get air-oil interface throats
     tmp1 = self.net.find_neighbor_throats(ps_air, mode="xor")
     tmp2 = self.net.find_neighbor_throats(ps_oil, mode="xor")
     Ts_air_oil_interface = sp.intersect1d(tmp1, tmp2)
     # Get oil-water interface throats
     tmp1 = self.net.find_neighbor_throats(ps_oil, mode="xor")
     tmp2 = self.net.find_neighbor_throats(ps_water, mode="xor")
     Ts_oil_water_interface = sp.intersect1d(tmp1, tmp2)
     # K_global for water-air interface must be 1/K_air_water
     assert sp.isclose(K_global[Ts_water_air_interface].mean(),
                       1 / K_air_water)
     # K_global for air-oil interface must be K_air_oil (not 1/K_air_oil)
     assert sp.isclose(K_global[Ts_air_oil_interface].mean(), K_air_oil)
     # K_global for oil-water interface must be 1/K_water_oil
     assert sp.isclose(K_global[Ts_oil_water_interface].mean(),
                       1 / K_water_oil)
     # K_global for single-phase regions must be 1.0
     interface_throats = sp.hstack(
         (Ts_water_air_interface, Ts_air_oil_interface,
          Ts_oil_water_interface))
     Ts_single_phase = sp.setdiff1d(self.net.Ts, interface_throats)
     assert sp.isclose(K_global[Ts_single_phase].mean(), 1.0)
Example #2
def subsetsWithFits(fileNumString,onlyNew=False):
    """
    Find data subsets (N) that have models that have been fit to
    all conditions.
    
    onlyNew (False)         : Optionally include only subsets that have
                              fits that are not included in the current
                              combined fitProbs.
    """
    fpd = loadFitProbData(fileNumString)
    saveFilename = fpd.values()[0]['saveFilename']
    
    Nlist = []
    for N in scipy.sort(fpd.keys()):
        # find models that have been fit to all conditions
        if len(fpd[N]['fitProbDataList']) == 1:
            fitModels = fpd[N]['fitProbDataList'][0]['logLikelihoodDict'].keys()
        else:
            # scipy.intersect1d takes exactly two arrays, so fold it over the
            # per-condition key lists to intersect across all conditions
            fitModels = reduce(scipy.intersect1d,
                               [fp['logLikelihoodDict'].keys()
                                for fp in fpd[N]['fittingProblemList']])
        if onlyNew:
            Nfilename = directoryPrefixNonly(fileNumString,N)+'/'+saveFilename
            fileExists = os.path.exists(Nfilename)
            if not fileExists: # no combined file exists
                if len(fitModels) > 0:
                    Nlist.append(N)
            else: # check which fit models are currently included in the saved file
                fpMultiple = load(Nfilename)
                fitModelsSaved = fpMultiple.logLikelihoodDict.keys()
                if len(scipy.intersect1d(fitModels,fitModelsSaved)) < len(fitModels):
                    Nlist.append(N)
        else:
            if len(fitModels) > 0:
                Nlist.append(N)
    return Nlist
Example #3
    def eval_func(self, gean):
        line_list = self.line_list.copy()
        count = 0

        for i, empty in enumerate(self.empty_list):
            for j in empty[1]:
                line_list[i][j] = gean[count]
                count += 1

        row_list = sci.array([
            line_list[:, 0], line_list[:, 1], line_list[:, 2], line_list[:, 3],
            line_list[:, 4], line_list[:, 5], line_list[:, 6], line_list[:, 7],
            line_list[:, 8]
        ])

        block_list = sci.array([
            sci.append(line_list[0:3, 0:1],
                       [line_list[0:3, 1:2], line_list[0:3, 2:3]]),
            sci.append(line_list[0:3, 3:4],
                       [line_list[0:3, 4:5], line_list[0:3, 5:6]]),
            sci.append(line_list[0:3, 6:7],
                       [line_list[0:3, 7:8], line_list[0:3, 8:9]]),
            sci.append(line_list[3:6, 0:1],
                       [line_list[3:6, 1:2], line_list[3:6, 2:3]]),
            sci.append(line_list[3:6, 3:4],
                       [line_list[3:6, 4:5], line_list[3:6, 5:6]]),
            sci.append(line_list[3:6, 6:7],
                       [line_list[3:6, 7:8], line_list[3:6, 8:9]]),
            sci.append(line_list[6:9, 0:1],
                       [line_list[6:9, 1:2], line_list[6:9, 2:3]]),
            sci.append(line_list[6:9, 3:4],
                       [line_list[6:9, 4:5], line_list[6:9, 5:6]]),
            sci.append(line_list[6:9, 6:7],
                       [line_list[6:9, 7:8], line_list[6:9, 8:9]])
        ])

        value = 0
        """
        Scan each vertical line, horizontal line, and block, adding one point
        per distinct digit it contains. If the grid is complete, the score
        should be 9×9×3 = 243 (for a 9×9 grid).
        """
        "縦"
        for line in line_list:
            value += len(sci.intersect1d(line, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        "横"
        for row in row_list:
            value += len(sci.intersect1d(row, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        "ブロック"
        for block in block_list:
            value += len(sci.intersect1d(block, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        return value
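A quick sanity check of the scoring rule described in the docstring (not from the original project; it assumes the same legacy `import scipy as sci` numpy aliases the snippet uses):

import scipy as sci

solved_line = sci.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
broken_line = sci.array([1, 1, 3, 4, 5, 6, 7, 8, 9])  # duplicate 1, missing 2
print(len(sci.intersect1d(solved_line, [1, 2, 3, 4, 5, 6, 7, 8, 9])))  # 9
print(len(sci.intersect1d(broken_line, [1, 2, 3, 4, 5, 6, 7, 8, 9])))  # 8
# A fully solved grid scores 9 per unit across 9 verticals, 9 horizontals,
# and 9 blocks: 9 * 9 * 3 == 243, matching the docstring.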
Example #4
def intersect_rows(array1, array2, index=None):
    """Return intersection of rows"""

    if (array1.shape[0] == 0):
        if index == True:
            return (array1, sp.zeros((0, )), sp.zeros((0, )))
        else:
            return array1
    if (array2.shape[0] == 0):
        if index == True:
            return (array2, sp.zeros((0, )), sp.zeros((0, )))
        else:
            return array2

    array1_v = array1.view([('', array1.dtype)] * array1.shape[1])
    array2_v = array2.view([('', array2.dtype)] * array2.shape[1])
    array_i = sp.intersect1d(array1_v, array2_v)

    if index == True:
        a1_i = sp.where(sp.in1d(array1_v, array_i))[0]
        a2_i = sp.where(sp.in1d(array2_v, array_i))[0]
        return (array_i.view(array1.dtype).reshape(array_i.shape[0],
                                                   array1.shape[1]), a1_i,
                a2_i)
    else:
        return array_i.view(array1.dtype).reshape(array_i.shape[0],
                                                  array1.shape[1])
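A minimal usage sketch for intersect_rows (illustrative, not from the original project): the structured-array view makes each row compare as a single record, so sp.intersect1d matches whole rows at once. It assumes the function defined above plus the legacy `import scipy as sp` numpy aliases.

a = sp.array([[1, 2], [3, 4], [5, 6]])
b = sp.array([[3, 4], [7, 8], [1, 2]])
print(intersect_rows(a, b))                        # [[1 2] [3 4]]
rows, a_idx, b_idx = intersect_rows(a, b, index=True)
print(a_idx, b_idx)                                # [0 1] [0 2]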
Example #5
def eval_func(line_list, empty_list, geanSize):
# #    gean = [0,0,1,0,1,1,1,0,1,1]
    line_list, empty_list, kouho_list, geanSize = nimotu_init()
    print (sorted(set(list(iter.permutations(kouho_list , 3)))))
    count = 0
    for i, empty in enumerate(empty_list):
        for j in empty[1]:
            cell = ""
            for k in geanSize[count:count + 4]:
                cell += str(int(k))
            if (0 < int(cell, 2) and int(cell, 2) < 10):
                line_list[i][j] = int(cell, 2)
            count += 4
            
    row_list = sci.array([line_list[:, 0],
                  line_list[:, 1],
                  line_list[:, 2],
                  line_list[:, 3],
                  line_list[:, 4],
                  line_list[:, 5],
                  line_list[:, 6],
                  line_list[:, 7],
                  line_list[:, 8]])
 
    block_list = sci.array([sci.append(line_list[0:3, 0:1], [line_list[0:3, 1:2], line_list[0:3, 2:3]]),
                       sci.append(line_list[0:3, 3:4], [line_list[0:3, 4:5], line_list[0:3, 5:6]]),
                       sci.append(line_list[0:3, 6:7], [line_list[0:3, 7:8], line_list[0:3, 8:9]]),
                        sci.append(line_list[3:6, 0:1], [line_list[3:6, 1:2], line_list[3:6, 2:3]]),
                        sci.append(line_list[3:6, 3:4], [line_list[3:6, 4:5], line_list[3:6, 5:6]]),
                        sci.append(line_list[3:6, 6:7], [line_list[3:6, 7:8], line_list[3:6, 8:9]]),
                        sci.append(line_list[6:9, 0:1], [line_list[6:9, 1:2], line_list[6:9, 2:3]]),
                        sci.append(line_list[6:9, 3:4], [line_list[6:9, 4:5], line_list[6:9, 5:6]]),
                        sci.append(line_list[6:9, 6:7], [line_list[6:9, 7:8], line_list[6:9, 8:9]])])

#     print(line_list)
    value = 0
    for line in line_list:
        value += len(sci.intersect1d(line, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
#         value -= len(sci.where(line == 0))
    for row in row_list:
        value += len(sci.intersect1d(row, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
#         value -= len(sci.where(row == 0))
    for block in block_list:
        value += len(sci.intersect1d(block, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
#         value -= len(sci.where(block == 0))
#     print ("value:" + str(value))
    return value
Example #6
    def eval_func(self, gean):
        line_list = self.line_list.copy()
        count = 0
 
        for i, empty in enumerate(self.empty_list):
            for j in empty[1]:
                line_list[i][j] = gean[count]
                count += 1
                
        row_list = sci.array([line_list[:, 0],
                      line_list[:, 1],
                      line_list[:, 2],
                      line_list[:, 3],
                      line_list[:, 4],
                      line_list[:, 5],
                      line_list[:, 6],
                      line_list[:, 7],
                      line_list[:, 8]])
     
        block_list = sci.array([sci.append(line_list[0:3, 0:1], [line_list[0:3, 1:2], line_list[0:3, 2:3]]),
                           sci.append(line_list[0:3, 3:4], [line_list[0:3, 4:5], line_list[0:3, 5:6]]),
                           sci.append(line_list[0:3, 6:7], [line_list[0:3, 7:8], line_list[0:3, 8:9]]),
                            sci.append(line_list[3:6, 0:1], [line_list[3:6, 1:2], line_list[3:6, 2:3]]),
                            sci.append(line_list[3:6, 3:4], [line_list[3:6, 4:5], line_list[3:6, 5:6]]),
                            sci.append(line_list[3:6, 6:7], [line_list[3:6, 7:8], line_list[3:6, 8:9]]),
                            sci.append(line_list[6:9, 0:1], [line_list[6:9, 1:2], line_list[6:9, 2:3]]),
                            sci.append(line_list[6:9, 3:4], [line_list[6:9, 4:5], line_list[6:9, 5:6]]),
                            sci.append(line_list[6:9, 6:7], [line_list[6:9, 7:8], line_list[6:9, 8:9]])])
    
        value = 0
        """
        Scan each vertical line, horizontal line, and block, adding one point
        per distinct digit it contains. If the grid is complete, the score
        should be 9×9×3 = 243 (for a 9×9 grid).
        """
        "縦"
        for line in line_list:
            value += len(sci.intersect1d(line, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        "横"
        for row in row_list:
            value += len(sci.intersect1d(row, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        "ブロック"
        for block in block_list:
            value += len(sci.intersect1d(block, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        return value
Example #7
def plot_overlap_ps(result_file, ss_file='/Users/bjarnivilhjalmsson/data/GIANT/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
                   fig_filename='/Users/bjarnivilhjalmsson/data/tmp/manhattan_combPC_HGT.png', method='combPC',
                   ylabel='Comb. PC (HIP,WC,HGT,BMI) $-log_{10}(P$-value$)$', xlabel='Height $-log_{10}(P$-value$)$', p_thres=0.00001):
    # Parse results and SS file
    res_table = pandas.read_table(result_file)
    ss_table = pandas.read_table(ss_file)
    # Parse 
    res_sids = sp.array(res_table['SNPid'])
    if method == 'MVT':
        comb_ps = sp.array(res_table['pval'])
    elif method == 'combPC':
        comb_ps = sp.array(res_table['combPC'])
    if 'MarkerName' in ss_table.keys():
        ss_sids = sp.array(ss_table['MarkerName'])
    elif 'SNP' in ss_table.keys():
        ss_sids = sp.array(ss_table['SNP'])
    else:
        raise Exception("Don't know where to look for rs IDs")
    marg_ps = sp.array(ss_table['p'])
    
    # Filtering boring p-values
    res_p_filter = comb_ps < p_thres
    res_sids = res_sids[res_p_filter]
    comb_ps = comb_ps[res_p_filter]
#     ss_p_filter = marg_ps<p_thres
#     ss_sids = ss_sids[ss_p_filter]
#     marg_ps = marg_ps[ss_p_filter]
    
    common_sids = sp.intersect1d(res_sids, ss_sids)
    print 'Found %d SNPs in common' % (len(common_sids))
    ss_filter = sp.in1d(ss_sids, common_sids)
    res_filter = sp.in1d(res_sids, common_sids)
    
    ss_sids = ss_sids[ss_filter]
    res_sids = res_sids[res_filter]
    marg_ps = marg_ps[ss_filter]
    comb_ps = comb_ps[res_filter]
    
    print 'Now sorting'
    ss_index = sp.argsort(ss_sids)
    res_index = sp.argsort(res_sids)
    
    marg_ps = -sp.log10(marg_ps[ss_index])
    comb_ps = -sp.log10(comb_ps[res_index])
    
    with plt.style.context('fivethirtyeight'):
        plt.plot(marg_ps, comb_ps, 'b.', alpha=0.2)
        (x_min, x_max) = plt.xlim()
        (y_min, y_max) = plt.ylim()
        
        plt.plot([x_min, x_max], [y_min, y_max], 'k--', alpha=0.2)
        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        plt.tight_layout()
        plt.savefig(fig_filename)
    plt.clf()
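The match-and-align idiom above (intersect1d to find the shared IDs, in1d to filter each table, argsort on the IDs so both value vectors end up in the same order) is generic; a small self-contained sketch with toy data (illustrative names, plain numpy in place of the sp aliases):

import numpy as np

ids_a = np.array(['rs3', 'rs1', 'rs2'])
vals_a = np.array([0.3, 0.1, 0.2])
ids_b = np.array(['rs2', 'rs4', 'rs1'])
vals_b = np.array([2.0, 4.0, 1.0])

common = np.intersect1d(ids_a, ids_b)   # ['rs1' 'rs2']
keep_a = np.in1d(ids_a, common)
keep_b = np.in1d(ids_b, common)
ids_a, vals_a = ids_a[keep_a], vals_a[keep_a]
ids_b, vals_b = ids_b[keep_b], vals_b[keep_b]
vals_a = vals_a[np.argsort(ids_a)]      # [0.1 0.2]
vals_b = vals_b[np.argsort(ids_b)]      # [1.0 2.0] -- aligned with vals_a by SNP ID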
Example #8
def match_samples(*sampleIDs):
    sampleID_common = sampleIDs[0]
    for sampleID in sampleIDs:
        sampleID_common = sp.intersect1d(sampleID, sampleID_common)
    idxs = []
    for sampleID in sampleIDs:
        _idxs = sp.array(
            [sp.where(sampleID == sample)[0][0] for sample in sampleID_common])
        assert (sampleID[_idxs] == sampleID_common).all()
        idxs.append(_idxs)
    return idxs
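A toy usage sketch (illustrative IDs; assumes the legacy `import scipy as sp` aliases used above): match_samples returns, for each input array, the indices that pick out the common samples in one shared order.

ids1 = sp.array(['s1', 's2', 's3'])
ids2 = sp.array(['s3', 's1'])
idx1, idx2 = match_samples(ids1, ids2)
print(ids1[idx1], ids2[idx2])  # ['s1' 's3'] ['s1' 's3'] -- aligned element-wise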
Example #9
def reduce(PSI_l, Xl, coverage_threshold):
    """
    Computes set cover reduction to get the most relevant samples that define the class Xl.
    :param PSI_l: (Nl x 2) matrix containing both the scale and the shape of the weibull distribution
    :param Xl: (Nl x dimension_feature_vector) matrix containing the feature vectors of each instance of a class
    :param coverage_threshold: Probability above which we consider an instance to be not enough representative of its class
    :return: The indexes of the most representative samples of a class
    """
    #This matrix D is symmetric
    D = ppp_cosine_similarity(Xl, Xl)
    # Number of instances of the class
    Nl = np.shape(D)[0]

    S = []
    for i in range(Nl):
        Si = []
        for j in range(Nl):
            if (psi_i_dist(D[i, j], PSI_l[i, 0], PSI_l[i, 1]) >=
                    coverage_threshold):
                # Sample i is redundant with respect to j
                Si.append(j)
        S.append(Si)
    # Universe
    U = list(range(0, Nl))
    # Covered index
    C = []
    # Final indexs
    I = []

    #Set Cover Implementation
    while (len(scipy.intersect1d(C, U)) != len(U)):
        # punct_ref is a counter to find the maximum in every iteration
        punct_ref = 0
        # ind represent the index that we will append to our index's list
        ind = 0
        index_s = 0
        for s in S:
            punct = 0
            relative_inclusion = scipy.isin(s, C)
            for eleme in relative_inclusion:
                if not eleme:  # `is False` never matches a numpy bool_
                    punct += 1
            if punct >= punct_ref:
                punct_ref = punct  # track the running maximum; otherwise ind is just the last index
                ind = index_s
            index_s += 1

        C = scipy.union1d(C, S[ind])
        I.append(ind)
        S.remove(S[ind])
        if (len(S) == 0):
            break
    return I
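For reference, a minimal self-contained sketch of the greedy set-cover loop this function implements, stripped of the Weibull machinery (all names illustrative, not from the original project):

import numpy as np

def greedy_set_cover(sets, universe):
    """Greedily pick set indices until every element of universe is covered."""
    covered = np.array([], dtype=int)
    chosen = []
    remaining = list(enumerate(sets))
    while len(np.intersect1d(covered, universe)) != len(set(universe)) and remaining:
        # take the set contributing the most still-uncovered elements
        idx, best = max(remaining, key=lambda t: len(np.setdiff1d(t[1], covered)))
        covered = np.union1d(covered, best)
        chosen.append(idx)
        remaining.remove((idx, best))
    return chosen

print(greedy_set_cover([[0, 1], [1, 2], [3]], [0, 1, 2, 3]))  # [0, 1, 2]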
Example #10
	def acceptableindices(self,list,min,max,datalimit):
		array = scipy.array(list)
		larger = scipy.where(array >= min)
		smaller = scipy.where(array <= max)
		goodarrayindices = scipy.intersect1d(larger[0],smaller[0])
		goodindices = goodarrayindices.tolist()
		# The current tilt series may need an extra index to make up the number
		while len(goodindices) < datalimit and len(goodindices) > 0 and len(list) >= datalimit:
			nextindex = goodindices[-1]+1
			if nextindex not in range(0,len(list)):
				break
			goodindices.append(goodindices[-1]+1)
		return goodindices
Example #11
def subsetsWithFits(fileNumString, onlyNew=False):
    """
    Find data subsets (N) that have models that have been fit to
    all conditions.
    
    onlyNew (False)         : Optionally include only subsets that have
                              fits that are not included in the current
                              combined fitProbs.
    """
    fpd = loadFitProbData(fileNumString)
    saveFilename = fpd.values()[0]['saveFilename']

    Nlist = []
    for N in scipy.sort(fpd.keys()):
        # find models that have been fit to all conditions
        if len(fpd[N]['fitProbDataList']) == 1:
            fitModels = fpd[N]['fitProbDataList'][0]['logLikelihoodDict'].keys(
            )
        else:
            # scipy.intersect1d takes exactly two arrays, so fold it over the
            # per-condition key lists to intersect across all conditions
            fitModels = reduce(scipy.intersect1d,
                               [fp['logLikelihoodDict'].keys()
                                for fp in fpd[N]['fittingProblemList']])
        if onlyNew:
            Nfilename = directoryPrefixNonly(fileNumString,
                                             N) + '/' + saveFilename
            fileExists = os.path.exists(Nfilename)
            if not fileExists:  # no combined file exists
                if len(fitModels) > 0:
                    Nlist.append(N)
            else:  # check which fit models are currently included in the saved file
                fpMultiple = load(Nfilename)
                fitModelsSaved = fpMultiple.logLikelihoodDict.keys()
                if len(scipy.intersect1d(fitModels,
                                         fitModelsSaved)) < len(fitModels):
                    Nlist.append(N)
        else:
            if len(fitModels) > 0:
                Nlist.append(N)
    return Nlist
Example #12
def get_event_ids_from_gene(gene_id, event_type, outdir, confidence):

    eids = []
    IN = h5py.File(
        os.path.join(
            outdir,
            'merge_graphs_%s_C%i.counts.hdf5' % (event_type, confidence)), 'r')
    if 'conf_idx' in IN and IN['conf_idx'].shape[0] > 0:
        cidx = IN['conf_idx'][:]
        gidx = sp.where(IN['gene_idx'][:] == gene_id)[0]
        eids.extend(sp.intersect1d(cidx, gidx))
    IN.close()

    return eids
Example #13
 def acceptableindices(self, list, min, max, datalimit):
     array = scipy.array(list)
     larger = scipy.where(array >= min)
     smaller = scipy.where(array <= max)
     goodarrayindices = scipy.intersect1d(larger[0], smaller[0])
     goodindices = goodarrayindices.tolist()
     # The current tilt series may need an extra index to make up the number
     while len(goodindices) < datalimit and len(goodindices) > 0 and len(
             list) >= datalimit:
         nextindex = goodindices[-1] + 1
         if nextindex not in range(0, len(list)):
             break
         goodindices.append(goodindices[-1] + 1)
     return goodindices
Example #14
File: filters.py  Project: dynaryu/eqrm
def source_model_threshold_distance_subset(distances,
                                           source_model,
                                           atten_threshold_distance):
    """
    source_model_threshold_distance_subset
    Calculate the distances of the event_set from the sites array. For those
    events less than or equal to the attenuation threshold, return a subset
    source model so that calc_and_save_SA only works on those events.

    calc_and_save_SA calculates an SA figure by getting a subset of event
    indices:

    for source in source_model:
        event_inds = source.get_event_set_indexes()
        if len(event_inds) == 0:
            continue
        sub_event_set = event_set[event_inds]

    Returns source_model_subset
    """
    # A rethink of apply_threshold distance
    # Calculate the distances of the event_set from the sites array and
    # return an event_set where distance <= atten_threshold_distance
    Rjb = distances.distance('Joyner_Boore')

    # distances is an ndarray where [sites, events]. We only want the events
    # dimension for this function as we're trimming events
    (sites_to_keep, events_to_keep) = where(Rjb <= atten_threshold_distance)

    source_model_subset = copy.deepcopy(source_model)
    # Re-sync the event indices in the source model. As we don't want to add
    # events that may already be excluded by generate_synthetic_events_fault(),
    # do the following
    # 1. Grab the event set already calculated
    # 2. The intersection of this and events_to_keep is what we want
    for source in source_model_subset:
        source_indices = source.get_event_set_indexes()
        source.set_event_set_indexes(
            intersect1d(source_indices, events_to_keep))

    return source_model_subset
Example #15
    def find_connecting_throat(self,P1,P2):
        r"""
        Return the throat number connecting two given pores

        Parameters
        ----------
        P1 , P2 : int
            The pore numbers connected by the desired throat

        Returns
        -------
        Tnum : int
            Returns throat number, or empty array if pores are not connected
            
        Examples
        --------
        >>> pn = OpenPNM.Network.Cubic(name='doc_test').generate(divisions=[5,5,5],lattice_spacing=[1])
        >>> pn.find_connecting_throat(0,1)
        array([0])
        """
        return sp.intersect1d(self.find_neighbor_throats(P1),self.find_neighbor_throats(P2))
Example #16
def source_model_threshold_distance_subset(distances, source_model,
                                           atten_threshold_distance):
    """
    source_model_threshold_distance_subset
    Calculate the distances of the event_set from the sites array. For those
    events less than or equal to the attenuation threshold, return a subset
    source model so that calc_and_save_SA only works on those events.

    calc_and_save_SA calculates an SA figure by getting a subset of event
    indices:

    for source in source_model:
        event_inds = source.get_event_set_indexes()
        if len(event_inds) == 0:
            continue
        sub_event_set = event_set[event_inds]

    Returns source_model_subset
    """
    # A rethink of apply_threshold distance
    # Calculate the distances of the event_set from the sites array and
    # return an event_set where distance <= atten_threshold_distance
    Rjb = distances.distance('Joyner_Boore')

    # distances is an ndarray where [sites, events]. We only want the events
    # dimension for this function as we're trimming events
    (sites_to_keep, events_to_keep) = where(Rjb <= atten_threshold_distance)

    source_model_subset = copy.deepcopy(source_model)
    # Re-sync the event indices in the source model. As we don't want to add
    # events that may already be excluded by generate_synthetic_events_fault(),
    # do the following
    # 1. Grab the event set already calculated
    # 2. The intersection of this and events_to_keep is what we want
    for source in source_model_subset:
        source_indices = source.get_event_set_indexes()
        source.set_event_set_indexes(
            intersect1d(source_indices, events_to_keep))

    return source_model_subset
Example #17
    def find_connecting_throat(self, P1, P2):
        r"""
        Return the throat number connecting pairs of pores

        Parameters
        ----------
        P1 , P2 : array_like
            The pore numbers whose throats are sought.  These can be vectors
            of pore numbers, but must be the same length

        Returns
        -------
        Tnum : list of list of int
            Returns throat number(s), or empty array if pores are not connected

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_connecting_throat([0, 1, 2], [2, 2, 2])
        [[], [3], []]

        TODO: This now works on 'vector' inputs, but is not actually vectorized
        in the Numpy sense, so could be slow with large P1,P2 inputs
        """
        P1 = self._parse_locations(P1)
        P2 = self._parse_locations(P2)
        Ts1 = self.find_neighbor_throats(P1, flatten=False)
        Ts2 = self.find_neighbor_throats(P2, flatten=False)
        Ts = []

        for row in range(0, len(P1)):
            if P1[row] == P2[row]:
                throat = []
            else:
                throat = sp.intersect1d(Ts1[row], Ts2[row]).tolist()
            Ts.insert(0, throat)
        Ts.reverse()
        return Ts
Example #18
    def find_connecting_throat(self, P1, P2):
        r"""
        Return the throat number connecting pairs of pores

        Parameters
        ----------
        P1 , P2 : array_like
            The pore numbers whose throats are sought.  These can be vectors
            of pore numbers, but must be the same length

        Returns
        -------
        Tnum : list of list of int
            Returns throat number(s), or empty array if pores are not connected

        Examples
        --------
        >>> import OpenPNM
        >>> pn = OpenPNM.Network.TestNet()
        >>> pn.find_connecting_throat([0, 1, 2], [2, 2, 2])
        [[], [3], []]

        TODO: This now works on 'vector' inputs, but is not actually vectorized
        in the Numpy sense, so could be slow with large P1,P2 inputs
        """
        P1 = self._parse_locations(P1)
        P2 = self._parse_locations(P2)
        Ts1 = self.find_neighbor_throats(P1, flatten=False)
        Ts2 = self.find_neighbor_throats(P2, flatten=False)
        Ts = []

        for row in range(0, len(P1)):
            if P1[row] == P2[row]:
                throat = []
            else:
                throat = sp.intersect1d(Ts1[row], Ts2[row]).tolist()
            Ts.insert(0, throat)
        Ts.reverse()
        return Ts
Example #19
def intersect_rows(array1, array2, index = None):
    """Return intersection of rows"""

    if (array1.shape[0] == 0):
        if index == True:
            return (array1, sp.zeros((0,)), sp.zeros((0,)))
        else:
            return array1
    if (array2.shape[0] == 0):
        if index == True:
            return (array2, sp.zeros((0,)), sp.zeros((0,)))
        else:
            return array2

    array1_v = array1.view([('', array1.dtype)] * array1.shape[1])
    array2_v = array2.view([('', array2.dtype)] * array2.shape[1])
    array_i = sp.intersect1d(array1_v, array2_v)

    if index == True:
        a1_i = sp.where(sp.in1d(array1_v, array_i))[0]
        a2_i = sp.where(sp.in1d(array2_v, array_i))[0]
        return (array_i.view(array1.dtype).reshape(array_i.shape[0], array1.shape[1]), a1_i, a2_i)
    else:
        return array_i.view(array1.dtype).reshape(array_i.shape[0], array1.shape[1])
Example #20
 fw = open('Data/NTotalGenesAndGeneSetInfoOfAlpha.tsv','w')
 for Alpha in DataDict['AlphaLvls']:
     if(Alpha==0.0):
         continue
     NTotalGenesOfAlpha   = 0
     UniqGenes            = []
     TraitSetAtAlpha      = []
     for i in xrange(len(Traits)):
         GeneSetAtAlpha      = DataDict[Traits[i]]['GeneSetAtAlpha_'+str(Alpha)]
         NTotalGenesOfAlpha += len(GeneSetAtAlpha)
         UniqGenes.extend(GeneSetAtAlpha)
         if(len(GeneSetAtAlpha)>0):
             TraitSetAtAlpha.append(Traits[i])
     TraitSetAtAlpha  = scipy.array(TraitSetAtAlpha)
     GWIntersection   = scipy.intersect1d(ar1=TraitSetAtAlpha,
                                          ar2=GWSignTraits,
                                          assume_unique=False)
     GWMWIntersection = scipy.intersect1d(ar1=TraitSetAtAlpha,
                                          ar2=GWMWSignTraits,
                                          assume_unique=False)
     GWUnion          = scipy.union1d(ar1=TraitSetAtAlpha,
                                      ar2=GWSignTraits)
     GWMWUnion        = scipy.union1d(ar1=TraitSetAtAlpha,
                                      ar2=GWMWSignTraits)
     fw.write(str(Alpha)+'\t'+\
              str(NTotalGenesOfAlpha)+'\t'+\
              str(len(scipy.unique(scipy.array(UniqGenes))))+'\t'+\
              str(len(TraitSetAtAlpha))+'\t'+\
              str(len(GWSignTraits))+'\t'+\
              str(len(GWIntersection))+'\t'+\
              str(float(len(GWIntersection))/float(len(GWUnion)))+'\t'+\
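The intersection-over-union ratio computed above, len(GWIntersection)/len(GWUnion), is the Jaccard index of the two trait sets; a minimal sketch (illustrative, assuming the legacy `import scipy` numpy aliases the snippet uses):

def jaccard(a, b):
    """len(a & b) / len(a | b) for 1-D arrays or lists of labels."""
    inter = scipy.intersect1d(a, b)
    union = scipy.union1d(a, b)
    return float(len(inter)) / float(len(union))

print(jaccard(['t1', 't2'], ['t2', 't3']))  # 0.333...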
Example #21
def verify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    # [verified, info] = verify_exon_skip(event, fn_bam, cfg)

    # (0) valid, (1) exon_diff_cov, (2) exon_const_cov
    # (3) intron1_conf, (4) intron2_conf
    info = [1, 0, 0, 0, 0]
    verified = [0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0 
        return (verified, info)

    ### check validity of intron coordinates (only one side is differing)
    if (event.exons1[0, 1] != event.exons2[0, 1]) and (event.exons1[1, 0] != event.exons2[1, 0]):
        info[0] = 0 
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon11 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    if idx_exon11.shape[0] == 0:
        segs_exon11 = sp.where((segs.segments[0, :] >= event.exons1[0, 0]) & (segs.segments[1, :] <= event.exons1[0, 1]))[0]
    else:
        segs_exon11 = sp.where(segs.seg_match[idx_exon11, :])[1]
    idx_exon12 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    if idx_exon12.shape[0] == 0:
        segs_exon12 = sp.where((segs.segments[0, :] >= event.exons1[1, 0]) & (segs.segments[1, :] <= event.exons1[1, 1]))[0]
    else:
        segs_exon12 = sp.where(segs.seg_match[idx_exon12, :])[1]
    idx_exon21 = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    if idx_exon21.shape[0] == 0:
        segs_exon21 = sp.where((segs.segments[0, :] >= event.exons2[0, 0]) & (segs.segments[1, :] <= event.exons2[0, 1]))[0]
    else:
        segs_exon21 = sp.where(segs.seg_match[idx_exon21, :])[1]
    idx_exon22 = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    if idx_exon22.shape[0] == 0:
        segs_exon22 = sp.where((segs.segments[0, :] >= event.exons2[1, 0]) & (segs.segments[1, :] <= event.exons2[1, 1]))[0]
    else:
        segs_exon22 = sp.where(segs.seg_match[idx_exon22, :] > 0)[1]

    assert(segs_exon11.shape[0] > 0)
    assert(segs_exon12.shape[0] > 0)
    assert(segs_exon21.shape[0] > 0)
    assert(segs_exon22.shape[0] > 0)

    if sp.all(segs_exon11 == segs_exon21):
        seg_exon_const = segs_exon11
        seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
        seg_const = sp.intersect1d(segs_exon12, segs_exon22)
    elif sp.all(segs_exon12 == segs_exon22):
        seg_exon_const = segs_exon12
        seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
        seg_const = sp.intersect1d(segs_exon21, segs_exon11)
    else:
        print >> sys.stderr, "ERROR: both exons differ in alt prime event in verify_alt_prime"
        sys.exit(1)
    seg_const = sp.r_[seg_exon_const, seg_const]

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_diff_cov
    info[1] = sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    # exon_const_cov
    info[2] = sp.sum(counts_segments[seg_const] * seg_lens[seg_const]) / sp.sum(seg_lens[seg_const])

    if info[1] >= CFG['alt_prime']['min_diff_rel_cov'] * info[2]:
        verified[0] = 1

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf 
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon11[-1], segs_exon12[0]], segs.seg_edges.shape))[0]
    assert(idx.shape[0] > 0)
    info[3] = counts_edges[idx, 1]
    # intron2_conf 
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon21[-1], segs_exon22[0]], segs.seg_edges.shape))[0]
    assert(idx.shape[0] > 0)
    info[4] = counts_edges[idx, 1]

    if min(info[3], info[4]) >= CFG['alt_prime']['min_intron_count']:
        verified[1] = 1

    return (verified, info)
Example #22
def AssignLikelyContentsToSudokuGridLookupDictAndCheckPredictions(sudokuGridLookupDict, \
updatedColonyPurificationDict):

    from numpy import arange
    from scipy import intersect1d, unique
    import pdb
    from copy import deepcopy

    cpKeys = list(updatedColonyPurificationDict.keys())

    totalHopedForCorrect = 0
    totalPredictedWells = 0
    totalPredictionsCorrect = 0
    hopedForTransposonsInCollection = []

    # Go through the colony purification dict
    for key in cpKeys:
        line = updatedColonyPurificationDict[key]
        rowPool = line['rowPool']
        colPool = str(int(line['colPool']))
        pcPool = line['pcPool']
        prPool = line['prPool']

        # Takes the progenitor contents and expands them a little to allow for slop in
        # transposon position
        progenitorContentsExpanded = []
        progenitorContentsUnexpanded = []
        progenitorContentsUnexpandedLocatabilities = []
        i = 0

        while i < len(line['progenitorContents']):
            progCoord = line['progenitorContents'][i][0]
            progLocatability = line['progenitorContents'][i][1]
            progCoordExpanded = list(
                arange(progCoord - 1, progCoord + 2, 1, int))
            progenitorContentsExpanded += progCoordExpanded
            progenitorContentsUnexpanded.append(progCoord)
            progenitorContentsUnexpandedLocatabilities.append(progLocatability)
            i += 1

        hopedForCoord = int(line['hopedForTransposonCoord'])
        hopedForTransposonCoords = list(
            arange(hopedForCoord - 1, hopedForCoord + 2, 1, int))

        sudokuWell = sudokuGridLookupDict[prPool][pcPool].wellGrid[rowPool][
            colPool]
        sudokuWell.updateSimplifiedReadAlignmentCoords()
        sudokuWell.hasPredictionForContents = True
        sudokuWell.predictionsForContents = progenitorContentsExpanded
        sudokuWell.hopedForCoord = hopedForCoord
        sudokuWell.progenitorContents = progenitorContentsUnexpanded
        sudokuWell.progenitorLocatabilities = progenitorContentsUnexpandedLocatabilities
        sudokuWell.condensationType = line['condensationType']

        progenitorCol = line['progenitorCol']
        progenitorPlate = line['progenitorPlate']
        progenitorRow = line['progenitorRow']

        sudokuWell.addressDict['progenitor'] = {'plateName':progenitorPlate, \
        'row':progenitorRow, 'col':progenitorCol}

        # Intersect the predicted and hoped for transposon coords with the coords that intersect
        # with the well location

        intersectPredictionsContents = intersect1d(sudokuWell.predictionsForContents, \
        sudokuWell.simplifiedReadAlignmentCoords)

        intersectHopedForContents = intersect1d(hopedForTransposonCoords, \
        sudokuWell.simplifiedReadAlignmentCoords)

        # Make some new read alignment coords

        # Check to see if the predicted and hoped for coords are there

        if len(intersectPredictionsContents) > 0:
            sudokuWell.predictionCorrect = True

            groupedIntersectPredictionsContents = \
            GroupGenomicCoords(intersectPredictionsContents, maxGap=1)

            sudokuWell.simplifiedLikelyReadAlignmentCoords = groupedIntersectPredictionsContents

            totalPredictionsCorrect += 1

        if len(intersectHopedForContents) > 0:
            sudokuWell.hopedForPresent = True
            sudokuWell.predictionCorrect = True

            groupedIntersectHopedForContents = \
            GroupGenomicCoords(intersectHopedForContents, maxGap=1)

            sudokuWell.simplifiedLikelyReadAlignmentCoords = groupedIntersectHopedForContents

            hopedForTransposonsInCollection.append(hopedForCoord)
            totalHopedForCorrect += 1

        totalPredictedWells += 1

        newReadAlignmentCoords = []

        if sudokuWell.predictionCorrect == True:
            for likelyCoord in sudokuWell.simplifiedLikelyReadAlignmentCoords:
                for readAlignmentCoord in sudokuWell.readAlignmentCoords:
                    coord = readAlignmentCoord.coord
                    if likelyCoord - 3 <= coord <= likelyCoord + 3:
                        newReadAlignmentCoords.append(
                            deepcopy(readAlignmentCoord))

        if sudokuWell.predictionCorrect and len(newReadAlignmentCoords) == 0:
            print('No coordinates found for well with correct prediction')
            pdb.set_trace()

        sudokuWell.readAlignmentCoords = newReadAlignmentCoords

    hopedForTransposonsInCollection = unique(hopedForTransposonsInCollection)
    print('Total Predicted Wells: ' + str(totalPredictedWells))
    print('Total Hoped for Correct: ' + str(totalHopedForCorrect))
    print('Total Predictions Correct: ' + str(totalPredictionsCorrect))
    print('Total Unique Hoped For Transposons: ' +
          str(len(hopedForTransposonsInCollection)))

    return
Example #23
def AssignLikelyContentsToSudokuGridLookupDictAndCheckPredictions(sudokuGridLookupDict, \
updatedColonyPurificationDict):
	
	from numpy import arange
	from scipy import intersect1d, unique
	import pdb
	from copy import deepcopy
	
	cpKeys = list(updatedColonyPurificationDict.keys())

	totalHopedForCorrect = 0
	totalPredictedWells = 0
	totalPredictionsCorrect = 0
	hopedForTransposonsInCollection = []

	# Go through the colony purification dict
	for key in cpKeys:
		line = updatedColonyPurificationDict[key]
		rowPool = line['rowPool']
		colPool = str(int(line['colPool']))
		pcPool = line['pcPool']
		prPool = line['prPool']
		
		# Takes the progenitor contents and expands them a little to allow for slop in 
		# transposon position
		progenitorContentsExpanded = []
		progenitorContentsUnexpanded = []
		progenitorContentsUnexpandedLocatabilities = []
		i = 0

		while i < len(line['progenitorContents']):
			progCoord = line['progenitorContents'][i][0]
			progLocatability = line['progenitorContents'][i][1]
			progCoordExpanded = list(arange(progCoord - 1, progCoord + 2, 1, int))
			progenitorContentsExpanded += progCoordExpanded
			progenitorContentsUnexpanded.append(progCoord)
			progenitorContentsUnexpandedLocatabilities.append(progLocatability)
			i += 1
	
		hopedForCoord = int(line['hopedForTransposonCoord'])
		hopedForTransposonCoords = list(arange(hopedForCoord - 1, hopedForCoord + 2, 1, int))
	
		sudokuWell = sudokuGridLookupDict[prPool][pcPool].wellGrid[rowPool][colPool]
		sudokuWell.updateSimplifiedReadAlignmentCoords()
		sudokuWell.hasPredictionForContents = True
		sudokuWell.predictionsForContents = progenitorContentsExpanded
		sudokuWell.hopedForCoord = hopedForCoord
		sudokuWell.progenitorContents = progenitorContentsUnexpanded
		sudokuWell.progenitorLocatabilities = progenitorContentsUnexpandedLocatabilities
		sudokuWell.condensationType = line['condensationType']
		
		progenitorCol = line['progenitorCol']
		progenitorPlate = line['progenitorPlate']
		progenitorRow = line['progenitorRow']
		
		sudokuWell.addressDict['progenitor'] = {'plateName':progenitorPlate, \
		'row':progenitorRow, 'col':progenitorCol}
	
		# Intersect the predicted and hoped for transposon coords with the coords that intersect
		# with the well location
	
		intersectPredictionsContents = intersect1d(sudokuWell.predictionsForContents, \
		sudokuWell.simplifiedReadAlignmentCoords)
	
		intersectHopedForContents = intersect1d(hopedForTransposonCoords, \
		sudokuWell.simplifiedReadAlignmentCoords)
		
		
		# Make some new read alignment coords
			
		# Check to see if the predicted and hoped for coords are there
	
		if len(intersectPredictionsContents) > 0:
			sudokuWell.predictionCorrect = True
			
			groupedIntersectPredictionsContents = \
			GroupGenomicCoords(intersectPredictionsContents, maxGap=1)
			
			sudokuWell.simplifiedLikelyReadAlignmentCoords = groupedIntersectPredictionsContents
						
			totalPredictionsCorrect += 1
		
		if len(intersectHopedForContents) > 0:
			sudokuWell.hopedForPresent = True
			sudokuWell.predictionCorrect = True
			
			groupedIntersectHopedForContents = \
			GroupGenomicCoords(intersectHopedForContents, maxGap=1)
			
			sudokuWell.simplifiedLikelyReadAlignmentCoords = groupedIntersectHopedForContents
			
			hopedForTransposonsInCollection.append(hopedForCoord)
			totalHopedForCorrect += 1
			
		totalPredictedWells += 1
		
		newReadAlignmentCoords = []
		
		if sudokuWell.predictionCorrect == True:
			for likelyCoord in sudokuWell.simplifiedLikelyReadAlignmentCoords:
				for readAlignmentCoord in sudokuWell.readAlignmentCoords:
					coord = readAlignmentCoord.coord
					if likelyCoord - 3 <= coord <= likelyCoord + 3:
						newReadAlignmentCoords.append(deepcopy(readAlignmentCoord))
		
		if sudokuWell.predictionCorrect and len(newReadAlignmentCoords) == 0:
			print('No coordinates found for well with correct prediction')
			pdb.set_trace()
		
		sudokuWell.readAlignmentCoords = newReadAlignmentCoords

	hopedForTransposonsInCollection = unique(hopedForTransposonsInCollection)
	print('Total Predicted Wells: ' + str(totalPredictedWells))
	print('Total Hoped for Correct: ' + str(totalHopedForCorrect))
	print('Total Predictions Correct: ' + str(totalPredictionsCorrect))
	print('Total Unique Hoped For Transposons: ' + str(len(hopedForTransposonsInCollection)))
	
	return
Example #24
def load_data(CFG, is_Ens=True, gene_set='GOCB', het_only = True, het_onlyCB=True, pairs=False, filter_median = True, combine=False, filter_expressed = 0):
	f = h5py.File(CFG['train_file'],'r')
	Y = f['LogNcountsMmus'][:]
	labels = f['labels'][:].ravel()
	
	futil = h5py.File(CFG['util_file'],'r')
	Y_util = futil['LogNcountsQuartz'][:]
	
	ftst = h5py.File(CFG['test_file'],'r')
	if is_Ens ==True:
		genes = f['EnsIds'][:]
		genes_util = futil['gene_names_all'][:]
	else:
		genes = SP.char.lower(f['sym_names'][:])
		genes_util = SP.char.lower(futil['sym_namesQ'][:])

	#test file
	labels_util = futil['phase_vecS'][:]*2+futil['phase_vecG2M'][:]*3+futil['phase_vecG1'][:]
	if CFG['util_file']==CFG['test_file']:
		genes_tst = genes_util 
		YT = ftst['LogNcountsQuartz'][:]
		labels_tst = ftst['phase_vecS'][:]*2+ftst['phase_vecG2M'][:]*3+ftst['phase_vecG1'][:]
	elif is_Ens == False:
		ftst = h5py.File(CFG['test_file'],'r')
		YT = ftst['counts'][:]
		genes_tst = SP.char.lower(ftst['sym_names'][:])
		#genes_tst = ftst['ensIds'][:]
		#labels_tst = SP.array([1,1,1,1,1])#ftst['labels'][:].ravel() 
		labels_tst = ftst['labels'][:].ravel()
	elif is_Ens == True:
		ftst = h5py.File(CFG['test_file'],'r')
		YT = ftst['counts'][:]
		#genes_tst = ftst['sym_names'][:]
		genes_tst = ftst['ensIds'][:]
		#labels_tst = SP.array([1,1,1,1,1])#ftst['labels'][:].ravel() 
		labels_tst = ftst['labels'][:].ravel() 
	
	if 'class_labels' in ftst.keys():
		class_labels = ftst['class_labels'][:]
	else:
		class_labels = [i.astype('str') for i in labels_tst]
		class_labels = SP.sort(SP.unique(class_labels))
	heterogen_util = genes_util[SP.intersect1d(SP.where(Y_util.mean(0)>0)[0],SP.where(futil['genes_heterogen'][:]==1)[0])]
	heterogen_train = genes[SP.intersect1d(SP.where(Y.mean(0)>0)[0],SP.where(f['genes_heterogen'][:]==1)[0])]
	

	cellcyclegenes_GO = genes[SP.unique(f['cellcyclegenes_filter'][:].ravel() -1)] # idx of cell cycle genes
	cellcyclegenes_CB = genes[f['ccCBall_gene_indices'][:].ravel() -1]		# idx of cell cycle genes ...
	


	if SP.any(gene_set=='GOCB'):	
		cc_ens = SP.union1d(cellcyclegenes_GO,cellcyclegenes_CB)
	elif SP.any(gene_set=='GO'):
		cc_ens = cellcyclegenes_GO 
	elif SP.any(gene_set=='CB'):
		cc_ens = cellcyclegenes_CB 
	elif SP.any(gene_set=='all'):
		cc_ens = genes 
	else:
		#assert(gene_set in CFG.keys()), str(gene_set+' does not exist. Choose a different gene set.')
		cc_ens = gene_set 

	
	if het_only==True:
		cc_ens = SP.intersect1d(cc_ens, heterogen_train)
		if pairs==True:
			Y = Y[:,SP.where(f['genes_heterogen'][:]==1)[0]]
			genes = genes[SP.where(f['genes_heterogen'][:]==1)[0]]
	if het_onlyCB==True:
		cc_ens = SP.intersect1d(cc_ens, heterogen_util)
	
	#filter_expressed = .2
	lod = 0
	if filter_expressed>0: 
		medY = SP.sum(Y>lod,0)*1.0
		idx_filter = (medY/SP.float_(Y.shape[0]))>filter_expressed
		Y = Y[:,idx_filter]
		genes = genes[idx_filter]
		
		#medY_tst = SP.sum(Y_tst>lod,0)
		#Y_tst = Y_tst[:,medY_tst>filter_expressed]
		#genes_tst = genes_tst[medY_tst>filter_expressed]		
		
		medY_util = SP.sum(Y_util>lod,0)
		idx_filter = (medY_util/SP.float_(Y_util.shape[0]))>filter_expressed
		Y_util = Y_util[:,idx_filter]
		genes_util = genes_util[idx_filter]		
	
	cc_ens = SP.intersect1d(cc_ens, genes)
	cc_ens = SP.intersect1d(cc_ens, genes_tst)
	cc_ens = SP.intersect1d(cc_ens, genes_util)
		
	if combine==True:
		genes = list(genes)
		genes_util = list(genes_util)
		genes_intersect = SP.intersect1d(genes,genes_util)
		cidx_tr = [ genes.index(x) for x in genes_intersect ]
		cidx_util = [genes_util.index(x) for x in genes_intersect]	
		genes = SP.array(genes)[cidx_tr]
		genes_util = SP.array(genes_util)[cidx_util]
		Y = SP.vstack([Y[:,cidx_tr],Y_util[:,cidx_util]])
		genes = genes_intersect
		labels = SP.hstack([labels, labels_util])				


	Y_tst = YT
	cc_data = {}
	cc_data['cc_ens'] = cc_ens
	cc_data['labels_tst'] = labels_tst	
	cc_data['labels'] = labels
	cc_data['genes_tst'] = genes_tst 
	cc_data['genes'] = genes 
	cc_data['Y'] = Y 
	cc_data['Y_test'] = Y_tst 
	cc_data['class_labels'] = class_labels 
	return cc_data
Example #25
def multidim_intersect(arr1, arr2):
    arr1_view = arr1.view([('',arr1.dtype)]*arr1.shape[1])
    arr2_view = arr2.view([('',arr2.dtype)]*arr2.shape[1])
    intersected = sp.intersect1d(arr1_view, arr2_view)
    return intersected.view(arr1.dtype).reshape(-1, arr1.shape[1])
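Toy usage (illustrative; this is the same row-view trick as intersect_rows in Examples #4 and #19, assuming the legacy `import scipy as sp` aliases):

a = sp.array([[1, 2], [3, 4]])
b = sp.array([[3, 4], [5, 6]])
print(multidim_intersect(a, b))  # [[3 4]]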
Example #26
               'REM.'         :os.path.join(BASEDIR, 'gtex_tables/GTex_rest_wo_cells_samples.txt')}
gt_dict = utils.get_gt_dict(sample_dict)

for event_type in event_types:

    picklefile = '%s/pca_skl_%s.TN.conf_%.2f.pickle' % (datadir, event_type, 1.0 - conf)
    hdf5file = '%s/pca_skl_%s.TN.conf_%.2f.hdf5' % (datadir, event_type, 1.0 - conf)
    if not os.path.exists(picklefile):

        ### get indices of confident events 
        IN = h5py.File('%s/merge_graphs_%s_C3.counts.hdf5' % (basedir_icgc, event_type), 'r')
        c_idx = IN['conf_idx'][:].astype('int')
        IN.close()
        IN = h5py.File('%s/merge_graphs_%s_C3.counts.hdf5' % (basedir_gtex, event_type), 'r')
        c_idx_gt = IN['conf_idx'][:].astype('int') 
        c_idx_gt = sp.intersect1d(c_idx_gt, c_idx)
        c_idx = sp.intersect1d(c_idx, c_idx_gt)
        IN.close()
        assert sp.all(c_idx == c_idx_gt)

        ### load TCGA data from hdf5
        print('Loading data from TCGA hdf5')
        IN = h5py.File('%s/merge_graphs_%s_C3.counts.hdf5' % (basedir_icgc, event_type), 'r')
        c_idx = IN['conf_idx'][:].astype('int')
        strains = sp.array([x.split('.')[0] for x in IN['strains'][:]], dtype='str')

        ### get psi values
        psi = sp.empty((IN['psi'].shape[0], c_idx.shape[0]), dtype='float')
        chunksize = IN['psi'].chunks[1] * 30
        cum = 0
        for c, chunk in enumerate(range(0, IN['psi'].shape[1], chunksize)):
Example #27
def plot_overlap_ps(
        result_file,
        ss_file='/Users/bjarnivilhjalmsson/data/GIANT/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
        fig_filename='/Users/bjarnivilhjalmsson/data/tmp/manhattan_combPC_HGT.png',
        method='combPC',
        ylabel='Comb. PC (HIP,WC,HGT,BMI) $-log_{10}(P$-value$)$',
        xlabel='Height $-log_{10}(P$-value$)$',
        p_thres=0.00001):
    # Parse results and SS file
    res_table = pandas.read_table(result_file)
    ss_table = pandas.read_table(ss_file)
    # Parse
    res_sids = sp.array(res_table['SNPid'])
    if method == 'MVT':
        comb_ps = sp.array(res_table['pval'])
    elif method == 'combPC':
        comb_ps = sp.array(res_table['combPC'])
    if 'MarkerName' in ss_table.keys():
        ss_sids = sp.array(ss_table['MarkerName'])
    elif 'SNP' in ss_table.keys():
        ss_sids = sp.array(ss_table['SNP'])
    else:
        raise Exception("Don't know where to look for rs IDs")
    marg_ps = sp.array(ss_table['p'])

    # Filtering boring p-values
    res_p_filter = comb_ps < p_thres
    res_sids = res_sids[res_p_filter]
    comb_ps = comb_ps[res_p_filter]
    #     ss_p_filter = marg_ps<p_thres
    #     ss_sids = ss_sids[ss_p_filter]
    #     marg_ps = marg_ps[ss_p_filter]

    common_sids = sp.intersect1d(res_sids, ss_sids)
    print 'Found %d SNPs in common' % (len(common_sids))
    ss_filter = sp.in1d(ss_sids, common_sids)
    res_filter = sp.in1d(res_sids, common_sids)

    ss_sids = ss_sids[ss_filter]
    res_sids = res_sids[res_filter]
    marg_ps = marg_ps[ss_filter]
    comb_ps = comb_ps[res_filter]

    print 'Now sorting'
    ss_index = sp.argsort(ss_sids)
    res_index = sp.argsort(res_sids)

    marg_ps = -sp.log10(marg_ps[ss_index])
    comb_ps = -sp.log10(comb_ps[res_index])

    with plt.style.context('fivethirtyeight'):
        plt.plot(marg_ps, comb_ps, 'b.', alpha=0.2)
        (x_min, x_max) = plt.xlim()
        (y_min, y_max) = plt.ylim()

        plt.plot([x_min, x_max], [y_min, y_max], 'k--', alpha=0.2)
        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        plt.tight_layout()
        plt.savefig(fig_filename)
    plt.clf()
Example #28
                                                                 sample_size)]
            tmp[j, k] = sc.unique(sample).size

    #~ pdb.set_trace()
    alpha[i] = tmp.mean(axis=1)  # mean # spp per cell

    ## Per band, measure beta diversity
    tmp2 = sc.zeros((nrows, ncols))
    for j in range(nrows):
        #~ pdb.set_trace()
        for k in range(ncols):
            a = community[j, k][community[j, k] > 0]
            b = community[j, (k + 15) %
                          30][community[j, (k + 15) % 30] >
                              0]  # cell on opposite side of mountain to `a`
            shared_spp = sc.intersect1d(a, b)
            probability = shared_spp.size / cell_abundances[j]
            tmp2[j, k] = probability

    #~ pdb.set_trace()
    beta[i] = tmp2.mean(axis=1)

    gamma[i] = nspp_per_band[-10:].mean(axis=0)
    band_areas = cell_areas * T_theta
    gamma_area[i] = gamma[i] / band_areas

alpha = alpha[alpha.sum(axis=1) > 0]
beta = beta[beta.sum(axis=1) > 0]
# Delete rows with no data (file ID doesn't exist).

mean_alpha = alpha.mean(axis=0)
Example #29
def phenotype_correlations(request, q=None):
    """
    Return data for phenotype-phenotype correlations and between phenotype accession overlap
    ---

    produces:
        - application/json
    """
    #id string to list
    pids = map(int, q.split(","))
    pheno_dict = {}
    for i, pid in enumerate(pids):
        try:
            phenotype = Phenotype.objects.published().get(pk=pid)
        except:
            return Response({'message': 'FAILED', 'not_found': pid})
        pheno_acc_infos = phenotype.phenotypevalue_set.prefetch_related(
            'obs_unit__accession')
        values = sp.array(pheno_acc_infos.values_list('value', flat=True))
        samples = sp.array(
            pheno_acc_infos.values_list('obs_unit__accession__id', flat=True))
        name = str(
            phenotype.name.replace("<i>", "").replace("</i>", "") + " (" +
            str(phenotype.study.name) + ")")
        pheno_dict[str(phenotype.name) + "_" + str(phenotype.study.name) +
                   "_" + str(i)] = {
                       'samples': samples,
                       'y': values,
                       'name': name,
                       'id': str(phenotype.id)
                   }
        #str(phenotype.name) + "_" + str(phenotype.study.name) + "_" + str(i)}
    #compute correlation matrix
    corr_mat = sp.ones((len(pheno_dict), len(pheno_dict))) * sp.nan
    spear_mat = sp.ones((len(pheno_dict), len(pheno_dict))) * sp.nan
    pheno_keys = pheno_dict.keys()
    axes_data = []
    scatter_data = []
    sample_data = []
    slabels = {}
    for i, pheno1 in enumerate(pheno_keys):
        axes_data.append({
            "label": pheno_dict[pheno1]['name'],
            "index": str(i),
            "pheno_id": str(pheno_dict[pheno1]['id'])
        })
        samples1 = pheno_dict[pheno1]['samples']
        y1 = pheno_dict[pheno1]['y']
        #store scatter data
        scatter_data.append({
            "label": pheno_dict[pheno1]['name'],
            "pheno_id": str(pheno_dict[pheno1]['id']),
            "samples": samples1.tolist(),
            "values": y1.tolist()
        })

        for j, pheno2 in enumerate(pheno_keys):
            samples2 = pheno_dict[pheno2]['samples']
            y2 = pheno_dict[pheno2]['y']
            #match accessions
            ind = (sp.reshape(samples1,
                              (samples1.shape[0], 1)) == samples2).nonzero()
            y_tmp = y1[ind[0]]
            y2 = y2[ind[1]]
            if y1.shape[0] > 0 and y2.shape[0] > 0:
                corr_mat[i][j] = stats.pearsonr(y_tmp.flatten(),
                                                y2.flatten())[0]
                spear_mat[i][j] = stats.spearmanr(y_tmp.flatten(),
                                                  y2.flatten())[0]
            #compute sample intersections
            if pheno1 == pheno2:
                continue
            if pheno1 + "_" + pheno2 in slabels:
                continue
            if pheno2 + "_" + pheno1 in slabels:
                continue
            slabels[pheno1 + "_" + pheno2] = True
            A = samples1.shape[0]
            B = samples2.shape[0]
            C = sp.intersect1d(samples1, samples2).shape[0]
            sample_data.append({
                "labelA": pheno_dict[pheno1]['name'],
                "labelA_id": pheno_dict[pheno1]['id'],
                "labelB": pheno_dict[pheno2]['name'],
                "labelB_id": pheno_dict[pheno2]['id'],
                "A": A,
                "B": B,
                "C": C
            })
    data = {}
    data['axes_data'] = axes_data
    data['scatter_data'] = scatter_data
    data['sample_data'] = sample_data
    data['corr_mat'] = str(corr_mat.tolist()).replace("nan", "NaN")
    data['spear_mat'] = str(spear_mat.tolist()).replace("nan", "NaN")

    if request.method == "GET":
        return Response(data)
Example #30
		os.makedirs(run_dir)

	#load data
	f = h5py.File(CFG['data_file'],'r')
	Y = f['LogNcountsQuartz'][:]
	tech_noise = f['LogVar_techQuartz_logfit'][:]
	genes_het_bool=f['genes_heterogen'][:]	 # index of heterogeneous(??!??) genes
	geneID = f['gene_names_all'][:]			# gene names
	cellcyclegenes_filter = SP.unique(f['ccGO_gene_indices'][:].ravel() -1) # idx of cell cycle genes
	cellcyclegenes_filterCB600 = f['ccCBall_gene_indices'][:].ravel() -1		# idx of cell cycle genes ...
   

	# filter cell cycle genes
	idx_cell_cycle = SP.union1d(cellcyclegenes_filter,cellcyclegenes_filterCB600)
	Ymean2 = Y.mean(0)**2>0
	idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,SP.array(SP.where(Ymean2.ravel()>0)))
	Ycc = Y[:,idx_cell_cycle_noise_filtered]
	
	#Fit GPLVM to data 
	k = 1					 # number of latent factors
	file_name = CFG['panama_file']# name of the cache file
	recalc = True # recalculate X and Kconf
	sclvm = scLVM(Y)
	X,Kcc,varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=1,out_dir='./cache',file_name=file_name,recalc=recalc)

	#3. load relevant dataset for analysis
	genes_het=SP.array(SP.where(f['genes_heterogen'][:].ravel()==1))

   # considers only heterogeneous genes
	Ihet = genes_het_bool==1
	Y	= Y[:,Ihet]

fr = open("Yoshiko.csv", "r")
YoshikoClusterDict = {}
YoshikoClusters = []
fr.readline()
for Line in fr:
    LSplit = Line.strip().split(",")
    YoshikoClusterDict[LSplit[0]] = LSplit[-1].split(";")
    YoshikoClusters.append(LSplit[0])
fr.close()

fw = open("MtbClusterOverlappingGenes.csv", "w")
fw.write("ClusterIndex,OverlappingGenes,Metabolites\n")
for Cluster in YoshikoClusters:
    ClusterGeneDict = {}
    for Mtb in YoshikoClusterDict[Cluster]:
        ClusterGeneDict[Mtb] = HeinzGeneDict[Mtb]
    Cntr = 0
    InterSectionArray = None
    CurrentArray = None
    for Value in ClusterGeneDict.itervalues():
        if Cntr == 0:
            CurrentArray = Value
        InterSectionArray = scipy.intersect1d(CurrentArray, Value)
        CurrentArray = InterSectionArray  # carry the running intersection forward
        Cntr += 1
    fw.write(
        Cluster + "," + ";".join(InterSectionArray.tolist()) + "," + ";".join(YoshikoClusterDict[str(Cluster)]) + "\n"
    )
fw.close()
Example #32
def coordinate_genotypes_ss_w_ld_ref(genotype_file = None,
                                    reference_genotype_file = None,
                                    hdf5_file = None,
                                    genetic_map_dir=None,
                                    check_mafs=False,
                                    min_maf=0.01):
#   recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating things w genotype file: %s \nref. genot. file: %s'%(genotype_file, reference_genotype_file) 
    plinkf = plinkfile.PlinkFile(genotype_file)
    
    #Loads only the individuals... (I think?)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    
    unique_phens = sp.unique(Y)
    if len(unique_phens)==1:
        print 'Unable to find phenotype values.'
        has_phenotype=False
    elif len(unique_phens)==2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins)==2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases'%(cc_bins[0], cc_bins[1])
        has_phenotype=True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype=True

    #Figure out chromosomes and positions.  
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci] 

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
#     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)
    
    #Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    #corr_list = []
    
    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0    
    tot_num_non_matching_nts = 0
   
    #Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g':[], 'rg':[], 'ss':[]}
        
        chr_str = 'chrom_%d'%chrom
        print 'Working on chromosome: %s'%chr_str
        
        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.'%(len(g_sids), len(rg_sids), len(ss_sids))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets'%(len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []
        
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid]=i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid]=i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid]=i
            
        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])
        
        #order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        #order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        #Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])
        
        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])
            
        
        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
#         rg_nts_l = []
#         for nt in rg_nts_ok:
#             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
#         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map])/2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map])/2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok)/2.0
        print 'Nucleotide concordance counts out of %d genotypes: g-rg: %d, g-ss: %d, rg-ss: %d'%(len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count)
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count


        num_non_matching_nts = 0
        num_ambig_nts = 0


        #Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):
            
            #To make sure, is the SNP id the same?
            assert g_sids[g_i]==rg_sids[rg_i]==ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
            
            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
#             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            #Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0],g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts +=1
                tot_num_non_matching_nts += 1                
                continue
            
            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1                
                continue
            
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    #Try flipping the SS nt
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]                        
                        log_odds[ss_i] = -log_odds[ss_i]    
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1-ss_freqs[ss_i]
                    else:
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                    
                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                    # Opposite strand nucleotides
            
           
            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)
#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)                

                        
        #print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts 
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts 
        print '%d SNPs were retained on chromosome %d.' % (len(ok_indices['g']), chrom)

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
#         order = sp.argsort(positions)
#         sorted_positions = positions[order]
#         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
#         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
#         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        
        #Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']] #Pinpoint where the SNPs are in the file.
        raw_snps,freqs = _parse_plink_snps_(genotype_file, snp_indices)
        
        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[ok_indices['rg']] #Pinpoint where the SNPs are in the file.
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)
        
        
        snp_stds_ref = sp.sqrt(2*freqs_ref*(1-freqs_ref)) 
        snp_means_ref = freqs_ref*2

        snp_stds = sp.sqrt(2*freqs*(1-freqs)) 
        snp_means = freqs*2
        
        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)#[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #For debugging...
#         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
#         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
#         ss_sids = ssg['sids'][...][ok_indices['ss']]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'
        
        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs-(1-freqs))>0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample'%sp.sum(freq_discrepancy_snp)
#                 print freqs[freq_discrepancy_snp]
#                 print ss_freqs[freq_discrepancy_snp]
                 
                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)  # logical not; negative() on booleans is arithmetic
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                #For debugging...
#         if sp.any(freq_discrepancy_snp):
#             g_sids = g_sids[ok_freq_snps]
#             rg_sids = rg_sids[ok_freq_snps]
#             ss_sids = ss_sids[ok_freq_snps]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        
        
        #Filter minor allele frequency SNPs.
        maf_filter = (freqs>min_maf)*(freqs<(1-min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum<=n_snps, "WTF?"
        if sp.sum(maf_filter)<n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
#         if sp.sum(maf_filter)<n_snps:
#             g_sids = g_sids[maf_filter]
#             rg_sids = rg_sids[maf_filter]
#             ss_sids = ss_sids[maf_filter]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'
        
        
        
        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)

        genetic_map = []
        if genetic_map_dir is not None:
            sid_set = set(sids)  # IDs of the SNPs retained on this chromosome (otherwise undefined here)
            with gzip.open(genetic_map_dir+'chr%d.interpolated_genetic_map.gz'%chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])
        
        
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
#         print 'Sum betas', sp.sum(betas ** 2)
        #ofg.create_dataset('prs', data=prs)
        
        
        #risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
Example #33
    strains_tcga_short = strains_tcga_short[lkidx]
    ctypes = ctypes[lkidx]
    tcga_is_tumor = tcga_is_tumor[lkidx]
else:
    lkidx = sp.arange(strains_tcga.shape[0])

print 'loading data for GTEx'
IN_GT = h5py.File(os.path.join(paths.basedir_as_gtex, 'spladder', 'genes_graph_conf%i.merge_graphs.validated.count.hdf5' % CONF), 'r')
gids_gtex =  IN_GT['gene_ids_edges'][:, 0]
gnames_gtex = IN_GT['gene_names'][:]
strains_gtex = IN_GT['strains'][:]
gtypes = sp.array([gt_dict[x.split('.')[0]] if x.split('.')[0] in gt_dict else 'NA' for x in strains_gtex], dtype='str')

gid_names_tcga = sp.array([gnames_tcga[i] for i in gids_tcga], dtype='str')
gid_names_gtex = sp.array([gnames_gtex[i] for i in gids_gtex], dtype='str')
gid_names_common = sp.intersect1d(gid_names_tcga, gid_names_gtex)
kidx_tcga = sp.where(sp.in1d(gid_names_tcga, gid_names_common))[0]
kidx_gtex = sp.where(sp.in1d(gid_names_gtex, gid_names_common))[0]
gids_tcga = gids_tcga[kidx_tcga]
gids_gtex = gids_gtex[kidx_gtex]

if not os.path.exists(os.path.join(paths.basedir_tss, 'tss_size_factors%s%s.cpickle' % (wl_tag, fl_tag))):

    ### compute total edge count for GTEx samples
    print 'Computing total edge count for GTEx samples'
    ### get gene intervals
    s_idx = sp.argsort(gids_gtex, kind='mergesort')
    _, f_idx = sp.unique(gids_gtex[s_idx], return_index=True)
    l_idx = sp.r_[f_idx[1:], gids_gtex.shape[0]]
    ### get counts
    genecounts_gtex = sp.zeros((f_idx.shape[0], IN_GT['edges'].shape[1]), dtype='int')
Example #34
from barplot import *
from default import *

from IPython.display import Latex

data = '/home/zhen/box/Manuscript/Tarik/Tarik2.h5f'
f = h5py.File(data,'r')
Y = f['Y'][:]                 # gene expression matrix
tech_noise = f['tech_noise'][:]       # technical noise
genes_het_bool=f['genes_het_bool'][:]     # index of heterogeneous genes
geneID = f['gene_names'][:]            # gene names
idx_cell_cycle = f['idx_cellcyclegenes'][:]

# determine non-zero counts
idx_nonzero = SP.nonzero((Y.mean(0)**2)>0)[0]
idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,idx_nonzero)
# subset gene expression matrix
Ycc = Y[:,idx_cell_cycle_noise_filtered]

k = 20                   # number of latent factors
out_dir = '/home/zhen/box/Manuscript/Tarik/cache'      # folder where results are cached
file_name = 'Kcc.hdf5'    # name of the cache file
recalc = True             # recalculate X and Kconf
use_ard = True            # use automatic relevance detection
sclvm = scLVM(Y)
#Fit model with k=20 factors
X_ARD,Kcc_ARD,varGPLVM_ARD = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,
                                            k=k,
                                            out_dir=out_dir,
                                            file_name=file_name,
                                            recalc=recalc,
                                            use_ard=use_ard)  # assumption: the truncated call closes with the use_ard flag defined above
Example #35
            sd_segs = scipy.std(segs, 0)

            for i in xrange(lsegs):

                R = scipy.argmax(segs[i])

                if R != 200:
                    outliers.append(i)
                elif max(segs[i]) > M:
                    outliers.append(i)
                elif min(segs[i]) < m:
                    outliers.append(i)
                else:
                    up = pylab.find(segs[i] > mean_curve - sd_segs)
                    down = pylab.find(segs[i] < mean_curve + sd_segs)
                    ins = len(scipy.intersect1d(up, down))

                    if ins < 540:
                        outliers.append(i)

            goods = list(alls - set(outliers))

            if goods != []:
                ax1.plot(segs[goods].T, 'k')
                ax1.axis('tight')
                ax1.grid()
            if outliers != []:
                ax2.plot(segs[outliers].T, 'r')
                ax2.axis('tight')
                ax2.grid()
            fig.savefig('falc_temp/temp/output-%d.png' % rid)
                ### annotated
                counts = []
                curr_xloc = []
                for i, t in enumerate(dsets_plot):
                    if p[0] == 'ALL':
                        t_idx = sp.arange(tids[t].shape[0])
                    else:
                        if t == 'gt':
                            t_idx = sp.where(tids[t] == p[1])[0]
                        else:
                            t_idx = sp.where(tids[t] == p[0])[0]
                    if count[(t, p)] == 0:
                        counts.append(0)
                    else:
                        counts.append(
                            sp.intersect1d(anno_idx,
                                           count[(t, p)][thresh]).shape[0])
                    curr_xloc.append(j * len(dsets_plot) + i + buff)
                    if t == 'tn':
                        labels.append(label_dict[t][p[0]] + '\nN=%i' %
                                      (sp.sum(~is_tumor[t_idx])))
                    elif t == 'tc':
                        labels.append(label_dict[t][p[0]] + '\nN=%i' %
                                      (sp.sum(is_tumor[t_idx])))
                    else:
                        labels.append(label_dict[t][p[0]] + '\nN=%i' %
                                      (t_idx.shape[0]))

                ax.bar(curr_xloc, counts, 0.5, color=colors[p[0]])

                ### not annotated
                counts2 = []
Example #37
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict,
                        validation_genotype_file=None,
                        genetic_map_dir=None,
                        min_maf=0.01,
                        skip_coordination=False, 
                        max_freq_discrep = 0.15,
                        debug=False):
    
    summary_dict[3.9]={'name':'dash', 'value':'Coordination'}
    t0 = time.time()
    if validation_genotype_file is not None:
        print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).')
    else:
        print('Coordinating datasets (Summary statistics and LD reference genotypes).')
        
    plinkf = plinkfile.PlinkFile(reference_genotype_file)

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing plinkf_dict_val reference genotypes')
    loci = plinkf.get_loci()
    plinkf.close()
    summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)}
    summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)}
    gf_chromosomes = [l.chromosome for l in loci]
    
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)
    
    if validation_genotype_file is not None:
        if debug:
            print('Parsing LD validation bim file')
        plinkf_val = plinkfile.PlinkFile(validation_genotype_file)

        # Loads only the individuals... 
        plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val)
        
        loci_val = plinkf_val.get_loci()
        plinkf_val.close()
        summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)}

        chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes)

        # Open HDF5 file and prepare out data
        assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.'
        if plinkf_dict_val['has_phenotype']:
            hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes'])
            summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']}
   
        hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype))
        hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype))

        maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs'])

    
    # Now summary statistics
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    num_common_snps = 0
    # corr_list = []



    chromosomes_found = set()
    num_snps_common_before_filtering =0
    num_snps_common_after_filtering =0
    tot_num_non_matching_nts = 0
    tot_num_non_supported_nts = 0
    tot_num_ambig_nts = 0
    tot_num_freq_discrep_filtered_snps = 0
    tot_num_maf_filtered_snps = 0
    tot_g_ss_nt_concord_count = 0
    if validation_genotype_file is not None:
        tot_g_vg_nt_concord_count = 0
        tot_vg_ss_nt_concord_count = 0
        
    # Now iterate over chromosomes
    chrom_i = 0
    for chrom in chromosomes:
        chrom_i +=1
        if not debug:
            sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1))))
            sys.stdout.flush()            
        try:
            chr_str = 'chrom_%d' % chrom
            ssg = ssf[chr_str]
                    
        except Exception as err_str:
            print(err_str)
            print('Did not find chromosome %d in SS dataset.'%chrom)
            print('Continuing.')
            continue
        
        if debug:
            print('Coordinating data for chromosome %s' % chr_str)

        chromosomes_found.add(chrom)
        
        #Get summary statistics chromosome group
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if validation_genotype_file is not None:
            chrom_d_val = chr_dict_val[chr_str]
            vg_sids = chrom_d_val['sids']
            common_sids = sp.intersect1d(ss_sids, vg_sids)
            
            # A map from sid to index for validation data        
            vg_sid_dict = {}
            for i, sid in enumerate(vg_sids):
                vg_sid_dict[sid] = i
        else:
            common_sids = ss_sids

        # A map from sid to index for summary stats        
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        #The indices to retain for the LD reference genotypes
        chrom_d = chr_dict[chr_str]
        g_sids = chrom_d['sids']
        common_sids = sp.intersect1d(common_sids, g_sids)
        
        # A map from sid to index for LD reference data        
        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        if debug:
            print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))
            print('Ordering SNPs by genomic positions (based on LD reference genotypes).')
        
        g_snp_map = []
        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])
        # order by positions (based on LD reference file)
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)

        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]


        # Get the ordered sum stats SNPs indices.
        ss_snp_map = []
        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])


        # Get the ordered validation SNPs indices
        if validation_genotype_file is not None:
            vg_snp_map = []
            for sid in common_sids:
                vg_snp_map.append(vg_sid_dict[sid])
            vg_nts = sp.array(chrom_d_val['nts'])
            vg_nts_ok = sp.array(vg_nts)[vg_snp_map]


        g_nts = sp.array(chrom_d['nts'])
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        if validation_genotype_file is not None:
            vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0
            g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0
            if debug:
                print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count))
            tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count
            tot_g_vg_nt_concord_count += g_vg_nt_concord_count
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        if debug:
            print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count))

        num_freq_discrep_filtered_snps = 0
        num_non_matching_nts = 0
        num_non_supported_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        ok_indices = {'g': [], 'ss': []}
        if validation_genotype_file is not None:
            ok_indices['vg']=[]

        #Now loop over SNPs to coordinate nucleotides.        
        if validation_genotype_file is not None:
            for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map):
    
                # To make sure, is the SNP id the same?
                assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
    
                g_nt = g_nts[g_i]
                if not skip_coordination:
    
                    vg_nt = vg_nts[vg_i]
                    ss_nt = ss_nts[ss_i]
    
                    # Is the nucleotide ambiguous.
                    g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                    if tuple(g_nt) in util.ambig_nts:
                        num_ambig_nts += 1
                        continue
    
                    # First check if nucleotide is sane?
                    if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                        num_non_supported_nts += 1
                        continue
    
                    os_g_nt = sp.array(
                        [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
    
                    flip_nts = False
                    
                    #Coordination is a bit more complicated when validation genotypes are provided..
                    if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))):
                        if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt):
                            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                            # Try flipping the SS nt
                            if flip_nts:
                                betas[ss_i] = -betas[ss_i]
                                log_odds[ss_i] = -log_odds[ss_i]
                                if 'freqs' in ssg:
                                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                            else:
                                if debug:
                                    print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                          (g_sids[g_i], ss_sids[ss_i], g_i,
                                           ss_i, str(g_nt), str(ss_nt)))
                                num_non_matching_nts += 1
                                continue
    
                        else:
                            num_non_matching_nts += 1
                            continue
                            # Opposite strand nucleotides
    
                # everything seems ok.
                ok_indices['g'].append(g_i)
                ok_indices['vg'].append(vg_i)
                ok_indices['ss'].append(ss_i)
    
                ok_nts.append(g_nt)
        else:
            for g_i, ss_i in zip(g_snp_map, ss_snp_map):
    
                # To make sure, is the SNP id the same?
                assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
    
                g_nt = g_nts[g_i]
                if not skip_coordination:
    
                    ss_nt = ss_nts[ss_i]
    
                    # Is the nucleotide ambiguous.
                    g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                    if tuple(g_nt) in util.ambig_nts:
                        num_ambig_nts += 1
                        continue
    
                    # First check if nucleotide is sane?
                    if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                        num_non_supported_nts += 1  # count as unsupported, matching the summary tallies
                        continue
    
                    os_g_nt = sp.array(
                        [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
    
                    flip_nts = False
                    
                    #Without validation genotypes, only the summary-statistics alleles need to agree..
                    if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                        flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                            os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                        
                        # Try flipping the SS nt
                        if flip_nts:
                            betas[ss_i] = -betas[ss_i]
                            log_odds[ss_i] = -log_odds[ss_i]
                            if 'freqs' in ssg and ss_freqs[ss_i]>0:
                                ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i]
                        else:
                            if debug:
                                print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                      (g_sids[g_i], ss_sids[ss_i], g_i,
                                       ss_i, str(g_nt), str(ss_nt)))
                            num_non_matching_nts += 1
                            continue
                   
                # everything seems ok.
                ok_indices['g'].append(g_i)
                ok_indices['ss'].append(ss_i)
                ok_nts.append(g_nt)
                
        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)

        
        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices)
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  
        log_odds = log_odds[ok_indices['ss']]  

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        #Parse validation genotypes, if available
        if validation_genotype_file is not None:
            snp_indices_val = sp.array(chrom_d_val['snp_indices'])
            # Pinpoint where the SNPs are in the file.
            snp_indices_val = snp_indices_val[ok_indices['vg']]
            raw_snps_val, freqs_val = plinkfiles.parse_plink_snps(
                validation_genotype_file, snp_indices_val)
    
            snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val))
            snp_means_val = freqs_val * 2

        # Check SNP frequencies, screen for possible problems..
        if max_freq_discrep<1 and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,sp.absolute(ss_freqs + freqs-1) < max_freq_discrep) #Array of np.bool values
            ok_freq_snps = sp.logical_or(ok_freq_snps,ss_freqs<=0) #Only consider SNPs that actually have frequencies
            num_freq_discrep_filtered_snps = len(ok_freq_snps)- sp.sum(ok_freq_snps)
            assert num_freq_discrep_filtered_snps>=0, "Problems when filtering SNPs with frequency discrepancies"
            if num_freq_discrep_filtered_snps>0:
                # Filter freq_discrepancy_snps
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                if validation_genotype_file is not None:
                    raw_snps_val = raw_snps_val[ok_freq_snps]
                    snp_stds_val = snp_stds_val[ok_freq_snps]
                    snp_means_val = snp_means_val[ok_freq_snps]
                    freqs_val = freqs_val[ok_freq_snps]
            if debug:
                print('Filtered %d SNPs due to frequency discrepancies'%num_freq_discrep_filtered_snps)

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        num_maf_filtered_snps = len(maf_filter)-sp.sum(maf_filter)
        assert num_maf_filtered_snps>=0, "Problems when filtering SNPs with low minor allele frequencies"
        if num_maf_filtered_snps>0:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            if validation_genotype_file is not None:
                raw_snps_val = raw_snps_val[maf_filter]
                snp_stds_val = snp_stds_val[maf_filter]
                snp_means_val = snp_means_val[maf_filter]
                freqs_val = freqs_val[maf_filter]
            if debug:
                print('Filtered %d SNPs due to low MAF'%num_maf_filtered_snps)

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
#                     if l[0] in sid_set:
#                         genetic_map.append(l[0])
        else:
            genetic_map = None

        coord_data_dict = {'chrom': 'chrom_%d' % chrom, 
                           'raw_snps_ref': raw_snps, 
                           'snp_stds_ref': snp_stds, 
                           'snp_means_ref': snp_means, 
                           'freqs_ref': freqs,
                           'ps': ps,
                           'positions': positions,
                           'nts': nts,
                           'sids': sids,
                           'genetic_map': genetic_map,
                           'betas': betas,
                           'log_odds': log_odds}
        if validation_genotype_file is not None:
            maf_adj_prs = sp.dot(log_odds, raw_snps_val)
            if debug and plinkf_dict_val['has_phenotype']:
                maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1]
                print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))
            coord_data_dict['raw_snps_val']=raw_snps_val
            coord_data_dict['snp_stds_val']=snp_stds_val
            coord_data_dict['snp_means_val']=snp_means_val
            coord_data_dict['freqs_val']=freqs_val
            coord_data_dict['log_odds_prs']=maf_adj_prs
            maf_adj_risk_scores += maf_adj_prs
         
         
        write_coord_data(cord_data_g, coord_data_dict, debug=debug)
        if debug:
            print('%d SNPs were retained on chromosome %d.' % (len(sids), chrom))
        
        
        num_snps_common_before_filtering += len(common_sids)
        num_snps_common_after_filtering += len(sids)
        tot_num_ambig_nts += num_ambig_nts
        tot_num_non_supported_nts += num_non_supported_nts
        tot_num_non_matching_nts += num_non_matching_nts
        tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps
        tot_num_maf_filtered_snps += num_maf_filtered_snps

    if not debug:
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
        sys.stdout.flush()                        


    # Now calculate the prediction r^2
    if validation_genotype_file:
        if debug and plinkf_dict_val['has_phenotype']:
            maf_adj_corr = sp.corrcoef(
                plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1]
            print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2))
            print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count))
    else:
        if debug:
            print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count))        
    
    summary_dict[7]={'name':'Num chromosomes used:','value':len(chromosomes_found)}
    summary_dict[8]={'name':'SNPs common across datasets:','value':num_snps_common_before_filtering}
    summary_dict[9]={'name':'SNPs retained after filtering:','value':num_snps_common_after_filtering}
    if tot_num_ambig_nts>0:
        summary_dict[10]={'name':'SNPs w ambiguous nucleotides filtered:','value':tot_num_ambig_nts}
    if tot_num_non_supported_nts>0:
        summary_dict[10.1]={'name':'SNPs w unknown/unsupported nucleotides filtered:','value':tot_num_non_supported_nts}
    if tot_num_non_matching_nts>0:
        summary_dict[11]={'name':'SNPs w other nucleotide discrepancies filtered:','value':tot_num_non_matching_nts}
    if min_maf>0:
        summary_dict[12]={'name':'SNPs w MAF<%0.3f filtered:'%min_maf,'value':tot_num_maf_filtered_snps}
    if max_freq_discrep<0.5:
        summary_dict[13]={'name':'SNPs w allele freq discrepancy > %0.3f filtered:'%max_freq_discrep,'value':tot_num_freq_discrep_filtered_snps}

    t1 = time.time()
    t = (t1 - t0)
    summary_dict[13.9]={'name':'dash', 'value':'Running times'}
    summary_dict[15]={'name':'Run time for coordinating datasets:','value': '%d min and %0.2f sec'%(t / 60, t % 60)}
Example #38
	for iph in range(Y.shape[0]):
		for jph in range(Y.shape[0]):
			if SP.bitwise_and(phase_vec[iph]==phase_vec[jph], phase_vec[iph]==3):
				KG2M[iph,jph]=1

	#intra-phase variations in cell size
	sfCellSize = SP.log10(f['ratioEndo'][:])
	sfCellSize -= sfCellSize.mean()
	sfCellSize = sfCellSize.reshape(1,sfCellSize.shape[0])
	Ksize = SP.dot(sfCellSize.transpose(), sfCellSize)
	Ksize /= Ksize.diagonal().mean() 

	# filter cell cycle genes
	idx_cell_cycle = SP.union1d(cellcyclegenes_filter,cellcyclegenes_filterCB600)
	Ymean2 = Y.mean(0)**2>0
	idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle,SP.array(SP.where(Ymean2.ravel()>0)))
	Ycc = Y[:,idx_cell_cycle_noise_filtered]
	
	#Fit GPLVM to data 
	k = 1					 # number of latent factors
	file_name = CFG['panama_file']# name of the cache file
	recalc = True # recalculate X and Kconf
	sclvm = scLVM(Y)
	X,Kcc,varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered,k=1,out_dir='./cache',file_name=file_name,recalc=recalc)

	#3. load relevant dataset for analysis
	genes_het=SP.array(SP.where(f['genes_heterogen'][:].ravel()==1))
	tech_noise=f['LogVar_techMmus'][:]

   # considers only heterogeneous genes
def main():
    figs = dict()
    figs['stats'] = plt.figure(figsize=(12, 8))
    figs['stats_log'] = plt.figure(figsize=(12, 8))
    figs['stats_full'] = plt.figure(figsize=(12, 8))
    figs['stats_full_log'] = plt.figure(figsize=(12, 8))
    gss = dict()
    gss['stats'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0)
    gss['stats_log'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0)
    gss['stats_full'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0)
    gss['stats_full_log'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0)

    for e, event_type in enumerate(event_types):

        print('Handling %s' % event_type, file=sys.stderr)
        
        ### load events detected in annotation only
        anno = pickle.load(open(os.path.join(BASEDIR_ANNO, 'merge_graphs_%s_C%i.pickle' % (event_type, CONF)), 'rb'))
        if isinstance(anno, tuple):
            anno = anno[0]

        ### load annotation index
        is_anno_gtex = pickle.load(open(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.anno_only.pickle' % (event_type, CONF)), 'rb'))
        is_anno_icgc_t = pickle.load(open(os.path.join(BASEDIR_ICGC_T, 'merge_graphs_%s_C%i.anno_only.pickle' % (event_type, CONF)), 'rb'))
        is_anno_icgc_n = pickle.load(open(os.path.join(BASEDIR_ICGC_N, 'merge_graphs_%s_C%i.anno_only.pickle' % (event_type, CONF)), 'rb'))

        ### load confident events
        IN = h5py.File(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r')
        idx_conf_gtex = IN['conf_idx'][:]
        IN.close()
        IN = h5py.File(os.path.join(BASEDIR_ICGC_T, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r')
        idx_conf_icgc_t = IN['conf_idx'][:]
        IN.close()
        IN = h5py.File(os.path.join(BASEDIR_ICGC_N, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r')
        idx_conf_icgc_n = IN['conf_idx'][:]
        IN.close()

        ### load filtered events
        #IN = h5py.File(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.counts.r10_s50_V10.hdf5' % (event_type, CONF)), 'r')
        #idx_filt_gtex = IN['filter_idx'][:]
        #IN.close()
        #IN = h5py.File(os.path.join(BASEDIR_ICGC, 'merge_graphs_%s_C%i.counts.r10_s50_V10.hdf5' % (event_type, CONF)), 'r')
        #idx_filt_icgc = IN['filter_idx'][:]
        #IN.close()

        ### load psi filtered events
        idx_psi_gtex = pickle.load(open(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.counts.hdf5.psi_filt.pickle' % (event_type, CONF)), 'rb'))[1]
        idx_psi_icgc_t = pickle.load(open(os.path.join(BASEDIR_ICGC_T, 'merge_graphs_%s_C%i.counts.hdf5.psi_filt.pickle' % (event_type, CONF)), 'rb'))[1]
        idx_psi_icgc_n = pickle.load(open(os.path.join(BASEDIR_ICGC_N, 'merge_graphs_%s_C%i.counts.hdf5.psi_filt.pickle' % (event_type, CONF)), 'rb'))[1]

        ### plot stats for normal counts (FULL)
        ax = figs['stats_full'].add_subplot(gss['stats_full'][e // 3, e % 3])
        xlabels_full = ['detected', 'confident']
        xlabels_part = ['confident']
        xlabels_full.extend(['dpsi > %.1f' % _ for _ in sorted(idx_psi_gtex.keys())])
        xlabels_part.extend(['dpsi > %.1f' % _ for _ in sorted(idx_psi_gtex.keys())])
        # all confirmed events, further filtered by PSI - GTEX
        data1_gtex = [is_anno_gtex.shape[0], idx_conf_gtex.shape[0]]
        data1_gtex.extend([sp.intersect1d(idx_conf_gtex, idx_psi_gtex[_]).shape[0] for _ in sorted(idx_psi_gtex.keys())])
        data1_gtex = sp.array(data1_gtex)
        lg, = ax.plot(sp.arange(data1_gtex.shape[0]), data1_gtex, '-b', label='GTEx')
        # all annotated confirmed events, further filtered by PSI - GTEX
        data2_gtex = [sp.sum(is_anno_gtex), sp.sum(is_anno_gtex[idx_conf_gtex])]
        data2_gtex.extend([sp.sum(is_anno_gtex[sp.intersect1d(idx_conf_gtex, idx_psi_gtex[_])]) for _ in sorted(idx_psi_gtex.keys())])
        data2_gtex = sp.array(data2_gtex)
        lga, = ax.plot(sp.arange(data2_gtex.shape[0]), data2_gtex, '--b', label='GTEx (anno)')
        # all confirmed events, further filtered by PSI - ICGC_T
        data1_icgc_t = [is_anno_icgc_t.shape[0], idx_conf_icgc_t.shape[0]]
        data1_icgc_t.extend([sp.intersect1d(idx_conf_icgc_t, idx_psi_icgc_t[_]).shape[0] for _ in sorted(idx_psi_icgc_t.keys())])
        data1_icgc_t = sp.array(data1_icgc_t)
        lit, = ax.plot(sp.arange(data1_icgc_t.shape[0]), data1_icgc_t, '-r', label='ICGC Tumor')
        # all annotated confirmed events, further filtered by PSI - ICGC
        data2_icgc_t = [sp.sum(is_anno_icgc_t), sp.sum(is_anno_icgc_t[idx_conf_icgc_t])]
        data2_icgc_t.extend([sp.sum(is_anno_icgc_t[sp.intersect1d(idx_conf_icgc_t, idx_psi_icgc_t[_])]) for _ in sorted(idx_psi_icgc_t.keys())])
        data2_icgc_t = sp.array(data2_icgc_t)
        lita, = ax.plot(sp.arange(data2_icgc_t.shape[0]), data2_icgc_t, '--r', label='ICGC Tumor (anno)')
        # all confirmed events, further filtered by PSI - ICGC_T
        data1_icgc_n = [is_anno_icgc_n.shape[0], idx_conf_icgc_n.shape[0]]
        data1_icgc_n.extend([sp.intersect1d(idx_conf_icgc_n, idx_psi_icgc_n[_]).shape[0] for _ in sorted(idx_psi_icgc_n.keys())])
        data1_icgc_n = sp.array(data1_icgc_n)
        lin, = ax.plot(sp.arange(data1_icgc_n.shape[0]), data1_icgc_n, '-g', label='ICGC Normal')
        # all annotated confirmed events, further filtered by PSI - ICGC
        data2_icgc_n = [sp.sum(is_anno_icgc_n), sp.sum(is_anno_icgc_n[idx_conf_icgc_n])]
        data2_icgc_n.extend([sp.sum(is_anno_icgc_n[sp.intersect1d(idx_conf_icgc_n, idx_psi_icgc_n[_])]) for _ in sorted(idx_psi_icgc_n.keys())])
        data2_icgc_n = sp.array(data2_icgc_n)
        lina, = ax.plot(sp.arange(data2_icgc_n.shape[0]), data2_icgc_n, '--g', label='ICGC Normal (anno)')
        axs.set_ticks_outer(ax)
        axs.clean_axis(ax)
        if e == len(event_types) - 1:
            ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='upper right', frameon=False, fontsize=10)
        ax.set_xticks(list(range(len(xlabels_full))))
        if e < len(event_types) - 3:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(xlabels_full, rotation=90, fontsize=10)
        ax.set_title(event_dict[event_type])
        ax.xaxis.grid(True)


        ### plot stats for log10 counts (FULL)
        ax = figs['stats_full_log'].add_subplot(gss['stats_full_log'][e // 3, e % 3])
        lg, = ax.plot(sp.arange(data1_gtex.shape[0]), sp.log10(data1_gtex + 1), '-b', label='GTEx')
        lga, = ax.plot(sp.arange(data2_gtex.shape[0]), sp.log10(data2_gtex + 1), '--b', label='GTEx (anno)')
        lit, = ax.plot(sp.arange(data1_icgc_t.shape[0]), sp.log10(data1_icgc_t + 1), '-r', label='ICGC Tumor')
        lita, = ax.plot(sp.arange(data2_icgc_t.shape[0]), sp.log10(data2_icgc_t + 1), '--r', label='ICGC Tumor (anno)')
        lin, = ax.plot(sp.arange(data1_icgc_n.shape[0]), sp.log10(data1_icgc_n + 1), '-g', label='ICGC Normal')
        lina, = ax.plot(sp.arange(data2_icgc_n.shape[0]), sp.log10(data2_icgc_n + 1), '--g', label='ICGC Normal (anno)')
        axs.set_ticks_outer(ax)
        axs.clean_axis(ax)
        if e == len(event_types) - 1:
            ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='lower left', frameon=False, fontsize=10)
        ax.set_xticks(list(range(len(xlabels_full))))
        if e < len(event_types) - 3:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(xlabels_full, rotation=90, fontsize=10)
        ax.set_title(event_dict[event_type])
        ax.xaxis.grid(True)

        ### plot stats for normal counts (only conf)
        ax = figs['stats'].add_subplot(gss['stats'][e // 3, e % 3])
        lg, = ax.plot(sp.arange(data1_gtex.shape[0] - 1), data1_gtex[1:], '-b', label='GTEx')
        lga, = ax.plot(sp.arange(data2_gtex.shape[0] - 1), data2_gtex[1:], '--b', label='GTEx (anno)')
        lit, = ax.plot(sp.arange(data1_icgc_t.shape[0] - 1), data1_icgc_t[1:], '-r', label='ICGC Tumor')
        lita, = ax.plot(sp.arange(data2_icgc_t.shape[0] - 1), data2_icgc_t[1:], '--r', label='ICGC Tumor (anno)')
        lin, = ax.plot(sp.arange(data1_icgc_n.shape[0] - 1), data1_icgc_n[1:], '-g', label='ICGC Normal')
        lina, = ax.plot(sp.arange(data2_icgc_n.shape[0] - 1), data2_icgc_n[1:], '--g', label='ICGC Normal (anno)')
        axs.set_ticks_outer(ax)
        axs.clean_axis(ax)
        if e == len(event_types) - 1:
            ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='upper right', frameon=False, fontsize=10)
        ax.set_xticks(list(range(len(xlabels_part))))
        if e < len(event_types) - 3:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(xlabels_part, rotation=90, fontsize=10)
        ax.set_title(event_dict[event_type])
        ax.xaxis.grid(True)

        ### plot stats for log10 counts (only conf)
        ax = figs['stats_log'].add_subplot(gss['stats_log'][e // 3, e % 3])
        lg, = ax.plot(sp.arange(data1_gtex.shape[0] - 1), sp.log10(data1_gtex[1:] + 1), '-b', label='GTEx')
        lga, = ax.plot(sp.arange(data2_gtex.shape[0] - 1), sp.log10(data2_gtex[1:] + 1), '--b', label='GTEx (anno)')
        lit, = ax.plot(sp.arange(data1_icgc_t.shape[0] - 1), sp.log10(data1_icgc_t[1:] + 1), '-r', label='ICGC Tumor')
        lita, = ax.plot(sp.arange(data2_icgc_t.shape[0] - 1), sp.log10(data2_icgc_t[1:] + 1), '--r', label='ICGC Tumor (anno)')
        lin, = ax.plot(sp.arange(data1_icgc_n.shape[0] - 1), sp.log10(data1_icgc_n[1:] + 1), '-g', label='ICGC Normal')
        lina, = ax.plot(sp.arange(data2_icgc_n.shape[0] - 1), sp.log10(data2_icgc_n[1:] + 1), '--g', label='ICGC Normal (anno)')
        axs.set_ticks_outer(ax)
        axs.clean_axis(ax)
        if e == len(event_types) - 1:
            ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='lower left', frameon=False, fontsize=10)
        ax.set_xticks(list(range(len(xlabels_part))))
        if e < len(event_types) - 3:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(xlabels_part, rotation=90, fontsize=10)
        ax.set_title(event_dict[event_type])
        ax.xaxis.grid(True)


    for p in figs:
        figs[p].tight_layout()
        figs[p].savefig(os.path.join(PLOTDIR, 'event_overview_cumm_Liver_C%i_%s.pdf' % (CONF, p)), format='pdf', bbox_inches='tight')
        figs[p].savefig(os.path.join(PLOTDIR, 'event_overview_cumm_Liver_C%i_%s.png' % (CONF, p)), format='png', bbox_inches='tight')
        plt.close(figs[p])
Example #40
directory = "." #Directory of the simulation data
#list_fields=['Ex','Ey','Ez','Bz_m','Rho_electron1','Rho_proton','Jx','Jy','Jz'] #List of the fields you want to extract (Ei,Bi,Ji_sn,Rho,rho_sn, where n is the number of the species and i the direction (x,y,z))
list_fields=['Ey','Ex','Rho_electron'] 
first_cycle = 0
last_cycle = 11000
cycle_step = 5000 # step between two displayed cycles (minimum is given by outputcycle of the inputfile)
plot_on_axis = 0 # Also plot 1D graph of the quantities on axis if ==1.
suffix = "" #Suffix to be added in produced files name.
####################################################################################

filename = directory + "/Fields_folded.h5" # Proc file name
h5file = tables.openFile(filename, mode = "r", title = "Fields_file")

existing_files = scipy.arange(0,last_cycle,cycle_step)
asked_files = scipy.arange(first_cycle, last_cycle)
list_files=scipy.intersect1d(existing_files,asked_files)
nb_files=len(list_files)

Filebyproc=nb_files/p #Minimum number of files each process has to read
reste=nb_files-Filebyproc*p
if (rank<reste):
    Filerange=scipy.arange(rank*(Filebyproc+1),(rank+1)*(Filebyproc+1))
else:
    Filerange=scipy.arange((reste*(Filebyproc+1)+(rank-reste)*Filebyproc),(reste*(Filebyproc+1)+(rank-reste+1)*Filebyproc))
#print nb_files, 'filerange =', Filerange, rank
list_cycle = list(list_files[Filerange])
print "cycle", list_cycle

print h5file.root._f_listNodes
if (rank==0):
    print h5file.root._f_getChild(repr(list_cycle[0]).rjust(10,"0"))._f_listNodes
Example #41
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01):
    #   recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating things w genotype file: %s \nref. genot. file: %s' % (
        genotype_file, reference_genotype_file)
    plinkf = plinkfile.PlinkFile(genotype_file)

    #Loads only the individuals... (I think?)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype = True

    #Figure out chromosomes and positions.
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()

    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()

    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
    #     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)

    #Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(
    ), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)

    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    #corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    #Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print 'Working on chromosome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (
            len(g_sids), len(rg_sids), len(ss_sids))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets' % (
            len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])

        #order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        #order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        #Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])

        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        #         rg_nts_l = []
        #         for nt in rg_nts_ok:
        #             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
        #         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print 'Nucleotide concordance counts out of %d genotypes: g-rg: %d, g-ss: %d, rg-ss: %d' % (
            len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count,
            rg_ss_nt_concord_count)
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        #Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):

            #To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[
                ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
            #             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            #Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and
                    (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0]
                                == ss_nt[1]) or (os_g_nt[1] == ss_nt[0]
                                                 and os_g_nt[0] == ss_nt[1])
                    #Try flipping the SS nt
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                    # Opposite strand nucleotides

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)
#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)

#print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts
        print '%d SNPs were retained on chromosome %d.' % (len(
            ok_indices['g']), chrom)

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        #         order = sp.argsort(positions)
        #         sorted_positions = positions[order]
        #         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
        #         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        #         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        #Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[
            ok_indices['g']]  #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[
            ok_indices['rg']]  #Pinpoint where the SNPs are in the file.
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file,
                                                     snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  #[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #For debugging...
        #         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
        #         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
        #         ss_sids = ssg['sids'][...][ok_indices['ss']]
        #         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp)
                #                 print freqs[freq_discrepancy_snp]
                #                 print ss_freqs[freq_discrepancy_snp]

                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)  # boolean complement (sp.negative is arithmetic negation)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                #For debugging...
#         if sp.any(freq_discrepancy_snp):
#             g_sids = g_sids[ok_freq_snps]
#             rg_sids = rg_sids[ok_freq_snps]
#             ss_sids = ss_sids[ok_freq_snps]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

#Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]


#         if sp.sum(maf_filter)<n_snps:
#             g_sids = g_sids[maf_filter]
#             rg_sids = rg_sids[maf_filter]
#             ss_sids = ss_sids[maf_filter]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (
                chrom, maf_adj_corr)

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])

        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref',
                           data=raw_ref_snps,
                           compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
        #         print 'Sum betas', sp.sum(betas ** 2)
        #ofg.create_dataset('prs', data=prs)

        #risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
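The heart of this coordination routine is a recurring numpy idiom: intersect the SNP-ID arrays across datasets, then translate the shared IDs back into per-dataset row indices and reorder everything by genomic position. A minimal, self-contained sketch with made-up IDs and positions:

import numpy as np

g_sids = np.array(['rs1', 'rs3', 'rs2', 'rs5'])       # genotype-file SNP IDs
ss_sids = np.array(['rs2', 'rs3', 'rs4', 'rs1'])      # summary-statistics SNP IDs
g_positions = np.array([400, 300, 200, 500])          # positions aligned with g_sids

common = np.intersect1d(g_sids, ss_sids)              # shared IDs, sorted by ID
g_index = {sid: i for i, sid in enumerate(g_sids)}    # sid -> row in genotype data
ss_index = {sid: i for i, sid in enumerate(ss_sids)}  # sid -> row in summary stats

g_map = np.array([g_index[sid] for sid in common])
order = np.argsort(g_positions[g_map])                # reorder by genomic position
common = common[order]                                # -> ['rs2', 'rs3', 'rs1']
g_map = g_map[order].tolist()                         # -> [2, 1, 0]
ss_map = [ss_index[sid] for sid in common]            # -> [0, 1, 3]
print(common, g_map, ss_map)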
Example #42
0
File: core.py Project: senaj/scLVM
    def varianceDecomposition(self,
                              K=None,
                              tech_noise=None,
                              idx=None,
                              i0=None,
                              i1=None,
                              max_iter=10,
                              verbose=False,
                              cache=True):
        """
		Args:
			K:				list of random effects to be considered in the analysis
			idx:			indices of the genes to be considered in the analysis
			i0:				gene index from which the analysis starts
			i1:				gene index to which the analysis stops
			max_iter:		maximum number of random restarts
			verbose:		if True, print progress
		"""

        if tech_noise != None: self.set_tech_noise(tech_noise)
        assert self.tech_noise != None, 'scLVM:: specify technical noise'
        assert K != None, 'scLVM:: specify K'

        if type(K) != list: K = [K]
        for k in K:
            assert k.shape[0] == self.N, 'scLVM:: K dimension mismatch'
            assert k.shape[1] == self.N, 'scLVM:: K dimension mismatch'

        if idx == None:
            if i0 == None or i1 == None:
                i0 = 0
                i1 = self.G
            idx = SP.arange(i0, i1)
        elif type(idx) != SP.ndarray:
            idx = SP.array(idx)
        idx = SP.intersect1d(
            SP.array(idx),
            SP.where(self.Y.std(0) > 0)
            [0])  #only makes sense if gene is expressed in at least one cell
        _G = len(idx)
        var = SP.zeros((_G, len(K) + 2))
        _idx = SP.zeros(_G)
        geneID = SP.zeros(_G, dtype=str)
        conv = SP.zeros(_G) == 1
        Ystar = [SP.zeros((self.N, _G)) for i in range(len(K))]
        count = 0
        Yidx = self.Y[:, idx]
        Ystd = Yidx - Yidx.mean(0)
        Ystd /= Yidx.std(0)  #delta optimization might be more efficient
        tech_noise = self.tech_noise[idx] / SP.array(Yidx.std(0))**2

        for ids in range(_G):
            if verbose:
                print '.. fitting gene %d' % ids
            # extract a single gene
            y = Ystd[:, ids:ids + 1]
            # build and fit variance decomposition model
            vc = VAR.VarianceDecomposition(y)
            vc.addFixedEffect()
            for k in K:
                vc.addRandomEffect(k)
            vc.addRandomEffect(SP.eye(self.N))
            vc.addRandomEffect(SP.eye(self.N))
            vc.vd.getTerm(len(K) + 1).getKcf().setParamMask(SP.zeros(1))
            for iter_i in range(max_iter):
                scales0 = y.std() * SP.randn(len(K) + 2)
                scales0[len(K) + 1] = SP.sqrt(tech_noise[ids])
                _conv = vc.optimize(scales0=scales0)
                if _conv: break

            conv[count] = _conv
            if not _conv:
                var[count, -2] = SP.maximum(0, y.var() - tech_noise[ids])
                var[count, -1] = tech_noise[ids]
                count += 1
                continue
            _var = vc.getVarianceComps()[0, :]
            KiY = vc.gp.agetKEffInvYCache().ravel()
            for ki in range(len(K)):
                Ystar[ki][:, count] = _var[ki] * SP.dot(K[ki], KiY)
            var[count, :] = _var
            count += 1
        if self.geneID != None: geneID = SP.array(self.geneID)[idx]
        col_header = ['hidden_%d' % i for i in range(len(K))]
        col_header.append('biol_noise')
        col_header.append('tech_noise')
        col_header = SP.array(col_header)

        #annotate column and rows of var and Ystar
        var_info = {'gene_idx': idx, 'col_header': col_header, 'conv': conv}
        if geneID != None: var_info['geneID'] = SP.array(geneID)
        Ystar_info = {'gene_idx': idx, 'conv': conv}
        if geneID != None: Ystar_info['geneID'] = SP.array(geneID)

        # cache stuff
        if cache == True:
            self.var = var
            self.Ystar = Ystar
            self.var_info = var_info
            self.Ystar_info = Ystar_info
        else:
            return var, var_info
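The SP.intersect1d(idx, SP.where(self.Y.std(0) > 0)[0]) line is doing the real gatekeeping here: genes with zero variance across cells are dropped before any model is fit. A toy sketch of that filter (Y is a stand-in for self.Y):

import numpy as np

Y = np.array([[0., 1., 0., 2.],
              [0., 3., 0., 1.],
              [0., 2., 0., 2.]])                 # genes 0 and 2 are silent
idx = np.arange(0, 4)                            # genes requested for analysis
expressed = np.where(Y.std(0) > 0)[0]            # genes with nonzero variance
idx = np.intersect1d(idx, expressed)             # -> [1, 3]
print(idx)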
Example #43
0
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict,
                        validation_genotype_file=None,
                        genetic_map_dir=None,
                        min_maf=0.01,
                        skip_coordination=False, 
                        max_freq_discrep = 0.15,
                        debug=False):
    
    summary_dict[3.9]={'name':'dash', 'value':'Coordination'}
    t0 = time.time()
    if validation_genotype_file is not None:
        print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).')
    else:
        print('Coordinating datasets (Summary statistics and LD reference genotypes).')
        
    plinkf = plinkfile.PlinkFile(reference_genotype_file)

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing plinkf_dict_val reference genotypes')
    loci = plinkf.get_loci()
    plinkf.close()
    summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)}
    summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)}
    gf_chromosomes = [l.chromosome for l in loci]
    
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)
    
    if validation_genotype_file is not None:
        if debug:
            print('Parsing LD validation bim file')
        plinkf_val = plinkfile.PlinkFile(validation_genotype_file)

        # Loads only the individuals... 
        plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val)
        
        loci_val = plinkf_val.get_loci()
        plinkf_val.close()
        summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)}

        chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes)

        # Open HDF5 file and prepare out data
        assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.'
        if plinkf_dict_val['has_phenotype']:
            hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes'])
            summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']}
   
        hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype))
        hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype))

        maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs'])

    
    # Now summary statistics
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    num_common_snps = 0
    # corr_list = []



    chromosomes_found = set()
    num_snps_common_before_filtering =0
    num_snps_common_after_filtering =0
    tot_num_non_matching_nts = 0
    tot_num_non_supported_nts = 0
    tot_num_ambig_nts = 0
    tot_num_freq_discrep_filtered_snps = 0
    tot_num_maf_filtered_snps = 0
    tot_g_ss_nt_concord_count = 0
    if validation_genotype_file is not None:
        tot_g_vg_nt_concord_count = 0
        tot_vg_ss_nt_concord_count = 0
        
    # Now iterate over chromosomes
    chrom_i = 0
    for chrom in chromosomes:
        chrom_i +=1
        if not debug:
            sys.stdout.write('\r%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1))))
            sys.stdout.flush()            
        try:
            chr_str = 'chrom_%d' % chrom
            ssg = ssf[chr_str]
                    
        except Exception as err_str:
            print(err_str)
            print('Did not find chromosome %d in SS dataset.'%chrom)
            print('Continuing.')
            continue
        
        if debug:
            print('Coordinating data for chromosome %s' % chr_str)

        chromosomes_found.add(chrom)
        
        #Get summary statistics chromosome group
        ssg = ssf['chrom_%d' % chrom]
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if validation_genotype_file is not None:
            chrom_d_val = chr_dict_val[chr_str]
            vg_sids = chrom_d_val['sids']
            common_sids = sp.intersect1d(ss_sids, vg_sids)
            
            # A map from sid to index for validation data        
            vg_sid_dict = {}
            for i, sid in enumerate(vg_sids):
                vg_sid_dict[sid] = i
        else:
            common_sids = ss_sids

        # A map from sid to index for summary stats        
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        #The indices to retain for the LD reference genotypes
        chrom_d = chr_dict[chr_str]
        g_sids = chrom_d['sids']
        common_sids = sp.intersect1d(common_sids, g_sids)
        
        # A map from sid to index for LD reference data        
        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        if debug:
            print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))
            print('Ordering SNPs by genomic positions (based on LD reference genotypes).')
        
        g_snp_map = []
        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])
        # order by positions (based on LD reference file)
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)

        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]


        # Get the ordered sum stats SNPs indices.
        ss_snp_map = []
        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])


        # Get the ordered validation SNPs indices
        if validation_genotype_file is not None:
            vg_snp_map = []
            for sid in common_sids:
                vg_snp_map.append(vg_sid_dict[sid])
            vg_nts = sp.array(chrom_d_val['nts'])
            vg_nts_ok = sp.array(vg_nts)[vg_snp_map]


        g_nts = sp.array(chrom_d['nts'])
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        if validation_genotype_file is not None:
            vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0
            g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0
            if debug:
                print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count))
            tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count
            tot_g_vg_nt_concord_count += g_vg_nt_concord_count
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        if debug:
            print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count))

        num_freq_discrep_filtered_snps = 0
        num_non_matching_nts = 0
        num_non_supported_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        ok_indices = {'g': [], 'ss': []}
        if validation_genotype_file is not None:
            ok_indices['vg']=[]

        #Now loop over SNPs to coordinate nucleotides.        
        if validation_genotype_file is not None:
            for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map):
    
                # To make sure, is the SNP id the same?
                assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
    
                g_nt = g_nts[g_i]
                if not skip_coordination:
    
                    vg_nt = vg_nts[vg_i]
                    ss_nt = ss_nts[ss_i]
    
                    # Is the nucleotide ambiguous.
                    g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                    if tuple(g_nt) in util.ambig_nts:
                        num_ambig_nts += 1
                        continue
    
                    # First check if nucleotide is sane?
                    if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                        num_non_supported_nts += 1
                        continue
    
                    os_g_nt = sp.array(
                        [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
    
                    flip_nts = False
                    
                    #Coordination is a bit more complicated when validation genotypes are provided.
                    if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))):
                        if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt):
                            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                            # Try flipping the SS nt
                            if flip_nts:
                                betas[ss_i] = -betas[ss_i]
                                log_odds[ss_i] = -log_odds[ss_i]
                                if 'freqs' in ssg:
                                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                            else:
                                if debug:
                                    print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                          (g_sids[g_i], ss_sids[ss_i], g_i,
                                           ss_i, str(g_nt), str(ss_nt)))
                                num_non_matching_nts += 1
                                continue
    
                        else:
                            num_non_matching_nts += 1
                            continue
                            # Opposite strand nucleotides
    
                # everything seems ok.
                ok_indices['g'].append(g_i)
                ok_indices['vg'].append(vg_i)
                ok_indices['ss'].append(ss_i)
    
                ok_nts.append(g_nt)
        else:
            for g_i, ss_i in zip(g_snp_map, ss_snp_map):
    
                # To make sure, is the SNP id the same?
                assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
    
                g_nt = g_nts[g_i]
                if not skip_coordination:
    
                    ss_nt = ss_nts[ss_i]
    
                    # Is the nucleotide ambiguous.
                    g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                    if tuple(g_nt) in util.ambig_nts:
                        num_ambig_nts += 1
                        continue
    
                    # First check if nucleotide is sane?
                    if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                        num_non_matching_nts += 1
                        continue
    
                    os_g_nt = sp.array(
                        [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
    
                    flip_nts = False
                    
                    #Without validation genotypes, only the summary-statistic nucleotides need coordinating.
                    if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                        flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                            os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                        
                        # Try flipping the SS nt
                        if flip_nts:
                            betas[ss_i] = -betas[ss_i]
                            log_odds[ss_i] = -log_odds[ss_i]
                            if 'freqs' in ssg and ss_freqs[ss_i]>0:
                                ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i]
                        else:
                            if debug:
                                print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                      (g_sids[g_i], ss_sids[ss_i], g_i,
                                       ss_i, str(g_nt), str(ss_nt)))
                            num_non_matching_nts += 1
                            continue
                   
                # everything seems ok.
                ok_indices['g'].append(g_i)
                ok_indices['ss'].append(ss_i)
                ok_nts.append(g_nt)
                
        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)

        
        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices)
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  
        log_odds = log_odds[ok_indices['ss']]  

        ns = ssg['ns'][...][ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        #Parse validation genotypes, if available
        if validation_genotype_file is not None:
            snp_indices_val = sp.array(chrom_d_val['snp_indices'])
            # Pinpoint where the SNPs are in the file.
            snp_indices_val = snp_indices_val[ok_indices['vg']]
            raw_snps_val, freqs_val = plinkfiles.parse_plink_snps(
                validation_genotype_file, snp_indices_val)
    
            snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val))
            snp_means_val = freqs_val * 2

        # Check SNP frequencies, screen for possible problems..
        if max_freq_discrep<1 and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,sp.absolute(ss_freqs + freqs-1) < max_freq_discrep) #Array of np.bool values
            ok_freq_snps = sp.logical_or(ok_freq_snps,ss_freqs<=0) #Only consider SNPs that actually have frequencies
            num_freq_discrep_filtered_snps = len(ok_freq_snps)- sp.sum(ok_freq_snps)
            assert num_freq_discrep_filtered_snps>=0, "Problems when filtering SNPs with frequency discrepancies"
            if num_freq_discrep_filtered_snps>0:
                # Filter freq_discrepancy_snps
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                ns = ns[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                if validation_genotype_file is not None:
                    raw_snps_val = raw_snps_val[ok_freq_snps]
                    snp_stds_val = snp_stds_val[ok_freq_snps]
                    snp_means_val = snp_means_val[ok_freq_snps]
                    freqs_val = freqs_val[ok_freq_snps]
            if debug:
                print('Filtered %d SNPs due to frequency discrepancies'%num_freq_discrep_filtered_snps)

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        num_maf_filtered_snps = len(maf_filter)-sp.sum(maf_filter)
        assert num_maf_filtered_snps>=0, "Problems when filtering SNPs with low minor allele frequencies"
        if num_maf_filtered_snps>0:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            ns = ns[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            if validation_genotype_file is not None:
                raw_snps_val = raw_snps_val[maf_filter]
                snp_stds_val = snp_stds_val[maf_filter]
                snp_means_val = snp_means_val[maf_filter]
                freqs_val = freqs_val[maf_filter]
            if debug:
                print('Filtered %d SNPs due to low MAF'%num_maf_filtered_snps)

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
#                     if l[0] in sid_set:
#                         genetic_map.append(l[0])
        else:
            genetic_map = None

        coord_data_dict = {'chrom': 'chrom_%d' % chrom, 
                           'raw_snps_ref': raw_snps, 
                           'snp_stds_ref': snp_stds, 
                           'snp_means_ref': snp_means, 
                           'freqs_ref': freqs,
                           'ps': ps,
                           'ns': ns,
                           'positions': positions,
                           'nts': nts,
                           'sids': sids,
                           'genetic_map': genetic_map,
                           'betas': betas,
                           'log_odds': log_odds}
        if validation_genotype_file is not None:
            maf_adj_prs = sp.dot(log_odds, raw_snps_val)
            if debug and plinkf_dict_val['has_phenotype']:
                maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1]
                print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))
            coord_data_dict['raw_snps_val']=raw_snps_val
            coord_data_dict['snp_stds_val']=snp_stds_val
            coord_data_dict['snp_means_val']=snp_means_val
            coord_data_dict['freqs_val']=freqs_val
            coord_data_dict['log_odds_prs']=maf_adj_prs
            maf_adj_risk_scores += maf_adj_prs
         
         
        write_coord_data(cord_data_g, coord_data_dict, debug=debug)
        if debug:
            print('%d SNPs were retained on chromosome %d.' % (len(sids), chrom))
        
        
        num_snps_common_before_filtering += len(common_sids)
        num_snps_common_after_filtering += len(sids)
        tot_num_ambig_nts += num_ambig_nts
        tot_num_non_supported_nts += num_non_supported_nts
        tot_num_non_matching_nts += num_non_matching_nts
        tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps
        tot_num_maf_filtered_snps += num_maf_filtered_snps

    if not debug:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()                        


    # Now calculate the prediction r^2
    if validation_genotype_file:
        if debug and plinkf_dict_val['has_phenotype']:
            maf_adj_corr = sp.corrcoef(
                plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1]
            print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2))
            print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count))
    else:
        if debug:
            print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count))        
    
    summary_dict[7]={'name':'Num chromosomes used:','value':len(chromosomes_found)}
    summary_dict[8]={'name':'SNPs common across datasets:','value':num_snps_common_before_filtering}
    summary_dict[9]={'name':'SNPs retained after filtering:','value':num_snps_common_after_filtering}
    if tot_num_ambig_nts>0:
        summary_dict[10]={'name':'SNPs w ambiguous nucleotides filtered:','value':tot_num_ambig_nts}
    if tot_num_non_supported_nts>0:
        summary_dict[10.1]={'name':'SNPs w unknown/unsupported nucleotides filtered:','value':tot_num_non_supported_nts}
    if tot_num_non_matching_nts>0:
        summary_dict[11]={'name':'SNPs w other nucleotide discrepancies filtered:','value':tot_num_non_matching_nts}
    if min_maf>0:
        summary_dict[12]={'name':'SNPs w MAF<%0.3f filtered:'%min_maf,'value':tot_num_maf_filtered_snps}
    if max_freq_discrep<0.5:
        summary_dict[13]={'name':'SNPs w allele freq discrepancy > %0.3f filtered:'%max_freq_discrep,'value':tot_num_freq_discrep_filtered_snps}

    t1 = time.time()
    t = (t1 - t0)
    summary_dict[13.9]={'name':'dash', 'value':'Running times'}
    summary_dict[15]={'name':'Run time for coordinating datasets:','value': '%d min and %0.2f sec'%(t / 60, t % 60)}
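The frequency screen in this function is compact but easy to misread: a SNP survives if its summary-statistic frequency agrees with the reference frequency on either strand orientation, or if no frequency was reported at all. A small sketch with invented values:

import numpy as np

max_freq_discrep = 0.15
freqs = np.array([0.30, 0.30, 0.30, 0.30])       # reference allele frequencies
ss_freqs = np.array([0.32, 0.71, 0.90, -1.0])    # summary-stat frequencies (-1 = missing)

ok = np.logical_or(np.abs(ss_freqs - freqs) < max_freq_discrep,      # same strand
                   np.abs(ss_freqs + freqs - 1) < max_freq_discrep)  # flipped strand
ok = np.logical_or(ok, ss_freqs <= 0)            # keep SNPs without reported freqs
print(ok)                                        # [ True  True False  True]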
Example #44
0
File: core.py Project: senaj/scLVM
    def fitLMM(self,
               expr=None,
               K=None,
               tech_noise=None,
               idx=None,
               i0=None,
               i1=None,
               verbose=False,
               recalc=True,
               standardize=True):
        """
		Args:
			K:				list of random effects to be considered in the analysis
							if K is none, it does not consider any random effect
			expr:			correlations are calculated between the gene expression data (self.Y) and these measures provided in expr. If None, self.Y is used
			idx:			indices of the genes to be considered in the analysis
			i0:				gene index from which the analysis starts
			i1:				gene index to which the analysis stops
			verbose:		if True, print progress
			recalc:			if True, re-do variance decomposition
			standardize:		if True, standardize also expression 
		Returns:
			pv:				matrix of pvalues
			beta:			matrix of correlations
			info:			dictionary annotates pv and beta rows and columns, containing
							gene_idx_row:	index of the genes in rows
							conv:		boolean vector marking genes for which variance decomposition has converged
							gene_row:   annotate rows of matrices
		"""

        if idx == None:
            if i0 == None or i1 == None:
                i0 = 0
                i1 = self.G
            idx = SP.arange(i0, i1)
        elif type(idx) != SP.ndarray:
            idx = SP.array(idx)
        idx = SP.intersect1d(
            idx,
            SP.where(self.Y.std(0) > 0)
            [0])  #only makes sense if gene is expressed in at least one cell

        if K != None:
            if type(K) != list: K = [K]
            if (recalc == True and len(K) > 1) or (recalc == True
                                                   and self.var == None):
                print 'performing variance decomposition first...'
                var_raw, var_info = self.varianceDecomposition(K=K,
                                                               idx=idx,
                                                               cache=False)
                var = var_raw / var_raw.sum(1)[:, SP.newaxis]
            elif recalc == False and len(K) > 1:
                assert self.var != None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition must be run prior to this method'
                warnings.warn(
                    'scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decomposition was performed earlier.'
                )
                var_raw = self.var
                var_info = self.var_info
                var = var_raw / var_raw.sum(1)[:, SP.newaxis]

        lmm_params = {
            'covs': SP.ones([self.N, 1]),
            'NumIntervalsDeltaAlt': 100,
            'NumIntervalsDelta0': 100,
            'searchDelta': True
        }

        Yidx = self.Y[:, idx]
        Ystd = Yidx - Yidx.mean(0)
        Ystd /= Yidx.std(0)  #delta optimization might be more efficient

        if expr == None:
            expr = Ystd
        elif standardize == True:
            exprStd = expr
            exprStd = expr - expr.mean(0)
            exprStd /= expr.std(0)
            expr = exprStd

        _G1 = idx.shape[0]
        _G2 = expr.shape[1]

        geneID = SP.zeros(_G1, dtype=str)

        beta = SP.zeros((_G1, _G2))
        pv = SP.zeros((_G1, _G2))
        count = 0

        for ids in range(_G1):
            if verbose:
                print '.. fitting gene %d' % ids
            # extract a single gene
            if K != None:
                if len(K) > 1:
                    if var_info['conv'][count] == True:
                        _K = SP.sum(
                            [var[count, i] * K[i] for i in range(len(K))], 0)
                        _K /= _K.diagonal().mean()
                    else:
                        _K = None
                else:
                    _K = K[0]
            else:
                _K = None
            lm = QTL.test_lmm(expr,
                              Ystd[:, ids:ids + 1],
                              K=_K,
                              verbose=False,
                              **lmm_params)
            pv[count, :] = lm.getPv()[0, :]
            beta[count, :] = lm.getBetaSNP()[0, :]
            count += 1

        if self.geneID != None: geneID = SP.array(self.geneID)[idx]
        if recalc == True and K != None and len(K) > 1:
            info = {'conv': var_info['conv'], 'gene_idx_row': idx}
        else:
            info = {'gene_idx_row': idx}
        if geneID != None: info['gene_row'] = geneID

        return pv, beta, info
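When several random effects are supplied, fitLMM collapses them into a single covariance per gene by weighting each kernel with its fitted variance share and normalizing the mean diagonal to one. A short illustrative sketch (the kernels and weights are made up):

import numpy as np

N = 4
K1 = np.eye(N)                                   # e.g. a cell-cycle covariance (toy)
K2 = np.ones((N, N)) + np.eye(N)                 # e.g. a second covariance (toy)
var_shares = np.array([0.3, 0.5])                # fitted variance fractions per kernel

_K = sum(var_shares[i] * Ki for i, Ki in enumerate([K1, K2]))
_K /= _K.diagonal().mean()                       # normalize average diagonal to 1
print(_K.diagonal())                             # all ones after rescaling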
    QueryGOIDs          = []
    NQueryGeneEntrezIDs = 0
    for i in xrange(len(QueryGeneEntrezIDs)):
        if (QueryGeneEntrezIDs[i] == 'None'):  # skip genes without an Entrez ID
            continue
        NQueryGeneEntrezIDs += 1
        Ind                  = scipy.where(BGSData[BGSHeader.index('GeneEntrezID')]==QueryGeneEntrezIDs[i])[0]
        QueryGOIDs.extend(BGSData[BGSHeader.index('GOId'),Ind].tolist())
    QueryGOIDs = scipy.unique(scipy.array(QueryGOIDs))

    for g in xrange(len(QueryGOIDs)):
        Indices                 = scipy.where(BGSData[BGSHeader.index('GOId')]==QueryGOIDs[g])[0]
        EntrezGenes             = BGSData[BGSHeader.index('GeneEntrezID'),Indices]
        NGenesWithThisGOID      = len(Indices)
        QueryGenesWithThisGOID  = scipy.intersect1d(EntrezGenes,QueryGeneEntrezIDs)
        NQueryGenesWithThisGOID = len(QueryGenesWithThisGOID)
        QueryGSymbols           = []
        for EID in QueryGenesWithThisGOID:
            QueryGSymbols.append(QueryGeneSymbols[QueryGeneEntrezIDs.index(EID)])
        EnrichmentFactor = (float(NQueryGenesWithThisGOID)/float(len(QueryGeneEntrezIDs)))/(float(NGenesWithThisGOID)/float(NGenesWithGOEntry))
        GODescr          = BGSData[BGSHeader.index('GODescr'),Indices[0]]
        PValue           = scipy.stats.hypergeom.sf(NQueryGenesWithThisGOID,NGenesWithGOEntry,NQueryGeneEntrezIDs,NGenesWithThisGOID)
        print g,\
              QueryGOIDs[g],\
              GODescr,\
              NGenesWithThisGOID,\
              NGenesWithGOEntry,\
              NQueryGenesWithThisGOID,\
              len(QueryGeneEntrezIDs),\
              ','.join(QueryGenesWithThisGOID)
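The enrichment test in this fragment is a standard hypergeometric tail probability. The sketch below reproduces it with invented counts; note that scipy's sf(k) is P(X > k), so P(X >= k) requires sf(k - 1), whereas the fragment above passes k directly and thus excludes the observed count from the tail.

from scipy.stats import hypergeom

M = 10000   # genes with any GO annotation (background)
n = 200     # background genes carrying this GO term
N = 150     # query genes with an Entrez ID
k = 12      # query genes carrying this GO term

p_value = hypergeom.sf(k - 1, M, n, N)           # P(X >= k)
enrichment = (k / float(N)) / (n / float(M))     # same ratio as EnrichmentFactor above
print(p_value, enrichment)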
        if ((not os.path.exists(count_file_GRCH37)) or (not os.path.exists(count_file_SNP_maternal)) or (not os.path.exists(count_file_SNP_paternal)) or (not os.path.exists(count_file_SV_maternal)) or (not os.path.exists(count_file_SV_paternal))):
            print "skip: %s" % element_id
            RV_file_exist.append([element_id,os.path.exists(count_file_GRCH37),os.path.exists(count_file_SNP_maternal),os.path.exists(count_file_SNP_paternal),os.path.exists(count_file_SV_maternal),os.path.exists(count_file_SV_paternal)])
            RV_file.append([element_id,count_file_GRCH37,count_file_SNP_maternal,count_file_SNP_paternal,count_file_SV_maternal,count_file_SV_paternal])
            continue
        #1. load lists
        count_GRCH37 = cPickle.load(open(count_file_GRCH37,'rb'))
        count_SNP_maternal = cPickle.load(open(count_file_SNP_maternal,'rb'))
        count_SNP_paternal = cPickle.load(open(count_file_SNP_paternal,'rb'))
        count_SV_maternal = cPickle.load(open(count_file_SV_maternal,'rb'))
        count_SV_paternal = cPickle.load(open(count_file_SV_paternal,'rb'))
        
        count_SNP = SP.union1d(count_SNP_maternal,count_SNP_paternal)
        count_SV = SP.union1d(count_SV_maternal,count_SV_paternal)
        count_intersect_GRCH37_SNP  = SP.intersect1d(count_SNP,count_GRCH37)
        count_intersect_GRCH37_SV  = SP.intersect1d(count_SV,count_GRCH37)
        count_intersect_SNP_SV  = SP.intersect1d(count_SNP,count_SV)

        count_ex_GRCH37_SNP = SP.setdiff1d(count_GRCH37,count_SNP)
        count_ex_GRCH37_SV = SP.setdiff1d(count_GRCH37,count_SV)
        count_ex_SNP_GRCH37 = SP.setdiff1d(count_SNP,count_GRCH37)
        count_ex_SV_GRCH37 = SP.setdiff1d(count_SV,count_GRCH37)
        count_ex_SNP_SV = SP.setdiff1d(count_SNP,count_SV)
        count_ex_SV_SNP = SP.setdiff1d(count_SV,count_SNP)
    
        #store a couple of things
        rv = {'element_id': element_id,
              'count_ref': len(count_GRCH37),
              'count_SNP_maternal': len(count_SNP_maternal),
              'count_SNP_paternal': len(count_SNP_paternal),
              'count_SV_maternal': len(count_SV_maternal),
              'count_SV_paternal': len(count_SV_paternal),
              'count_SNP': len(count_SNP),
              'count_SV': len(count_SV),
              'count_intersect_GRCH37_SNP': len(count_intersect_GRCH37_SNP),
              'count_intersect_GRCH37_SV': len(count_intersect_GRCH37_SV),
              'count_intersect_SNP_SV': len(count_intersect_SNP_SV),
              'count_ex_GRCH37_SNP': len(count_ex_GRCH37_SNP),
              'count_ex_GRCH37_SV': len(count_ex_GRCH37_SV),
              'count_ex_SNP_GRCH37': len(count_ex_SNP_GRCH37),
              'count_ex_SV_GRCH37': len(count_ex_SV_GRCH37),
              'count_ex_SNP_SV': len(count_ex_SNP_SV),
              'count_ex_SV_SNP': len(count_ex_SV_SNP)}
        RV.append(rv)
        pass
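The bookkeeping above reduces to numpy's three set routines: union1d to merge the maternal and paternal read sets, intersect1d for reads shared between references, and setdiff1d for reads exclusive to one. A compact sketch on toy read IDs:

import numpy as np

count_ref = np.array([1, 2, 3, 4, 5])            # read IDs mapped to the reference
count_snp = np.union1d(np.array([2, 3]),         # maternal SNP haplotype
                       np.array([3, 6]))         # paternal SNP haplotype
shared = np.intersect1d(count_ref, count_snp)    # reads found in both -> [2, 3]
ref_only = np.setdiff1d(count_ref, count_snp)    # -> [1, 4, 5]
snp_only = np.setdiff1d(count_snp, count_ref)    # -> [6]
print(shared, ref_only, snp_only)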
Example #47
0
File: verify.py Project: xtmgah/spladder
def verify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    # [verified, info] = verify_exon_skip(event, fn_bam, cfg)

    # (0) valid, (1) exon_diff_cov, (2) exon_const_cov
    # (3) intron1_conf, (4) intron2_conf
    info = [1, 0, 0, 0, 0]
    verified = [0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)

    ### check validity of intron coordinates (only one side is differing)
    if (event.exons1[0, 1] != event.exons2[0, 1]) and (event.exons1[1, 0] !=
                                                       event.exons2[1, 0]):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon11 = sp.where((sg.vertices[0, :] == event.exons1[0, 0])
                          & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    if idx_exon11.shape[0] == 0:
        segs_exon11 = sp.where((segs.segments[0, :] >= event.exons1[0, 0]) &
                               (segs.segments[1, :] <= event.exons1[0, 1]))[0]
    else:
        segs_exon11 = sp.where(segs.seg_match[idx_exon11, :])[1]
    idx_exon12 = sp.where((sg.vertices[0, :] == event.exons1[1, 0])
                          & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    if idx_exon12.shape[0] == 0:
        segs_exon12 = sp.where((segs.segments[0, :] >= event.exons1[1, 0]) &
                               (segs.segments[1, :] <= event.exons1[1, 1]))[0]
    else:
        segs_exon12 = sp.where(segs.seg_match[idx_exon12, :])[1]
    idx_exon21 = sp.where((sg.vertices[0, :] == event.exons2[0, 0])
                          & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    if idx_exon21.shape[0] == 0:
        segs_exon21 = sp.where((segs.segments[0, :] >= event.exons2[0, 0]) &
                               (segs.segments[1, :] <= event.exons2[0, 1]))[0]
    else:
        segs_exon21 = sp.where(segs.seg_match[idx_exon21, :])[1]
    idx_exon22 = sp.where((sg.vertices[0, :] == event.exons2[1, 0])
                          & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    if idx_exon22.shape[0] == 0:
        segs_exon22 = sp.where((segs.segments[0, :] >= event.exons2[1, 0]) &
                               (segs.segments[1, :] <= event.exons2[1, 1]))[0]
    else:
        segs_exon22 = sp.where(segs.seg_match[idx_exon22, :] > 0)[1]

    assert (segs_exon11.shape[0] > 0)
    assert (segs_exon12.shape[0] > 0)
    assert (segs_exon21.shape[0] > 0)
    assert (segs_exon22.shape[0] > 0)

    if sp.all(segs_exon11 == segs_exon21):
        seg_exon_const = segs_exon11
        seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
        seg_const = sp.intersect1d(segs_exon12, segs_exon22)
    elif sp.all(segs_exon12 == segs_exon22):
        seg_exon_const = segs_exon12
        seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
        seg_const = sp.intersect1d(segs_exon21, segs_exon11)
    else:
        print >> sys.stderr, "ERROR: both exons differ in alt prime event in verify_alt_prime"
        sys.exit(1)
    seg_const = sp.r_[seg_exon_const, seg_const]

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_diff_cov
    info[1] = sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(
        seg_lens[seg_diff])
    # exon_const_cov
    info[2] = sp.sum(counts_segments[seg_const] *
                     seg_lens[seg_const]) / sp.sum(seg_lens[seg_const])

    if info[1] >= CFG['alt_prime']['min_diff_rel_cov'] * info[2]:
        verified[0] = 1

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [segs_exon11[-1], segs_exon12[0]], segs.seg_edges.shape))[0]
    assert (idx.shape[0] > 0)
    info[3] = counts_edges[idx, 1]
    # intron2_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index(
        [segs_exon21[-1], segs_exon22[0]], segs.seg_edges.shape))[0]
    assert (idx.shape[0] > 0)
    info[4] = counts_edges[idx, 1]

    if min(info[3], info[4]) >= CFG['alt_prime']['min_intron_count']:
        verified[1] = 1

    return (verified, info)
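The segment arithmetic in verify_alt_prime also comes down to set operations: the segment unique to one isoform is a set difference (tried in both directions, since either isoform may hold the extra segment), and the shared segments are an intersection. A minimal sketch with invented segment IDs:

import numpy as np

segs_exon12 = np.array([4, 5, 6])                # exon-2 segments, isoform 1
segs_exon22 = np.array([5, 6])                   # exon-2 segments, isoform 2

seg_diff = np.setdiff1d(segs_exon12, segs_exon22)     # -> [4]
if seg_diff.shape[0] == 0:                            # difference may sit on the
    seg_diff = np.setdiff1d(segs_exon22, segs_exon12) # other isoform instead
seg_const = np.intersect1d(segs_exon12, segs_exon22)  # -> [5, 6]
print(seg_diff, seg_const)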
Example #48
0
File: core.py Project: mdurante1/scLVM
	def varianceDecomposition(self,K=None,tech_noise=None,idx=None,i0=None,i1=None,max_iter=10,verbose=False, cache=True):
		"""
		Args:
			K:				list of random effects to be considered in the analysis
			idx:			indices of the genes to be considered in the analysis
			i0:				gene index from which the analysis starts
			i1:				gene index to which the analysis stops
			max_iter:		maximum number of random restarts
			verbose:		if True, print progress
		"""

		if tech_noise!=None:		self.set_tech_noise(tech_noise)
		assert self.tech_noise!=None, 'scLVM:: specify technical noise'
		assert K!=None, 'scLVM:: specify K'

		if type(K)!=list:	K = [K]
		for k in K:
			assert k.shape[0]==self.N, 'scLVM:: K dimension mismatch'
			assert k.shape[1]==self.N, 'scLVM:: K dimension mismatch'

		if idx==None:
			if i0==None or i1==None:
				i0 = 0; i1 = self.G
			idx = SP.arange(i0,i1)
		elif type(idx)!=SP.ndarray:
			idx = SP.array(idx)
		idx = SP.intersect1d(SP.array(idx),SP.where(self.Y.std(0)>0)[0]) #only makes sense if gene is expressed in at least one cell
		_G	 = len(idx)
		var	= SP.zeros((_G,len(K)+2))
		_idx   = SP.zeros(_G)
		geneID = SP.zeros(_G,dtype=str)
		conv   = SP.zeros(_G)==1
		Ystar  = [SP.zeros((self.N,_G)) for i in range(len(K))]
		count  = 0
		Yidx = self.Y[:,idx]
		Ystd = Yidx-Yidx.mean(0) 
		Ystd/= Yidx.std(0) #delta optimization might be more efficient
		tech_noise = self.tech_noise[idx]/SP.array(Yidx.std(0))**2

		for ids in range(_G):
			if verbose:
				print '.. fitting gene %d'%ids
			# extract a single gene
			y = Ystd[:,ids:ids+1]
			# build and fit variance decomposition model
			vc= VAR.VarianceDecomposition(y)
			vc.addFixedEffect()
			for k in K:
				vc.addRandomEffect(k)
			vc.addRandomEffect(SP.eye(self.N))
			vc.addRandomEffect(SP.eye(self.N))
			vc.vd.getTerm(len(K)+1).getKcf().setParamMask(SP.zeros(1))
			for iter_i in range(max_iter):
				scales0 = y.std()*SP.randn(len(K)+2)
				scales0[len(K)+1]=SP.sqrt(tech_noise[ids]);
				_conv = vc.optimize(scales0=scales0)
				if _conv: break
				
			conv[count] = _conv
			if not _conv:
				var[count,-2] = SP.maximum(0,y.var()-tech_noise[ids])
				var[count,-1] = tech_noise[ids]
				count+=1;
				continue
			_var = vc.getVarianceComps()[0,:]
			KiY = vc.gp.agetKEffInvYCache().ravel()
			for ki in range(len(K)):
				Ystar[ki][:,count]=_var[ki]*SP.dot(K[ki],KiY)
			var[count,:] = _var
			count+=1;
		if self.geneID!=None:	geneID = SP.array(self.geneID)[idx]
		col_header = ['hidden_%d'%i for i in range(len(K))]
		col_header.append('biol_noise')
		col_header.append('tech_noise')
		col_header = SP.array(col_header)

		#annotate column and rows of var and Ystar
		var_info = {'gene_idx':idx,'col_header':col_header,'conv':conv}
		if geneID!=None:	var_info['geneID'] = SP.array(geneID)
		Ystar_info = {'gene_idx':idx,'conv':conv}
		if geneID!=None:	Ystar_info['geneID'] = SP.array(geneID)

		# cache stuff
		if cache == True:
			self.var   = var
			self.Ystar = Ystar
			self.var_info   = var_info
			self.Ystar_info = Ystar_info
		else:
			return var, var_info
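
A minimal usage sketch (hypothetical names: sclvm is an scLVM instance over N cells, Kcc an (N, N) cell-cycle covariance matrix, tech a per-gene technical-noise vector), relying on the cache=True default to store results on the object:

sclvm.varianceDecomposition(K=[Kcc], tech_noise=tech, i0=0, i1=100)
var, var_info = sclvm.var, sclvm.var_info
var_frac = var / var.sum(1)[:, SP.newaxis]  # per-gene variance fractions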
Example #49
File: core.py  Project: mdurante1/scLVM
	def fitLMM(self,expr = None,K=None,tech_noise=None,idx=None,i0=None,i1=None,verbose=False, recalc=True, standardize=True):
		"""
		Args:
			K:				list of random effects to be considered in the analysis;
							if K is None, no random effect is considered
			expr:			correlations are calculated between the gene expression data (self.Y) and the measures provided in expr; if None, self.Y is used
			idx:			indices of the genes to be considered in the analysis
			i0:				gene index from which the analysis starts
			i1:				gene index at which the analysis stops
			verbose:		if True, print progress
			recalc:			if True, re-do the variance decomposition
			standardize:	if True, also standardize the expression data in expr
		Returns:
			pv:				matrix of p-values
			beta:			matrix of correlations
			info:			dictionary annotating the rows and columns of pv and beta, containing
							gene_idx_row:	index of the genes in rows
							conv:			boolean vector marking genes for which the variance decomposition has converged
							gene_row:		gene annotations for the rows of the matrices
		"""
		

		if idx is None:
			if i0 is None or i1 is None:
				i0 = 0; i1 = self.G
			idx = SP.arange(i0,i1)
		elif type(idx)!=SP.ndarray:
			idx = SP.array(idx)
		idx = SP.intersect1d(idx,SP.where(self.Y.std(0)>0)[0]) #only makes sense if gene is expressed in at least one cell

		
		if K is not None:
			if type(K)!=list:	K = [K]
			if recalc and (len(K)>1 or self.var is None):
				print 'performing variance decomposition first...'
				var_raw,var_info = self.varianceDecomposition(K=K,idx=idx, cache=False)
				var = var_raw/var_raw.sum(1)[:,SP.newaxis]
			elif not recalc and len(K)>1:
				assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition must be run prior to this method'
				warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decomposition was performed earlier.')
				var_raw = self.var
				var_info = self.var_info
				var = var_raw/var_raw.sum(1)[:,SP.newaxis]
		
		lmm_params = {'covs':SP.ones([self.N,1]),'NumIntervalsDeltaAlt':100,'NumIntervalsDelta0':100,'searchDelta':True}
				
				
		Yidx = self.Y[:,idx]
		Ystd = Yidx-Yidx.mean(0)
		Ystd/= Yidx.std(0) #delta optimization might be more efficient
		
		if expr is None:
			expr = Ystd
		elif standardize:
			exprStd = expr-expr.mean(0)
			exprStd/= expr.std(0)
			expr = exprStd

		_G1	  = idx.shape[0]
		_G2	 = expr.shape[1]

		geneID = SP.zeros(_G1,dtype=str)
		
		beta   = SP.zeros((_G1,_G2))
		pv	 = SP.zeros((_G1,_G2))
		count  = 0
		
		for ids in range(_G1):
			if verbose:
				print '.. fitting gene %d'%ids
			# extract a single gene
			if K is not None:
				if len(K)>1:
					if var_info['conv'][count]:
						_K = SP.sum([var[count,i]*K[i] for i in range(len(K))],0)
						_K/= _K.diagonal().mean()
					else:
						_K = None
				else:
					_K = K[0]
			else:
				_K = None
			lm = QTL.test_lmm(expr,Ystd[:,ids:ids+1],K=_K,**lmm_params)
			pv[count,:]   = lm.getPv()[0,:]
			beta[count,:] = lm.getBetaSNP()[0,:]
			count+=1

		if self.geneID is not None:   geneID = SP.array(self.geneID)[idx]
		if recalc and K is not None and len(K)>1:
			info = {'conv':var_info['conv'],'gene_idx_row':idx}
		else:
			info = {'gene_idx_row':idx}
		if self.geneID is not None:	info['gene_row'] = geneID

		return pv, beta, info
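
A matching usage sketch (same hypothetical names as above), run after the varianceDecomposition call so that self.var is populated:

pv, beta, info = sclvm.fitLMM(K=[Kcc], i0=0, i1=100, verbose=True)
top_hits = pv.argmin(1)  # column index of the strongest correlate per gene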
Example #50
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01,
                                     skip_coordination=False,
                                     debug=False):
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' %
          (genotype_file, reference_genotype_file))

    from plinkio import plinkfile
    plinkf = plinkfile.PlinkFile(genotype_file)

    # Loads only the individuals...
    plinkf_dict = plinkfiles.get_phenotypes(plinkf)

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing validation bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    if debug:
        print('Parsing LD reference bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()

    chr_dict_ref = plinkfiles.get_chrom_dict(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    assert 'iids' not in hdf5_file, 'Something is wrong with the HDF5 file: individual IDs are already present.'
    if plinkf_dict['has_phenotype']:
        hdf5_file.create_dataset('y', data=plinkf_dict['phenotypes'])

    hdf5_file.create_dataset('fids',
                             data=sp.array(plinkf_dict['fids'],
                                           dtype=util.fids_dtype))
    hdf5_file.create_dataset('iids',
                             data=sp.array(plinkf_dict['iids'],
                                           dtype=util.iids_dtype))
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(plinkf_dict['num_individs'])
    num_common_snps = 0
    # corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print('Coordinating data for chromosome %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            print(err_str)
            print('Did not find chromosome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if debug:
            print(
                'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.'
                % (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        if debug:
            print(
                'Found %d SNPs on chrom %d that were common across all datasets'
                % (len(common_sids), chrom))

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])

        # order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        # order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])

        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        if debug:
            print(
                'Nucleotide concordance counts out of %d genotypes: g-rg: %d, g-ss: %d, rg-ss: %d'
                % (len(g_snp_map), g_rg_nt_concord_count,
                   g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in zip(g_snp_map, rg_snp_map, ss_snp_map):

            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[
                ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            if not skip_coordination:

                rg_nt = rg_nts[rg_i]
                ss_nt = ss_nts[ss_i]

                # Is the nucleotide ambiguous.
                g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                # Sanity check: are both alleles valid nucleotide codes?
                if (not g_nt[0] in util.valid_nts) or (not g_nt[1]
                                                       in util.valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

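                # Map the genotype alleles to the complementary strand
                # (A<->T, C<->G) so SNPs reported on opposite strands can
                # still be matched below.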
                os_g_nt = sp.array([
                    util.opp_strand_dict[g_nt[0]],
                    util.opp_strand_dict[g_nt[1]]
                ])

                flip_nts = False
                if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and
                        (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                    if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                        flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0]
                                    == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and
                                                     os_g_nt[0] == ss_nt[1])
                        # Try flipping the SS nt
                        if flip_nts:
                            betas[ss_i] = -betas[ss_i]
                            log_odds[ss_i] = -log_odds[ss_i]
                            if 'freqs' in ssg:
                                ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                        else:
                            if debug:
                                print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                      (g_sids[g_i], ss_sids[ss_i], g_i,
                                       ss_i, str(g_nt), str(ss_nt)))
                            num_non_matching_nts += 1
                            tot_num_non_matching_nts += 1
                            continue

                    else:
                        # Nucleotides match neither directly nor on the
                        # opposite strand; drop this SNP.
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)

        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' %
                  num_non_matching_nts)
            print('%d SNPs were retained on chromosome %d.' %
                  (len(ok_indices['g']), chrom))

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(genotype_file,
                                                      snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices_ref = snp_indices_ref[ok_indices['rg']]
        raw_ref_snps, freqs_ref = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices_ref)

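        # Under Hardy-Weinberg equilibrium a 0/1/2 genotype with allele
        # frequency f has mean 2f and standard deviation sqrt(2f(1-f));
        # these moments are stored so genotypes can be standardized downstream.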
        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(
                ss_freqs - (1 - freqs)) > 0.15  #Array of np.bool values
            if sp.any(freq_discrepancy_snp):
                print(
                    'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample'
                    % sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "Problems when filtering SNPs with low minor allele frequencies"
        if maf_filter_sum < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

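        # Crude per-chromosome polygenic score: log-odds weights dotted with
        # the raw genotypes; accumulated over chromosomes and, in debug mode,
        # correlated with the phenotype below.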
        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if debug and plinkf_dict['has_phenotype']:
            maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'],
                                       maf_adj_prs)[0, 1]
            print(
                'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f'
                % (chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    # if l[0] in sid_set:
                    #     genetic_map.append(l[0])
        else:
            genetic_map = None

        coord_data_dict = {
            'chrom': 'chrom_%d' % chrom,
            'raw_snps_ref': raw_ref_snps,
            'snp_stds_ref': snp_stds_ref,
            'snp_means_ref': snp_means_ref,
            'freqs_ref': freqs_ref,
            'ps': ps,
            'positions': positions,
            'nts': nts,
            'sids': sids,
            'genetic_map': genetic_map,
            'betas': betas,
            'log_odds': log_odds,
            'log_odds_prs': maf_adj_prs,
            'raw_snps_val': raw_snps,
            'snp_stds_val': snp_stds,
            'snp_means_val': snp_means,
            'freqs_val': freqs
        }

        write_coord_data(cord_data_g, coord_data_dict)
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)

    # Now calculate the prediction r^2
    if debug and plinkf_dict['has_phenotype']:
        maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'],
                                   maf_adj_risk_scores)[0, 1]
        print(
            'Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)'
            % (maf_adj_corr, maf_adj_corr**2))
    print(
        'Overall nucleotide concordance counts: g_rg: %d, g_ss: %d, rg_ss: %d'
        % (tot_g_rg_nt_concord_count, tot_g_ss_nt_concord_count,
           tot_rg_ss_nt_concord_count))
    print('There were %d SNPs in common' % num_common_snps)
    print('In all, %d SNPs were excluded due to nucleotide issues.' %
          tot_num_non_matching_nts)
    print('Done!')
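
A hypothetical invocation sketch (the paths and HDF5 handle are placeholders; the function assumes an earlier step already wrote a 'sum_stats' group into the file, since it reads hdf5_file['sum_stats']):

import h5py

h5f = h5py.File('coord_data.hdf5', 'a')  # hypothetical output file
coordinate_genotypes_ss_w_ld_ref(genotype_file='validation_plink_prefix',
                                 reference_genotype_file='ld_ref_plink_prefix',
                                 hdf5_file=h5f,
                                 check_mafs=True,
                                 min_maf=0.01,
                                 debug=True)
h5f.close()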