def test_multiphase_partition_coef(self):
    m = op.phases.MultiPhase(network=self.net,
                             phases=[self.water, self.air, self.oil])
    x, y, z = self.net["pore.coords"].T
    ps_water = self.net.Ps[(y <= 3) + (y >= 8)]
    ps_air = self.net.Ps[(y > 3) * (y < 6)]
    ps_oil = self.net.Ps[(y >= 6) * (y < 8)]
    # Phase arrangement (y-axis): W | A | O | W
    m.set_occupancy(phase=self.water, pores=ps_water)
    m.set_occupancy(phase=self.air, pores=ps_air)
    m.set_occupancy(phase=self.oil, pores=ps_oil)
    const = op.models.misc.constant
    K_air_water = 2.0
    K_air_oil = 1.8
    K_water_oil = 0.73
    m.set_binary_partition_coef(propname="throat.partition_coef",
                                phases=[self.air, self.water],
                                model=const, value=K_air_water)
    m.set_binary_partition_coef(propname="throat.partition_coef",
                                phases=[self.air, self.oil],
                                model=const, value=K_air_oil)
    m.set_binary_partition_coef(propname="throat.partition_coef",
                                phases=[self.water, self.oil],
                                model=const, value=K_water_oil)
    K_aw = m["throat.partition_coef.air:water"]
    K_ao = m["throat.partition_coef.air:oil"]
    K_wo = m["throat.partition_coef.water:oil"]
    K_global = m["throat.partition_coef.all"]
    assert sp.isclose(K_aw.mean(), K_air_water)
    assert sp.isclose(K_ao.mean(), K_air_oil)
    assert sp.isclose(K_wo.mean(), K_water_oil)
    # Get water-air interface throats
    tmp1 = self.net.find_neighbor_throats(ps_water, mode="xor")
    tmp2 = self.net.find_neighbor_throats(ps_air, mode="xor")
    Ts_water_air_interface = sp.intersect1d(tmp1, tmp2)
    # Get air-oil interface throats
    tmp1 = self.net.find_neighbor_throats(ps_air, mode="xor")
    tmp2 = self.net.find_neighbor_throats(ps_oil, mode="xor")
    Ts_air_oil_interface = sp.intersect1d(tmp1, tmp2)
    # Get oil-water interface throats
    tmp1 = self.net.find_neighbor_throats(ps_oil, mode="xor")
    tmp2 = self.net.find_neighbor_throats(ps_water, mode="xor")
    Ts_oil_water_interface = sp.intersect1d(tmp1, tmp2)
    # K_global for water-air interface must be 1/K_air_water
    assert sp.isclose(K_global[Ts_water_air_interface].mean(), 1 / K_air_water)
    # K_global for air-oil interface must be K_air_oil (not 1/K_air_oil)
    assert sp.isclose(K_global[Ts_air_oil_interface].mean(), K_air_oil)
    # K_global for oil-water interface must be 1/K_water_oil
    assert sp.isclose(K_global[Ts_oil_water_interface].mean(), 1 / K_water_oil)
    # K_global for single-phase regions must be 1.0
    interface_throats = sp.hstack((Ts_water_air_interface,
                                   Ts_air_oil_interface,
                                   Ts_oil_water_interface))
    Ts_single_phase = sp.setdiff1d(self.net.Ts, interface_throats)
    assert sp.isclose(K_global[Ts_single_phase].mean(), 1.0)
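A standalone sketch of the interface-throat idiom used in the test above: the throats on the boundary between two pore sets are the intersection of each set's exclusive ("xor") neighbor throats. The toy connectivity array and helper below are hypothetical stand-ins for the OpenPNM network calls.

import numpy as np

# conns[t] = (pore_i, pore_j) for throat t of a toy 4-pore chain 0-1-2-3
conns = np.array([[0, 1], [1, 2], [2, 3]])

def neighbor_throats_xor(pores):
    # throats with exactly one end inside `pores` (OpenPNM's mode='xor')
    return np.flatnonzero(np.isin(conns, pores).sum(axis=1) == 1)

ps_a = np.array([0, 1])  # phase A occupies pores 0 and 1
ps_b = np.array([2, 3])  # phase B occupies pores 2 and 3
interface = np.intersect1d(neighbor_throats_xor(ps_a), neighbor_throats_xor(ps_b))
print(interface)  # -> [1], the single A|B boundary throat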
def subsetsWithFits(fileNumString, onlyNew=False):
    """
    Find data subsets (N) that have models that have been fit to all conditions.

    onlyNew (False)     : Optionally include only subsets that have fits that
                          are not included in the current combined fitProbs.
    """
    fpd = loadFitProbData(fileNumString)
    saveFilename = fpd.values()[0]['saveFilename']
    Nlist = []
    for N in scipy.sort(fpd.keys()):
        # find models that have been fit to all conditions
        if len(fpd[N]['fitProbDataList']) == 1:
            fitModels = fpd[N]['fitProbDataList'][0]['logLikelihoodDict'].keys()
        else:
            # scipy.intersect1d takes exactly two arrays, so fold it over the
            # per-condition model lists (the original passed a single list of
            # lists, which raises a TypeError, and iterated a differently
            # named 'fittingProblemList'; the same list is used here)
            fitModels = reduce(scipy.intersect1d,
                               [fp['logLikelihoodDict'].keys()
                                for fp in fpd[N]['fitProbDataList']])
        if onlyNew:
            Nfilename = directoryPrefixNonly(fileNumString, N) + '/' + saveFilename
            fileExists = os.path.exists(Nfilename)
            if not fileExists:
                # no combined file exists
                if len(fitModels) > 0:
                    Nlist.append(N)
            else:
                # check which fit models are currently included in the saved file
                fpMultiple = load(Nfilename)
                fitModelsSaved = fpMultiple.logLikelihoodDict.keys()
                if len(scipy.intersect1d(fitModels, fitModelsSaved)) < len(fitModels):
                    Nlist.append(N)
        else:
            if len(fitModels) > 0:
                Nlist.append(N)
    return Nlist
def eval_func(self, gean):
    line_list = self.line_list.copy()
    count = 0
    for i, empty in enumerate(self.empty_list):
        for j in empty[1]:
            line_list[i][j] = gean[count]
            count += 1
    row_list = sci.array([line_list[:, 0], line_list[:, 1], line_list[:, 2],
                          line_list[:, 3], line_list[:, 4], line_list[:, 5],
                          line_list[:, 6], line_list[:, 7], line_list[:, 8]])
    block_list = sci.array([
        sci.append(line_list[0:3, 0:1], [line_list[0:3, 1:2], line_list[0:3, 2:3]]),
        sci.append(line_list[0:3, 3:4], [line_list[0:3, 4:5], line_list[0:3, 5:6]]),
        sci.append(line_list[0:3, 6:7], [line_list[0:3, 7:8], line_list[0:3, 8:9]]),
        sci.append(line_list[3:6, 0:1], [line_list[3:6, 1:2], line_list[3:6, 2:3]]),
        sci.append(line_list[3:6, 3:4], [line_list[3:6, 4:5], line_list[3:6, 5:6]]),
        sci.append(line_list[3:6, 6:7], [line_list[3:6, 7:8], line_list[3:6, 8:9]]),
        sci.append(line_list[6:9, 0:1], [line_list[6:9, 1:2], line_list[6:9, 2:3]]),
        sci.append(line_list[6:9, 3:4], [line_list[6:9, 4:5], line_list[6:9, 5:6]]),
        sci.append(line_list[6:9, 6:7], [line_list[6:9, 7:8], line_list[6:9, 8:9]])
    ])
    value = 0
    """
    Walk over every column, row, and block, scoring one point per distinct
    digit it contains. A completed puzzle should score 9 x 9 x 3 = 243
    (for a 9 x 9 grid).
    """
    # columns
    for line in line_list:
        value += len(sci.intersect1d(line, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    # rows
    for row in row_list:
        value += len(sci.intersect1d(row, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    # blocks
    for block in block_list:
        value += len(sci.intersect1d(block, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    return value
def intersect_rows(array1, array2, index=None):
    """Return intersection of rows"""

    if (array1.shape[0] == 0):
        if index == True:
            return (array1, sp.zeros((0,)), sp.zeros((0,)))
        else:
            return array1
    if (array2.shape[0] == 0):
        if index == True:
            return (array2, sp.zeros((0,)), sp.zeros((0,)))
        else:
            return array2

    array1_v = array1.view([('', array1.dtype)] * array1.shape[1])
    array2_v = array2.view([('', array2.dtype)] * array2.shape[1])
    array_i = sp.intersect1d(array1_v, array2_v)

    if index == True:
        a1_i = sp.where(sp.in1d(array1_v, array_i))[0]
        a2_i = sp.where(sp.in1d(array2_v, array_i))[0]
        return (array_i.view(array1.dtype).reshape(array_i.shape[0], array1.shape[1]), a1_i, a2_i)
    else:
        return array_i.view(array1.dtype).reshape(array_i.shape[0], array1.shape[1])
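A minimal usage sketch for intersect_rows, with plain numpy standing in for the sp alias and made-up test arrays; the row-as-structured-record view is what lets the 1-D set operation work row-wise.

import numpy as np

a = np.array([[1, 2], [3, 4], [5, 6]])
b = np.array([[3, 4], [5, 6], [7, 8]])

a_v = a.view([('', a.dtype)] * a.shape[1])  # each row becomes one record
b_v = b.view([('', b.dtype)] * b.shape[1])
common = np.intersect1d(a_v, b_v).view(a.dtype).reshape(-1, a.shape[1])
print(common)  # -> [[3 4]
               #     [5 6]]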
def eval_func(line_list, empty_list, geanSize):
    # # gean = [0,0,1,0,1,1,1,0,1,1]
    line_list, empty_list, kouho_list, geanSize = nimotu_init()
    print(sorted(set(list(iter.permutations(kouho_list, 3)))))
    count = 0
    for i, empty in enumerate(empty_list):
        for j in empty[1]:
            cell = ""
            for k in geanSize[count:count + 4]:
                cell += str(int(k))
            if (0 < int(cell, 2) and int(cell, 2) < 10):
                line_list[i][j] = int(cell, 2)
            count += 4
    row_list = sci.array([line_list[:, 0], line_list[:, 1], line_list[:, 2],
                          line_list[:, 3], line_list[:, 4], line_list[:, 5],
                          line_list[:, 6], line_list[:, 7], line_list[:, 8]])
    block_list = sci.array([
        sci.append(line_list[0:3, 0:1], [line_list[0:3, 1:2], line_list[0:3, 2:3]]),
        sci.append(line_list[0:3, 3:4], [line_list[0:3, 4:5], line_list[0:3, 5:6]]),
        sci.append(line_list[0:3, 6:7], [line_list[0:3, 7:8], line_list[0:3, 8:9]]),
        sci.append(line_list[3:6, 0:1], [line_list[3:6, 1:2], line_list[3:6, 2:3]]),
        sci.append(line_list[3:6, 3:4], [line_list[3:6, 4:5], line_list[3:6, 5:6]]),
        sci.append(line_list[3:6, 6:7], [line_list[3:6, 7:8], line_list[3:6, 8:9]]),
        sci.append(line_list[6:9, 0:1], [line_list[6:9, 1:2], line_list[6:9, 2:3]]),
        sci.append(line_list[6:9, 3:4], [line_list[6:9, 4:5], line_list[6:9, 5:6]]),
        sci.append(line_list[6:9, 6:7], [line_list[6:9, 7:8], line_list[6:9, 8:9]])
    ])
    # print(line_list)
    value = 0
    for line in line_list:
        value += len(sci.intersect1d(line, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        # value -= len(sci.where(line == 0))
    for row in row_list:
        value += len(sci.intersect1d(row, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        # value -= len(sci.where(row == 0))
    for block in block_list:
        value += len(sci.intersect1d(block, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
        # value -= len(sci.where(block == 0))
    # print ("value:" + str(value))
    return value
def plot_overlap_ps(result_file,
                    ss_file='/Users/bjarnivilhjalmsson/data/GIANT/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
                    fig_filename='/Users/bjarnivilhjalmsson/data/tmp/manhattan_combPC_HGT.png',
                    method='combPC',
                    ylabel='Comb. PC (HIP,WC,HGT,BMI) $-log_{10}(P$-value$)$',
                    xlabel='Height $-log_{10}(P$-value$)$',
                    p_thres=0.00001):
    # Parse results and SS file
    res_table = pandas.read_table(result_file)
    ss_table = pandas.read_table(ss_file)
    # Parse
    res_sids = sp.array(res_table['SNPid'])
    if method == 'MVT':
        comb_ps = sp.array(res_table['pval'])
    elif method == 'combPC':
        comb_ps = sp.array(res_table['combPC'])
    if 'MarkerName' in ss_table.keys():
        ss_sids = sp.array(ss_table['MarkerName'])
    elif 'SNP' in ss_table.keys():
        ss_sids = sp.array(ss_table['SNP'])
    else:
        raise Exception("Don't know where to look for rs IDs")
    marg_ps = sp.array(ss_table['p'])

    # Filtering boring p-values
    res_p_filter = comb_ps < p_thres
    res_sids = res_sids[res_p_filter]
    comb_ps = comb_ps[res_p_filter]
    # ss_p_filter = marg_ps<p_thres
    # ss_sids = ss_sids[ss_p_filter]
    # marg_ps = marg_ps[ss_p_filter]

    common_sids = sp.intersect1d(res_sids, ss_sids)
    print 'Found %d SNPs in common' % (len(common_sids))
    ss_filter = sp.in1d(ss_sids, common_sids)
    res_filter = sp.in1d(res_sids, common_sids)

    ss_sids = ss_sids[ss_filter]
    res_sids = res_sids[res_filter]
    marg_ps = marg_ps[ss_filter]
    comb_ps = comb_ps[res_filter]

    print 'Now sorting'
    ss_index = sp.argsort(ss_sids)
    res_index = sp.argsort(res_sids)

    marg_ps = -sp.log10(marg_ps[ss_index])
    comb_ps = -sp.log10(comb_ps[res_index])

    with plt.style.context('fivethirtyeight'):
        plt.plot(marg_ps, comb_ps, 'b.', alpha=0.2)
        (x_min, x_max) = plt.xlim()
        (y_min, y_max) = plt.ylim()
        plt.plot([x_min, x_max], [y_min, y_max], 'k--', alpha=0.2)
        plt.ylabel(ylabel)
        plt.xlabel(xlabel)
        plt.tight_layout()
        plt.savefig(fig_filename)
    plt.clf()
def match_samples(*sampleIDs):
    sampleID_common = sampleIDs[0]
    for sampleID in sampleIDs:
        sampleID_common = sp.intersect1d(sampleID, sampleID_common)
    idxs = []
    for sampleID in sampleIDs:
        _idxs = sp.array([sp.where(sampleID == sample)[0][0]
                          for sample in sampleID_common])
        assert (sampleID[_idxs] == sampleID_common).all()
        idxs.append(_idxs)
    return idxs
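A quick usage sketch for match_samples with made-up IDs (assuming sp is the usual scipy/numpy alias): each returned index array re-orders its input onto the sorted common ID set.

import numpy as np

ids_a = np.array(['s1', 's2', 's3', 's4'])
ids_b = np.array(['s3', 's1', 's5'])

idx_a, idx_b = match_samples(ids_a, ids_b)
print(ids_a[idx_a])  # -> ['s1' 's3']
print(ids_b[idx_b])  # -> ['s1' 's3'], aligned with ids_a[idx_a]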
def reduce(PSI_l, Xl, coverage_threshold):
    """
    Computes set cover reduction to get the most relevant samples that define
    the class Xl.
    :param PSI_l: (Nl x 2) matrix containing both the scale and the shape of
        the weibull distribution
    :param Xl: (Nl x dimension_feature_vector) matrix containing the feature
        vectors of each instance of a class
    :param coverage_threshold: Probability above which we consider an instance
        to be not representative enough of its class
    :return: The indexes of the most representative samples of a class
    """
    # This matrix D is symmetric
    D = ppp_cosine_similarity(Xl, Xl)
    # Number of instances of the class
    Nl = np.shape(D)[0]
    S = []
    for i in range(Nl):
        Si = []
        for j in range(Nl):
            if (psi_i_dist(D[i, j], PSI_l[i, 0], PSI_l[i, 1]) >= coverage_threshold):
                # Sample i is redundant with respect to j
                Si.append(j)
        S.append(Si)
    # Universe
    U = list(range(0, Nl))
    # Covered indexes
    C = []
    # Final indexes
    I = []
    # Set Cover Implementation: greedily pick the set covering the most
    # not-yet-covered elements
    while (len(scipy.intersect1d(C, U)) != len(U)):
        # punct_ref tracks the best uncovered count seen this iteration
        punct_ref = 0
        # ind is the index of the set we will append to the cover
        ind = 0
        index_s = 0
        for s in S:
            punct = 0
            relative_inclusion = scipy.isin(s, C)
            for eleme in relative_inclusion:
                if not eleme:  # 'eleme is False' never matches numpy bools
                    punct += 1
            if (punct >= punct_ref):
                punct_ref = punct  # remember the running maximum (the
                                   # original never updated it, so the last
                                   # set was always chosen)
                ind = index_s
            index_s += 1
        C = scipy.union1d(C, S[ind])
        I.append(ind)
        # remove by position; S.remove(S[ind]) could drop an earlier duplicate
        del S[ind]
        if (len(S) == 0):
            break
    return I
def acceptableindices(self, list, min, max, datalimit):
    array = scipy.array(list)
    larger = scipy.where(array >= min)
    smaller = scipy.where(array <= max)
    goodarrayindices = scipy.intersect1d(larger[0], smaller[0])
    goodindices = goodarrayindices.tolist()
    # The current tilt series may need an extra index to make up the number
    while len(goodindices) < datalimit and len(goodindices) > 0 and len(list) >= datalimit:
        nextindex = goodindices[-1] + 1
        if nextindex not in range(0, len(list)):
            break
        goodindices.append(nextindex)
    return goodindices
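The range filter above can also be written in one pass; a small standalone comparison with synthetic values:

import numpy as np

values = np.array([0.5, 1.2, 3.7, 2.0, 9.9])
lo, hi = 1.0, 4.0

larger = np.where(values >= lo)[0]
smaller = np.where(values <= hi)[0]
print(np.intersect1d(larger, smaller))                  # -> [1 2 3]
print(np.flatnonzero((values >= lo) & (values <= hi)))  # same result, one pass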
def get_event_ids_from_gene(gene_id, event_type, outdir, confidence):
    eids = []
    IN = h5py.File(os.path.join(outdir, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, confidence)), 'r')
    if 'conf_idx' in IN and IN['conf_idx'].shape[0] > 0:
        cidx = IN['conf_idx'][:]
        gidx = sp.where(IN['gene_idx'][:] == gene_id)[0]
        eids.extend(sp.intersect1d(cidx, gidx))
    IN.close()
    return eids
def source_model_threshold_distance_subset(distances, source_model,
                                           atten_threshold_distance):
    """
    source_model_threshold_distance_subset

    Calculate the distances of the event_set from the sites array.
    For those events less than or equal to the attenuation threshold,
    return a subset source model so that calc_and_save_SA only works
    on those events.

    calc_and_save_SA calculates an SA figure by getting a subset of event
    indices:

        for source in source_model:
            event_inds = source.get_event_set_indexes()
            if len(event_inds) == 0:
                continue
            sub_event_set = event_set[event_inds]

    Returns source_model_subset
    """
    # A rethink of apply_threshold distance
    # Calculate the distances of the event_set from the sites array and
    # return an event_set where distance <= atten_threshold_distance
    Rjb = distances.distance('Joyner_Boore')

    # distances is an ndarray where [sites, events].  We only want the events
    # dimension for this function as we're trimming events
    (sites_to_keep, events_to_keep) = where(Rjb <= atten_threshold_distance)

    source_model_subset = copy.deepcopy(source_model)

    # Re-sync the event indices in the source model. As we don't want to add
    # events that may already be excluded by generate_synthetic_events_fault(),
    # do the following
    # 1. Grab the event set already calculated
    # 2. The intersection of this and events_to_keep is what we want
    for source in source_model_subset:
        source_indices = source.get_event_set_indexes()
        source.set_event_set_indexes(
            intersect1d(source_indices, events_to_keep))

    return source_model_subset
def find_connecting_throat(self, P1, P2):
    r"""
    Return the throat number connecting two given pores

    Parameters
    ----------
    P1 , P2 : int
        The pore numbers connected by the desired throat

    Returns
    -------
    Tnum : int
        Returns throat number, or empty array if pores are not connected

    Examples
    --------
    >>> pn = OpenPNM.Network.Cubic(name='doc_test').generate(divisions=[5, 5, 5], lattice_spacing=[1])
    >>> pn.find_connecting_throat(0, 1)
    array([0])
    """
    return sp.intersect1d(self.find_neighbor_throats(P1),
                          self.find_neighbor_throats(P2))
def find_connecting_throat(self, P1, P2):
    r"""
    Return the throat number connecting pairs of pores

    Parameters
    ----------
    P1 , P2 : array_like
        The pore numbers whose throats are sought.  These can be vectors
        of pore numbers, but must be the same length

    Returns
    -------
    Tnum : list of list of int
        Returns throat number(s), or empty array if pores are not connected

    Examples
    --------
    >>> import OpenPNM
    >>> pn = OpenPNM.Network.TestNet()
    >>> pn.find_connecting_throat([0, 1, 2], [2, 2, 2])
    [[], [3], []]

    TODO: This now works on 'vector' inputs, but is not actually vectorized
    in the Numpy sense, so could be slow with large P1, P2 inputs
    """
    P1 = self._parse_locations(P1)
    P2 = self._parse_locations(P2)
    Ts1 = self.find_neighbor_throats(P1, flatten=False)
    Ts2 = self.find_neighbor_throats(P2, flatten=False)
    Ts = []
    for row in range(0, len(P1)):
        if P1[row] == P2[row]:
            throat = []
        else:
            throat = sp.intersect1d(Ts1[row], Ts2[row]).tolist()
        # append in order (the original inserted at 0 and then reversed)
        Ts.append(throat)
    return Ts
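A standalone sketch of the per-pair lookup in the vectorized version above: for each pore pair, the connecting throat is the intersection of the two pores' neighbor-throat lists. The toy connectivity below is hypothetical.

import numpy as np

conns = np.array([[0, 1], [1, 2], [0, 2]])  # throat t connects pores conns[t]

def neighbor_throats(pore):
    return np.flatnonzero((conns == pore).any(axis=1))

for p1, p2 in [(0, 1), (1, 2), (0, 2)]:
    t = np.intersect1d(neighbor_throats(p1), neighbor_throats(p2))
    print(p1, p2, t.tolist())  # -> [0], then [1], then [2]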
fw = open('Data/NTotalGenesAndGeneSetInfoOfAlpha.tsv', 'w')
for Alpha in DataDict['AlphaLvls']:
    if (Alpha == 0.0):
        continue
    NTotalGenesOfAlpha = 0
    UniqGenes = []
    TraitSetAtAlpha = []
    for i in xrange(len(Traits)):
        GeneSetAtAlpha = DataDict[Traits[i]]['GeneSetAtAlpha_' + str(Alpha)]
        NTotalGenesOfAlpha += len(GeneSetAtAlpha)
        UniqGenes.extend(GeneSetAtAlpha)
        if (len(GeneSetAtAlpha) > 0):
            TraitSetAtAlpha.append(Traits[i])
    TraitSetAtAlpha = scipy.array(TraitSetAtAlpha)
    GWIntersection = scipy.intersect1d(ar1=TraitSetAtAlpha,
                                       ar2=GWSignTraits,
                                       assume_unique=False)
    GWMWIntersection = scipy.intersect1d(ar1=TraitSetAtAlpha,
                                         ar2=GWMWSignTraits,
                                         assume_unique=False)
    GWUnion = scipy.union1d(ar1=TraitSetAtAlpha, ar2=GWSignTraits)
    GWMWUnion = scipy.union1d(ar1=TraitSetAtAlpha, ar2=GWMWSignTraits)
    fw.write(str(Alpha)+'\t'+\
             str(NTotalGenesOfAlpha)+'\t'+\
             str(len(scipy.unique(scipy.array(UniqGenes))))+'\t'+\
             str(len(TraitSetAtAlpha))+'\t'+\
             str(len(GWSignTraits))+'\t'+\
             str(len(GWIntersection))+'\t'+\
             str(float(len(GWIntersection))/float(len(GWUnion)))+'\t'+\
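The last column written above is a Jaccard-style overlap (|intersection| / |union|) between two trait sets; a minimal standalone version with made-up labels:

import numpy as np

def jaccard(a, b):
    # |a intersect b| / |a union b| for 1-D label arrays
    return float(np.intersect1d(a, b).size) / float(np.union1d(a, b).size)

traits_at_alpha = np.array(['height', 'bmi', 'hip'])
gw_significant = np.array(['bmi', 'hip', 'wc'])
print(jaccard(traits_at_alpha, gw_significant))  # 2 shared / 4 total = 0.5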
def verify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    # [verified, info] = verify_exon_skip(event, fn_bam, cfg)

    # (0) valid, (1) exon_diff_cov, (2) exon_const_cov
    # (3) intron1_conf, (4) intron2_conf
    info = [1, 0, 0, 0, 0]
    verified = [0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)
    ### check validity of intron coordinates (only one side is differing)
    if (event.exons1[0, 1] != event.exons2[0, 1]) and (event.exons1[1, 0] != event.exons2[1, 0]):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon11 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    if idx_exon11.shape[0] == 0:
        segs_exon11 = sp.where((segs.segments[0, :] >= event.exons1[0, 0]) & (segs.segments[1, :] <= event.exons1[0, 1]))[0]
    else:
        segs_exon11 = sp.where(segs.seg_match[idx_exon11, :])[1]
    idx_exon12 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    if idx_exon12.shape[0] == 0:
        segs_exon12 = sp.where((segs.segments[0, :] >= event.exons1[1, 0]) & (segs.segments[1, :] <= event.exons1[1, 1]))[0]
    else:
        segs_exon12 = sp.where(segs.seg_match[idx_exon12, :])[1]
    idx_exon21 = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    if idx_exon21.shape[0] == 0:
        segs_exon21 = sp.where((segs.segments[0, :] >= event.exons2[0, 0]) & (segs.segments[1, :] <= event.exons2[0, 1]))[0]
    else:
        segs_exon21 = sp.where(segs.seg_match[idx_exon21, :])[1]
    idx_exon22 = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    if idx_exon22.shape[0] == 0:
        segs_exon22 = sp.where((segs.segments[0, :] >= event.exons2[1, 0]) & (segs.segments[1, :] <= event.exons2[1, 1]))[0]
    else:
        segs_exon22 = sp.where(segs.seg_match[idx_exon22, :] > 0)[1]

    assert(segs_exon11.shape[0] > 0)
    assert(segs_exon12.shape[0] > 0)
    assert(segs_exon21.shape[0] > 0)
    assert(segs_exon22.shape[0] > 0)

    if sp.all(segs_exon11 == segs_exon21):
        seg_exon_const = segs_exon11
        seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
        seg_const = sp.intersect1d(segs_exon12, segs_exon22)
    elif sp.all(segs_exon12 == segs_exon22):
        seg_exon_const = segs_exon12
        seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
        seg_const = sp.intersect1d(segs_exon21, segs_exon11)
    else:
        print >> sys.stderr, "ERROR: both exons differ in alt prime event in verify_alt_prime"
        sys.exit(1)
    seg_const = sp.r_[seg_exon_const, seg_const]

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_diff_cov
    info[1] = sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    # exon_const_cov
    info[2] = sp.sum(counts_segments[seg_const] * seg_lens[seg_const]) / sp.sum(seg_lens[seg_const])

    if info[1] >= CFG['alt_prime']['min_diff_rel_cov'] * info[2]:
        verified[0] = 1

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon11[-1], segs_exon12[0]], segs.seg_edges.shape))[0]
    assert(idx.shape[0] > 0)
    info[3] = counts_edges[idx, 1]
    # intron2_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon21[-1], segs_exon22[0]], segs.seg_edges.shape))[0]
    assert(idx.shape[0] > 0)
    info[4] = counts_edges[idx, 1]

    if min(info[3], info[4]) >= CFG['alt_prime']['min_intron_count']:
        verified[1] = 1

    return (verified, info)
def AssignLikelyContentsToSudokuGridLookupDictAndCheckPredictions(sudokuGridLookupDict, \
updatedColonyPurificationDict):

    from numpy import arange
    from scipy import intersect1d, unique
    import pdb
    from copy import deepcopy

    cpKeys = list(updatedColonyPurificationDict.keys())

    totalHopedForCorrect = 0
    totalPredictedWells = 0
    totalPredictionsCorrect = 0
    hopedForTransposonsInCollection = []

    # Go through the colony purification dict
    for key in cpKeys:
        line = updatedColonyPurificationDict[key]

        rowPool = line['rowPool']
        colPool = str(int(line['colPool']))
        pcPool = line['pcPool']
        prPool = line['prPool']

        # Takes the progenitor contents and expands them a little to allow for
        # slop in transposon position
        progenitorContentsExpanded = []
        progenitorContentsUnexpanded = []
        progenitorContentsUnexpandedLocatabilities = []

        i = 0
        while i < len(line['progenitorContents']):
            progCoord = line['progenitorContents'][i][0]
            progLocatability = line['progenitorContents'][i][1]
            progCoordExpanded = list(arange(progCoord - 1, progCoord + 2, 1, int))
            progenitorContentsExpanded += progCoordExpanded
            progenitorContentsUnexpanded.append(progCoord)
            progenitorContentsUnexpandedLocatabilities.append(progLocatability)
            i += 1

        hopedForCoord = int(line['hopedForTransposonCoord'])
        hopedForTransposonCoords = list(arange(hopedForCoord - 1, hopedForCoord + 2, 1, int))

        sudokuWell = sudokuGridLookupDict[prPool][pcPool].wellGrid[rowPool][colPool]
        sudokuWell.updateSimplifiedReadAlignmentCoords()
        sudokuWell.hasPredictionForContents = True
        sudokuWell.predictionsForContents = progenitorContentsExpanded
        sudokuWell.hopedForCoord = hopedForCoord
        sudokuWell.progenitorContents = progenitorContentsUnexpanded
        sudokuWell.progenitorLocatabilities = progenitorContentsUnexpandedLocatabilities
        sudokuWell.condensationType = line['condensationType']

        progenitorCol = line['progenitorCol']
        progenitorPlate = line['progenitorPlate']
        progenitorRow = line['progenitorRow']

        sudokuWell.addressDict['progenitor'] = {'plateName': progenitorPlate, \
        'row': progenitorRow, 'col': progenitorCol}

        # Intersect the predicted and hoped for transposon coords with the
        # coords that intersect with the well location
        intersectPredictionsContents = intersect1d(sudokuWell.predictionsForContents, \
        sudokuWell.simplifiedReadAlignmentCoords)

        intersectHopedForContents = intersect1d(hopedForTransposonCoords, \
        sudokuWell.simplifiedReadAlignmentCoords)

        # Make some new read alignment coords
        # Check to see if the predicted and hoped for coords are there
        if len(intersectPredictionsContents) > 0:
            sudokuWell.predictionCorrect = True
            groupedIntersectPredictionsContents = \
            GroupGenomicCoords(intersectPredictionsContents, maxGap=1)
            sudokuWell.simplifiedLikelyReadAlignmentCoords = groupedIntersectPredictionsContents
            totalPredictionsCorrect += 1

        if len(intersectHopedForContents) > 0:
            sudokuWell.hopedForPresent = True
            sudokuWell.predictionCorrect = True
            groupedIntersectHopedForContents = \
            GroupGenomicCoords(intersectHopedForContents, maxGap=1)
            sudokuWell.simplifiedLikelyReadAlignmentCoords = groupedIntersectHopedForContents
            hopedForTransposonsInCollection.append(hopedForCoord)
            totalHopedForCorrect += 1

        totalPredictedWells += 1

        newReadAlignmentCoords = []
        if sudokuWell.predictionCorrect == True:
            for likelyCoord in sudokuWell.simplifiedLikelyReadAlignmentCoords:
                for readAlignmentCoord in sudokuWell.readAlignmentCoords:
                    coord = readAlignmentCoord.coord
                    if likelyCoord - 3 <= coord <= likelyCoord + 3:
                        newReadAlignmentCoords.append(deepcopy(readAlignmentCoord))

        if sudokuWell.predictionCorrect and len(newReadAlignmentCoords) == 0:
            print('No coordinates found for well with correct prediction')
            pdb.set_trace()

        sudokuWell.readAlignmentCoords = newReadAlignmentCoords

    hopedForTransposonsInCollection = unique(hopedForTransposonsInCollection)

    print('Total Predicted Wells: ' + str(totalPredictedWells))
    print('Total Hoped for Correct: ' + str(totalHopedForCorrect))
    print('Total Predictions Correct: ' + str(totalPredictionsCorrect))
    print('Total Unique Hoped For Transposons: ' + str(len(hopedForTransposonsInCollection)))

    return
def load_data(CFG, is_Ens=True, gene_set='GOCB', het_only=True, het_onlyCB=True,
              pairs=False, filter_median=True, combine=False, filter_expressed=0):
    f = h5py.File(CFG['train_file'], 'r')
    Y = f['LogNcountsMmus'][:]
    labels = f['labels'][:].ravel()

    futil = h5py.File(CFG['util_file'], 'r')
    Y_util = futil['LogNcountsQuartz'][:]

    ftst = h5py.File(CFG['test_file'], 'r')

    if is_Ens == True:
        genes = f['EnsIds'][:]
        genes_util = futil['gene_names_all'][:]
    else:
        genes = SP.char.lower(f['sym_names'][:])
        genes_util = SP.char.lower(futil['sym_namesQ'][:])

    # test file
    labels_util = futil['phase_vecS'][:] * 2 + futil['phase_vecG2M'][:] * 3 + futil['phase_vecG1'][:]
    if CFG['util_file'] == CFG['test_file']:
        genes_tst = genes_util
        YT = ftst['LogNcountsQuartz'][:]
        labels_tst = ftst['phase_vecS'][:] * 2 + ftst['phase_vecG2M'][:] * 3 + ftst['phase_vecG1'][:]
    elif is_Ens == False:
        ftst = h5py.File(CFG['test_file'], 'r')
        YT = ftst['counts'][:]
        genes_tst = SP.char.lower(ftst['sym_names'][:])
        #genes_tst = ftst['ensIds'][:]
        #labels_tst = SP.array([1,1,1,1,1])#ftst['labels'][:].ravel()
        labels_tst = ftst['labels'][:].ravel()
    elif is_Ens == True:
        ftst = h5py.File(CFG['test_file'], 'r')
        YT = ftst['counts'][:]
        #genes_tst = ftst['sym_names'][:]
        genes_tst = ftst['ensIds'][:]
        #labels_tst = SP.array([1,1,1,1,1])#ftst['labels'][:].ravel()
        labels_tst = ftst['labels'][:].ravel()

    if 'class_labels' in ftst.keys():
        class_labels = ftst['class_labels'][:]
    else:
        class_labels = [i.astype('str') for i in labels_tst]
    class_labels = SP.sort(SP.unique(class_labels))

    heterogen_util = genes_util[SP.intersect1d(SP.where(Y_util.mean(0) > 0)[0],
                                               SP.where(futil['genes_heterogen'][:] == 1)[0])]
    heterogen_train = genes[SP.intersect1d(SP.where(Y.mean(0) > 0)[0],
                                           SP.where(f['genes_heterogen'][:] == 1)[0])]

    cellcyclegenes_GO = genes[SP.unique(f['cellcyclegenes_filter'][:].ravel() - 1)]  # idx of cell cycle genes
    cellcyclegenes_CB = genes[f['ccCBall_gene_indices'][:].ravel() - 1]  # idx of cell cycle genes ...

    if SP.any(gene_set == 'GOCB'):
        cc_ens = SP.union1d(cellcyclegenes_GO, cellcyclegenes_CB)
    elif SP.any(gene_set == 'GO'):
        cc_ens = cellcyclegenes_GO
    elif SP.any(gene_set == 'CB'):
        cc_ens = cellcyclegenes_CB
    elif SP.any(gene_set == 'all'):
        cc_ens = genes
    else:
        #assert(gene_set in CFG.keys()), str(gene_set + ' does not exist. Choose a different gene set.')
        cc_ens = gene_set

    if het_only == True:
        cc_ens = SP.intersect1d(cc_ens, heterogen_train)
    if pairs == True:
        Y = Y[:, SP.where(f['genes_heterogen'][:] == 1)[0]]
        genes = genes[SP.where(f['genes_heterogen'][:] == 1)[0]]
    if het_onlyCB == True:
        cc_ens = SP.intersect1d(cc_ens, heterogen_util)

    #filter_expressed = .2
    lod = 0
    if filter_expressed > 0:
        medY = SP.sum(Y > lod, 0) * 1.0
        idx_filter = (medY / SP.float_(Y.shape[0])) > filter_expressed
        Y = Y[:, idx_filter]
        genes = genes[idx_filter]
        #medY_tst = SP.sum(Y_tst>lod,0)
        #Y_tst = Y_tst[:,medY_tst>filter_expressed]
        #genes_tst = genes_tst[medY_tst>filter_expressed]
        medY_util = SP.sum(Y_util > lod, 0)
        idx_filter = (medY_util / SP.float_(Y_util.shape[0])) > filter_expressed
        Y_util = Y_util[:, idx_filter]
        genes_util = genes_util[idx_filter]

    cc_ens = SP.intersect1d(cc_ens, genes)
    cc_ens = SP.intersect1d(cc_ens, genes_tst)
    cc_ens = SP.intersect1d(cc_ens, genes_util)

    if combine == True:
        genes = list(genes)
        genes_util = list(genes_util)
        genes_intersect = SP.intersect1d(genes, genes_util)
        cidx_tr = [genes.index(x) for x in genes_intersect]
        cidx_util = [genes_util.index(x) for x in genes_intersect]
        genes = SP.array(genes)[cidx_tr]
        genes_util = SP.array(genes_util)[cidx_util]
        Y = SP.vstack([Y[:, cidx_tr], Y_util[:, cidx_util]])
        genes = genes_intersect
        labels = SP.hstack([labels, labels_util])

    Y_tst = YT
    cc_data = {}
    cc_data['cc_ens'] = cc_ens
    cc_data['labels_tst'] = labels_tst
    cc_data['labels'] = labels
    cc_data['genes_tst'] = genes_tst
    cc_data['genes'] = genes
    cc_data['Y'] = Y
    cc_data['Y_test'] = Y_tst
    cc_data['class_labels'] = class_labels

    return cc_data
def multidim_intersect(arr1, arr2):
    arr1_view = arr1.view([('', arr1.dtype)] * arr1.shape[1])
    arr2_view = arr2.view([('', arr2.dtype)] * arr2.shape[1])
    intersected = sp.intersect1d(arr1_view, arr2_view)
    return intersected.view(arr1.dtype).reshape(-1, arr1.shape[1])
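multidim_intersect is the same row-view trick as intersect_rows above, in compact form; a quick check with made-up 2-D points (assuming sp is the scipy/numpy alias used in the function):

import numpy as np

pts_a = np.array([[0, 0], [1, 2], [3, 4]])
pts_b = np.array([[1, 2], [5, 6]])
print(multidim_intersect(pts_a, pts_b))  # -> [[1 2]]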
               'REM.': os.path.join(BASEDIR, 'gtex_tables/GTex_rest_wo_cells_samples.txt')}
gt_dict = utils.get_gt_dict(sample_dict)

for event_type in event_types:
    picklefile = '%s/pca_skl_%s.TN.conf_%.2f.pickle' % (datadir, event_type, 1.0 - conf)
    hdf5file = '%s/pca_skl_%s.TN.conf_%.2f.hdf5' % (datadir, event_type, 1.0 - conf)
    if not os.path.exists(picklefile):
        ### get indices of confident events
        IN = h5py.File('%s/merge_graphs_%s_C3.counts.hdf5' % (basedir_icgc, event_type), 'r')
        c_idx = IN['conf_idx'][:].astype('int')
        IN.close()
        IN = h5py.File('%s/merge_graphs_%s_C3.counts.hdf5' % (basedir_gtex, event_type), 'r')
        c_idx_gt = IN['conf_idx'][:].astype('int')
        c_idx_gt = sp.intersect1d(c_idx_gt, c_idx)
        c_idx = sp.intersect1d(c_idx, c_idx_gt)
        IN.close()
        assert sp.all(c_idx == c_idx_gt)

        ### load TCGA data from hdf5
        print('Loading data from TCGA hdf5')
        IN = h5py.File('%s/merge_graphs_%s_C3.counts.hdf5' % (basedir_icgc, event_type), 'r')
        c_idx = IN['conf_idx'][:].astype('int')
        strains = sp.array([x.split('.')[0] for x in IN['strains'][:]], dtype='str')

        ### get psi values
        psi = sp.empty((IN['psi'].shape[0], c_idx.shape[0]), dtype='float')
        chunksize = IN['psi'].chunks[1] * 30
        cum = 0
        for c, chunk in enumerate(range(0, IN['psi'].shape[1], chunksize)):
                                            sample_size)]
            tmp[j, k] = sc.unique(sample).size
            #~ pdb.set_trace()
    alpha[i] = tmp.mean(axis=1)  # mean # spp per cell

    ## Per band, measure beta diversity
    tmp2 = sc.zeros((nrows, ncols))
    for j in range(nrows):
        #~ pdb.set_trace()
        for k in range(ncols):
            a = community[j, k][community[j, k] > 0]
            # cell on opposite side of mountain to `a`
            b = community[j, (k + 15) % 30][community[j, (k + 15) % 30] > 0]
            shared_spp = sc.intersect1d(a, b)
            probability = shared_spp.size / cell_abundances[j]
            tmp2[j, k] = probability
    #~ pdb.set_trace()
    beta[i] = tmp2.mean(axis=1)
    gamma[i] = nspp_per_band[-10:].mean(axis=0)
    band_areas = cell_areas * T_theta
    gamma_area[i] = gamma[i] / band_areas

# Delete rows with no data (file ID doesn't exist).
alpha = alpha[alpha.sum(axis=1) > 0]
beta = beta[beta.sum(axis=1) > 0]
mean_alpha = alpha.mean(axis=0)
def phenotype_correlations(request, q=None):
    """
    Return data for phenotype-phenotype correlations and between phenotype
    accession overlap
    ---
    produces:
        - application/json
    """
    # id string to list
    pids = map(int, q.split(","))
    pheno_dict = {}
    for i, pid in enumerate(pids):
        try:
            phenotype = Phenotype.objects.published().get(pk=pid)
        except:
            return Response({'message': 'FAILED', 'not_found': pid})
        pheno_acc_infos = phenotype.phenotypevalue_set.prefetch_related('obs_unit__accession')
        values = sp.array(pheno_acc_infos.values_list('value', flat=True))
        samples = sp.array(pheno_acc_infos.values_list('obs_unit__accession__id', flat=True))
        name = str(phenotype.name.replace("<i>", "").replace("</i>", "") +
                   " (" + str(phenotype.study.name) + ")")
        pheno_dict[str(phenotype.name) + "_" + str(phenotype.study.name) + "_" + str(i)] = {
            'samples': samples,
            'y': values,
            'name': name,
            'id': str(phenotype.id)
        }

    # compute correlation matrix
    corr_mat = sp.ones((len(pheno_dict), len(pheno_dict))) * sp.nan
    spear_mat = sp.ones((len(pheno_dict), len(pheno_dict))) * sp.nan

    pheno_keys = pheno_dict.keys()

    axes_data = []
    scatter_data = []
    sample_data = []
    slabels = {}
    for i, pheno1 in enumerate(pheno_keys):
        axes_data.append({"label": pheno_dict[pheno1]['name'],
                          "index": str(i),
                          "pheno_id": str(pheno_dict[pheno1]['id'])})
        samples1 = pheno_dict[pheno1]['samples']
        y1 = pheno_dict[pheno1]['y']
        # store scatter data
        scatter_data.append({"label": pheno_dict[pheno1]['name'],
                             "pheno_id": str(pheno_dict[pheno1]['id']),
                             "samples": samples1.tolist(),
                             "values": y1.tolist()})
        for j, pheno2 in enumerate(pheno_keys):
            samples2 = pheno_dict[pheno2]['samples']
            y2 = pheno_dict[pheno2]['y']
            # match accessions
            ind = (sp.reshape(samples1, (samples1.shape[0], 1)) == samples2).nonzero()
            y_tmp = y1[ind[0]]
            y2 = y2[ind[1]]
            if y1.shape[0] > 0 and y2.shape[0] > 0:
                corr_mat[i][j] = stats.pearsonr(y_tmp.flatten(), y2.flatten())[0]
                spear_mat[i][j] = stats.spearmanr(y_tmp.flatten(), y2.flatten())[0]
            # compute sample intersections
            if pheno1 == pheno2:
                continue
            if pheno1 + "_" + pheno2 in slabels:
                continue
            if pheno2 + "_" + pheno1 in slabels:
                continue
            slabels[pheno1 + "_" + pheno2] = True
            A = samples1.shape[0]
            B = samples2.shape[0]
            C = sp.intersect1d(samples1, samples2).shape[0]
            sample_data.append({"labelA": pheno_dict[pheno1]['name'],
                                "labelA_id": pheno_dict[pheno1]['id'],
                                "labelB": pheno_dict[pheno2]['name'],
                                "labelB_id": pheno_dict[pheno2]['id'],
                                "A": A, "B": B, "C": C})

    data = {}
    data['axes_data'] = axes_data
    data['scatter_data'] = scatter_data
    data['sample_data'] = sample_data
    data['corr_mat'] = str(corr_mat.tolist()).replace("nan", "NaN")
    data['spear_mat'] = str(spear_mat.tolist()).replace("nan", "NaN")

    if request.method == "GET":
        return Response(data)
    os.makedirs(run_dir)

# load data
f = h5py.File(CFG['data_file'], 'r')
Y = f['LogNcountsQuartz'][:]
tech_noise = f['LogVar_techQuartz_logfit'][:]
genes_het_bool = f['genes_heterogen'][:]  # index of heterogeneous(??!??) genes
geneID = f['gene_names_all'][:]  # gene names
cellcyclegenes_filter = SP.unique(f['ccGO_gene_indices'][:].ravel() - 1)  # idx of cell cycle genes
cellcyclegenes_filterCB600 = f['ccCBall_gene_indices'][:].ravel() - 1  # idx of cell cycle genes ...

# filter cell cycle genes
idx_cell_cycle = SP.union1d(cellcyclegenes_filter, cellcyclegenes_filterCB600)
Ymean2 = Y.mean(0) ** 2 > 0
idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle, SP.array(SP.where(Ymean2.ravel() > 0)))
Ycc = Y[:, idx_cell_cycle_noise_filtered]

# Fit GPLVM to data
k = 1  # number of latent factors
file_name = CFG['panama_file']  # name of the cache file
recalc = True  # recalculate X and Kconf
sclvm = scLVM(Y)
X, Kcc, varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered, k=1,
                                  out_dir='./cache', file_name=file_name, recalc=recalc)

# 3. load relevant dataset for analysis
genes_het = SP.array(SP.where(f['genes_heterogen'][:].ravel() == 1))  # considers only heterogeneous genes
Ihet = genes_het_bool == 1
Y = Y[:, Ihet]
fr = open("Yoshiko.csv", "r") YoshikoClusterDict = {} YoshikoClusters = [] fr.readline() for Line in fr: LSplit = Line.strip().split(",") YoshikoClusterDict[LSplit[0]] = LSplit[-1].split(";") YoshikoClusters.append(LSplit[0]) fr.close() fw = open("MtbClusterOverlappingGenes.csv", "w") fw.write("ClusterIndex,OverlappingGenes,Metabolites\n") for Cluster in YoshikoClusters: ClusterGeneDict = {} for Mtb in YoshikoClusterDict[Cluster]: ClusterGeneDict[Mtb] = HeinzGeneDict[Mtb] Cntr = 0 InterSectionArray = None CurrentArray = None for Value in ClusterGeneDict.itervalues(): if Cntr == 0: CurrentArray = Value InterSectionArray = scipy.intersect1d(CurrentArray, Value) Cntr += 1 fw.write( Cluster + "," + ";".join(InterSectionArray.tolist()) + "," + ";".join(YoshikoClusterDict[str(Cluster)]) + "\n" ) fw.close()
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01):
#     recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating things w genotype file: %s \nref. genot. file: %s' % (genotype_file, reference_genotype_file)
    plinkf = plinkfile.PlinkFile(genotype_file)

    # Loads only the individuals... (I think?)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype = True

    # Figure out chromosomes and positions.
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
#     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)

    # Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    # corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print 'Working on chromsome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromsome in SS dataset.'
            print 'Continuing.'
            continue

        ssg = ssf['chrom_%d' % chrom]
        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (len(g_sids), len(rg_sids), len(ss_sids))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i
        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i
        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])

        # order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        # order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])

        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
#         rg_nts_l = []
#         for nt in rg_nts_ok:
#             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
#         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print 'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count)
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):

            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
#             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            # Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    # Try flipping the SS nt
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue
                else:
                    # Opposite strand nucleotides
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)
#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)

        # print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts
        print '%d SNPs were retained on chromosome %d.' % (len(ok_indices['g']), chrom)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
#         order = sp.argsort(positions)
#         sorted_positions = positions[order]
#         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
#         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
#         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']]  # Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[ok_indices['rg']]  # Pinpoint where the SNPs are in the file.
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  # [order]
        sids = ssg['sids'][...][ok_indices['ss']]

        # For debugging...
#         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
#         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
#         ss_sids = ssg['sids'][...][ok_indices['ss']]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(freq_discrepancy_snp)
#                 print freqs[freq_discrepancy_snp]
#                 print ss_freqs[freq_discrepancy_snp]

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

                # For debugging...
#                 if sp.any(freq_discrepancy_snp):
#                     g_sids = g_sids[ok_freq_snps]
#                     rg_sids = rg_sids[ok_freq_snps]
#                     ss_sids = ss_sids[ok_freq_snps]
#                     assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

#             if sp.sum(maf_filter)<n_snps:
#                 g_sids = g_sids[maf_filter]
#                 rg_sids = rg_sids[maf_filter]
#                 ss_sids = ss_sids[maf_filter]
#                 assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])

        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
#         print 'Sum betas', sp.sum(betas ** 2)
        # ofg.create_dataset('prs', data=prs)
        # risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
    strains_tcga_short = strains_tcga_short[lkidx]
    ctypes = ctypes[lkidx]
    tcga_is_tumor = tcga_is_tumor[lkidx]
else:
    lkidx = sp.arange(strains_tcga.shape[0])

print 'loading data for GTEx'
IN_GT = h5py.File(os.path.join(paths.basedir_as_gtex, 'spladder', 'genes_graph_conf%i.merge_graphs.validated.count.hdf5' % CONF), 'r')
gids_gtex = IN_GT['gene_ids_edges'][:, 0]
gnames_gtex = IN_GT['gene_names'][:]
strains_gtex = IN_GT['strains'][:]
gtypes = sp.array([gt_dict[x.split('.')[0]] if x.split('.')[0] in gt_dict else 'NA' for x in strains_gtex], dtype='str')

gid_names_tcga = sp.array([gnames_tcga[i] for i in gids_tcga], dtype='str')
gid_names_gtex = sp.array([gnames_gtex[i] for i in gids_gtex], dtype='str')
gid_names_common = sp.intersect1d(gid_names_tcga, gid_names_gtex)
kidx_tcga = sp.where(sp.in1d(gid_names_tcga, gid_names_common))[0]
kidx_gtex = sp.where(sp.in1d(gid_names_gtex, gid_names_common))[0]
gids_tcga = gids_tcga[kidx_tcga]
gids_gtex = gids_gtex[kidx_gtex]

if not os.path.exists(os.path.join(paths.basedir_tss, 'tss_size_factors%s%s.cpickle' % (wl_tag, fl_tag))):
    ### compute total edge count for GTEx samples
    print 'Computing total edge count for GTEx samples'
    ### get gene intervals
    s_idx = sp.argsort(gids_gtex, kind='mergesort')
    _, f_idx = sp.unique(gids_gtex[s_idx], return_index=True)
    l_idx = sp.r_[f_idx[1:], gids_gtex.shape[0]]
    ### get counts
    genecounts_gtex = sp.zeros((f_idx.shape[0], IN_GT['edges'].shape[1]), dtype='int')
from barplot import *
from default import *
from IPython.display import Latex
import h5py
import scipy as SP

data = '/home/zhen/box/Manuscript/Tarik/Tarik2.h5f'
f = h5py.File(data, 'r')
Y = f['Y'][:]                                  # gene expression matrix
tech_noise = f['tech_noise'][:]                # technical noise
genes_het_bool = f['genes_het_bool'][:]        # index of heterogeneous genes
geneID = f['gene_names'][:]                    # gene names
idx_cell_cycle = f['idx_cellcyclegenes'][:]

# determine non-zero counts
idx_nonzero = SP.nonzero((Y.mean(0) ** 2) > 0)[0]
idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle, idx_nonzero)
# subset gene expression matrix
Ycc = Y[:, idx_cell_cycle_noise_filtered]

k = 20                  # number of latent factors
out_dir = '/home/zhen/box/Manuscript/Tarik/cache'   # folder where results are cached
file_name = 'Kcc.hdf5'  # name of the cache file
recalc = True           # recalculate X and Kconf
use_ard = True          # use automatic relevance detection
sclvm = scLVM(Y)

# Fit model with k latent factors
X_ARD, Kcc_ARD, varGPLVM_ARD = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered, k=k,
                                              out_dir=out_dir, file_name=file_name,
                                              recalc=recalc, use_ard=use_ard)
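
# Minimal sketch of the gene-filtering step above: keep only annotated
# (e.g. cell-cycle) genes that are also expressed, i.e. have a non-zero
# mean across cells. Toy data; 'Y' and 'idx_anno' are stand-ins.
import numpy as np

Y = np.array([[0., 1., 0., 3.],
              [0., 2., 0., 1.]])          # cells x genes
idx_anno = np.array([0, 1, 3])            # annotated gene indices

idx_nonzero = np.nonzero(Y.mean(0) ** 2 > 0)[0]          # expressed genes
idx_filtered = np.intersect1d(idx_anno, idx_nonzero)     # both criteria
print(idx_filtered)                                      # -> [1 3]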
sd_segs = scipy.std(segs, 0)
for i in xrange(lsegs):
    R = scipy.argmax(segs[i])
    if R != 200:
        outliers.append(i)
    elif max(segs[i]) > M:
        outliers.append(i)
    elif min(segs[i]) < m:
        outliers.append(i)
    else:
        up = pylab.find(segs[i] > mean_curve - sd_segs)
        down = pylab.find(segs[i] < mean_curve + sd_segs)
        ins = len(scipy.intersect1d(up, down))
        if ins < 540:
            outliers.append(i)

goods = list(alls - set(outliers))
if goods != []:
    ax1.plot(segs[goods].T, 'k')
    ax1.axis('tight')
    ax1.grid()
if outliers != []:
    ax2.plot(segs[outliers].T, 'r')
    ax2.axis('tight')
    ax2.grid()
fig.savefig('falc_temp/temp/output-%d.png' % rid)
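
# Sketch of the band test above: a segment counts as an inlier when enough of
# its samples lie inside mean_curve +/- sd. intersect1d over the two index
# sets gives the samples satisfying both bounds. Toy data, one segment only.
import numpy as np

rng = np.random.default_rng(0)
mean_curve = np.zeros(100)
sd = np.ones(100)
seg = rng.normal(0.0, 0.5, 100)           # one candidate segment

up = np.where(seg > mean_curve - sd)[0]   # above the lower bound
down = np.where(seg < mean_curve + sd)[0] # below the upper bound
ins = len(np.intersect1d(up, down))       # samples inside the band
print(ins)  # segments with too few in-band samples would be flagged as outliers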
### annotated
counts = []
curr_xloc = []
for i, t in enumerate(dsets_plot):
    if p[0] == 'ALL':
        t_idx = sp.arange(tids[t].shape[0])
    else:
        if t == 'gt':
            t_idx = sp.where(tids[t] == p[1])[0]
        else:
            t_idx = sp.where(tids[t] == p[0])[0]
    if count[(t, p)] == 0:
        counts.append(0)
    else:
        counts.append(sp.intersect1d(anno_idx, count[(t, p)][thresh]).shape[0])
    curr_xloc.append(j * len(dsets_plot) + i + buff)
    if t == 'tn':
        labels.append(label_dict[t][p[0]] + '\nN=%i' % (sp.sum(~is_tumor[t_idx])))
    elif t == 'tc':
        labels.append(label_dict[t][p[0]] + '\nN=%i' % (sp.sum(is_tumor[t_idx])))
    else:
        labels.append(label_dict[t][p[0]] + '\nN=%i' % (t_idx.shape[0]))
ax.bar(curr_xloc, counts, 0.5, color=colors[p[0]])

### not annotated
counts2 = []
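
# Toy version of the counting above: the number of annotated events that also
# pass a confidence/threshold filter is simply the size of the intersection
# of the two index sets. Values are illustrative.
import numpy as np

anno_idx = np.array([0, 2, 5, 7, 9])     # annotated event indices
passing = np.array([2, 3, 7, 8])         # events passing the threshold
n_anno_passing = np.intersect1d(anno_idx, passing).shape[0]
print(n_anno_passing)                    # -> 2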
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict, validation_genotype_file=None, genetic_map_dir=None, min_maf=0.01, skip_coordination=False, max_freq_discrep = 0.15, debug=False): summary_dict[3.9]={'name':'dash', 'value':'Coordination'} t0 = time.time() if validation_genotype_file is not None: print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).') else: print('Coordinating datasets (Summary statistics and LD reference genotypes).') plinkf = plinkfile.PlinkFile(reference_genotype_file) # Figure out chromosomes and positions. if debug: print('Parsing plinkf_dict_val reference genotypes') loci = plinkf.get_loci() plinkf.close() summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)} summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)} gf_chromosomes = [l.chromosome for l in loci] chromosomes = sp.unique(gf_chromosomes) chromosomes.sort() chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes) if validation_genotype_file is not None: if debug: print('Parsing LD validation bim file') plinkf_val = plinkfile.PlinkFile(validation_genotype_file) # Loads only the individuals... plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val) loci_val = plinkf_val.get_loci() plinkf_val.close() summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)} chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes) # Open HDF5 file and prepare out data assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.' if plinkf_dict_val['has_phenotype']: hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes']) summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']} hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype)) hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype)) maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs']) # Now summary statistics ssf = hdf5_file['sum_stats'] cord_data_g = hdf5_file.create_group('cord_data') num_common_snps = 0 # corr_list = [] chromosomes_found = set() num_snps_common_before_filtering =0 num_snps_common_after_filtering =0 tot_num_non_matching_nts = 0 tot_num_non_supported_nts = 0 tot_num_ambig_nts = 0 tot_num_freq_discrep_filtered_snps = 0 tot_num_maf_filtered_snps = 0 tot_g_ss_nt_concord_count = 0 if validation_genotype_file is not None: tot_g_vg_nt_concord_count = 0 tot_vg_ss_nt_concord_count = 0 # Now iterate over chromosomes chrom_i = 0 for chrom in chromosomes: chrom_i +=1 if not debug: sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1)))) sys.stdout.flush() try: chr_str = 'chrom_%d' % chrom ssg = ssf[chr_str] except Exception as err_str: print(err_str) print('Did not find chromosome %d in SS dataset.'%chrom) print('Continuing.') continue if debug: print('Coordinating data for chromosome %s' % chr_str) chromosomes_found.add(chrom) #Get summary statistics chromosome group ssg = ssf['chrom_%d' % chrom] ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype) if validation_genotype_file is not None: chrom_d_val = chr_dict_val[chr_str] vg_sids = chrom_d_val['sids'] common_sids = sp.intersect1d(ss_sids, vg_sids) # A map from sid to index for validation data vg_sid_dict = {} for i, sid in enumerate(vg_sids): vg_sid_dict[sid] = i else: common_sids = ss_sids # A map from sid to index for 
summary stats ss_sid_dict = {} for i, sid in enumerate(ss_sids): ss_sid_dict[sid] = i #The indices to retain for the LD reference genotypes chrom_d = chr_dict[chr_str] g_sids = chrom_d['sids'] common_sids = sp.intersect1d(common_sids, g_sids) # A map from sid to index for LD reference data g_sid_dict = {} for i, sid in enumerate(g_sids): g_sid_dict[sid] = i if debug: print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom)) print('Ordering SNPs by genomic positions (based on LD reference genotypes).') g_snp_map = [] for sid in common_sids: g_snp_map.append(g_sid_dict[sid]) # order by positions (based on LD reference file) g_positions = sp.array(chrom_d['positions'])[g_snp_map] order = sp.argsort(g_positions) g_snp_map = sp.array(g_snp_map)[order] g_snp_map = g_snp_map.tolist() common_sids = sp.array(common_sids)[order] # Get the ordered sum stats SNPs indices. ss_snp_map = [] for sid in common_sids: ss_snp_map.append(ss_sid_dict[sid]) # Get the ordered validation SNPs indices if validation_genotype_file is not None: vg_snp_map = [] for sid in common_sids: vg_snp_map.append(vg_sid_dict[sid]) vg_nts = sp.array(chrom_d_val['nts']) vg_nts_ok = sp.array(vg_nts)[vg_snp_map] g_nts = sp.array(chrom_d['nts']) ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype) betas = ssg['betas'][...] log_odds = ssg['log_odds'][...] if 'freqs' in ssg: ss_freqs = ssg['freqs'][...] g_ss_nt_concord_count = sp.sum( g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0 if validation_genotype_file is not None: vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0 g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0 if debug: print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count)) tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count tot_g_vg_nt_concord_count += g_vg_nt_concord_count tot_g_ss_nt_concord_count += g_ss_nt_concord_count if debug: print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count)) num_freq_discrep_filtered_snps = 0 num_non_matching_nts = 0 num_non_supported_nts = 0 num_ambig_nts = 0 # Identifying which SNPs have nucleotides that are ok.. ok_nts = [] ok_indices = {'g': [], 'ss': []} if validation_genotype_file is not None: ok_indices['vg']=[] #Now loop over SNPs to coordinate nucleotides. if validation_genotype_file is not None: for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: vg_nt = vg_nts[vg_i] ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_supported_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. 
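
# Sketch of the SNP-ordering step above: build sid->index maps, intersect the
# id sets, then reorder everything by genomic position in the LD reference so
# all datasets walk the genome in the same order. Toy arrays only.
import numpy as np

g_sids = np.array(['rs3', 'rs1', 'rs2'])
g_positions = np.array([300, 100, 200])
ss_sids = np.array(['rs2', 'rs3', 'rs9'])

common = np.intersect1d(g_sids, ss_sids)
g_map = {s: i for i, s in enumerate(g_sids)}
ss_map = {s: i for i, s in enumerate(ss_sids)}

g_idx = np.array([g_map[s] for s in common])
order = np.argsort(g_positions[g_idx])       # genomic order in the reference
common = common[order]
g_idx = g_idx[order]
ss_idx = np.array([ss_map[s] for s in common])
print(common, g_idx, ss_idx)                 # aligned index maps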
if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))): if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg: ss_freqs[ss_i] = 1 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue else: num_non_matching_nts += 1 continue # Opposite strand nucleotides # everything seems ok. ok_indices['g'].append(g_i) ok_indices['vg'].append(vg_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) else: for g_i, ss_i in zip(g_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_matching_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. if not sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg and ss_freqs[ss_i]>0: ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue # everything seems ok. ok_indices['g'].append(g_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) if debug: print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts) print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts) # Resorting by position positions = sp.array(chrom_d['positions'])[ok_indices['g']] # Now parse SNPs .. snp_indices = sp.array(chrom_d['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices = snp_indices[ok_indices['g']] raw_snps, freqs = plinkfiles.parse_plink_snps( reference_genotype_file, snp_indices) snp_stds = sp.sqrt(2 * freqs * (1 - freqs)) snp_means = freqs * 2 betas = betas[ok_indices['ss']] log_odds = log_odds[ok_indices['ss']] ps = ssg['ps'][...][ok_indices['ss']] nts = sp.array(ok_nts) sids = (ssg['sids'][...]).astype(util.sids_u_dtype) sids = sids[ok_indices['ss']] #Parse validation genotypes, if available if validation_genotype_file is not None: snp_indices_val = sp.array(chrom_d_val['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices_val = snp_indices_val[ok_indices['vg']] raw_snps_val, freqs_val = plinkfiles.parse_plink_snps( validation_genotype_file, snp_indices_val) snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val)) snp_means_val = freqs_val * 2 # Check SNP frequencies, screen for possible problems.. 
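
# Minimal sketch of the allele-coordination test above, with the condition
# fully parenthesized so that 'not' covers both alternatives (in Python,
# 'not a or b' parses as '(not a) or b', which is how the single-dataset
# branch above reads). Toy alleles; the ambiguity and validity checks of the
# full loop are omitted here.
import numpy as np

opp_strand = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
g_nt = np.array(['A', 'C'])
ss_nt = np.array(['C', 'A'])                       # effect reported on swapped alleles

os_g_nt = np.array([opp_strand[g_nt[0]], opp_strand[g_nt[1]]])
if not (np.all(g_nt == ss_nt) or np.all(os_g_nt == ss_nt)):
    # alleles agree only after swapping: flip the effect sign
    flip = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or \
           (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
    print('flip effect sign:', flip)               # -> True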
if max_freq_discrep<1 and 'freqs' in ssg: ss_freqs = ss_freqs[ok_indices['ss']] ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,sp.absolute(ss_freqs + freqs-1) < max_freq_discrep) #Array of np.bool values ok_freq_snps = sp.logical_or(ok_freq_snps,ss_freqs<=0) #Only consider SNPs that actually have frequencies num_freq_discrep_filtered_snps = len(ok_freq_snps)- sp.sum(ok_freq_snps) assert num_freq_discrep_filtered_snps>=0, "Problems when filtering SNPs with frequency discrepencies" if num_freq_discrep_filtered_snps>0: # Filter freq_discrepancy_snps raw_snps = raw_snps[ok_freq_snps] snp_stds = snp_stds[ok_freq_snps] snp_means = snp_means[ok_freq_snps] freqs = freqs[ok_freq_snps] ps = ps[ok_freq_snps] positions = positions[ok_freq_snps] nts = nts[ok_freq_snps] sids = sids[ok_freq_snps] betas = betas[ok_freq_snps] log_odds = log_odds[ok_freq_snps] if validation_genotype_file is not None: raw_snps_val = raw_snps_val[ok_freq_snps] snp_stds_val = snp_stds_val[ok_freq_snps] snp_means_val = snp_means_val[ok_freq_snps] freqs_val = freqs_val[ok_freq_snps] if debug: print('Filtered %d SNPs due to frequency discrepancies'%num_freq_discrep_filtered_snps) # Filter minor allele frequency SNPs. maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf)) num_maf_filtered_snps = len(maf_filter)-sp.sum(maf_filter) assert num_maf_filtered_snps>=0, "Problems when filtering SNPs with low minor allele frequencies" if num_maf_filtered_snps>0: raw_snps = raw_snps[maf_filter] snp_stds = snp_stds[maf_filter] snp_means = snp_means[maf_filter] freqs = freqs[maf_filter] ps = ps[maf_filter] positions = positions[maf_filter] nts = nts[maf_filter] sids = sids[maf_filter] betas = betas[maf_filter] log_odds = log_odds[maf_filter] if validation_genotype_file is not None: raw_snps_val = raw_snps_val[maf_filter] snp_stds_val = snp_stds_val[maf_filter] snp_means_val = snp_means_val[maf_filter] freqs_val = freqs_val[maf_filter] if debug: print('Filtered %d SNPs due to low MAF'%num_maf_filtered_snps) genetic_map = [] if genetic_map_dir is not None: with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f: for line in f: l = line.split() # if l[0] in sid_set: # genetic_map.append(l[0]) else: genetic_map = None coord_data_dict = {'chrom': 'chrom_%d' % chrom, 'raw_snps_ref': raw_snps, 'snp_stds_ref': snp_stds, 'snp_means_ref': snp_means, 'freqs_ref': freqs, 'ps': ps, 'positions': positions, 'nts': nts, 'sids': sids, 'genetic_map': genetic_map, 'betas': betas, 'log_odds': log_odds} if validation_genotype_file is not None: maf_adj_prs = sp.dot(log_odds, raw_snps_val) if debug and plinkf_dict_val['has_phenotype']: maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1] print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)) coord_data_dict['raw_snps_val']=raw_snps_val coord_data_dict['snp_stds_val']=snp_stds_val coord_data_dict['snp_means_val']=snp_means_val coord_data_dict['freqs_val']=freqs_val coord_data_dict['log_odds_prs']=maf_adj_prs maf_adj_risk_scores += maf_adj_prs write_coord_data(cord_data_g, coord_data_dict, debug=debug) if debug: print('%d SNPs were retained on chromosome %d.' 
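
# Toy version of the two filters above: a SNP survives the frequency check if
# its summary-statistic frequency matches the genotype frequency on either
# strand orientation, and survives the MAF check if its minor allele
# frequency exceeds min_maf. Values are illustrative.
import numpy as np

freqs = np.array([0.30, 0.05, 0.45, 0.002])
ss_freqs = np.array([0.31, 0.60, 0.44, 0.50])
max_freq_discrep, min_maf = 0.15, 0.01

ok_freq = np.logical_or(np.abs(ss_freqs - freqs) < max_freq_discrep,
                        np.abs(ss_freqs + freqs - 1) < max_freq_discrep)
ok_freq = np.logical_or(ok_freq, ss_freqs <= 0)   # keep SNPs lacking frequencies
maf_ok = (freqs > min_maf) & (freqs < 1 - min_maf)
keep = ok_freq & maf_ok
print(keep)   # -> [ True False  True False]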
% (len(sids), chrom)) num_snps_common_before_filtering += len(common_sids) num_snps_common_after_filtering += len(sids) tot_num_ambig_nts += num_ambig_nts tot_num_non_supported_nts += num_non_supported_nts tot_num_non_matching_nts += num_non_matching_nts tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps tot_num_maf_filtered_snps += num_maf_filtered_snps if not debug: sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0)) sys.stdout.flush() # Now calculate the prediction r^2 if validation_genotype_file: if debug and plinkf_dict_val['has_phenotype']: maf_adj_corr = sp.corrcoef( plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1] print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2)) print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count)) else: if debug: print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count)) summary_dict[7]={'name':'Num chromosomes used:','value':len(chromosomes_found)} summary_dict[8]={'name':'SNPs common across datasets:','value':num_snps_common_before_filtering} summary_dict[9]={'name':'SNPs retained after filtering:','value':num_snps_common_after_filtering} if tot_num_ambig_nts>0: summary_dict[10]={'name':'SNPs w ambiguous nucleotides filtered:','value':tot_num_ambig_nts} if tot_num_non_supported_nts>0: summary_dict[10.1]={'name':'SNPs w unknown/unsupported nucleotides filtered:','value':tot_num_non_supported_nts} if tot_num_non_matching_nts>0: summary_dict[11]={'name':'SNPs w other nucleotide discrepancies filtered:','value':tot_num_non_matching_nts} if min_maf>0: summary_dict[12]={'name':'SNPs w MAF<%0.3f filtered:'%min_maf,'value':tot_num_maf_filtered_snps} if max_freq_discrep<0.5: summary_dict[13]={'name':'SNPs w allele freq discrepancy > %0.3f filtered:'%max_freq_discrep,'value':tot_num_freq_discrep_filtered_snps} t1 = time.time() t = (t1 - t0) summary_dict[13.9]={'name':'dash', 'value':'Running times'} summary_dict[15]={'name':'Run time for coordinating datasets:','value': '%d min and %0.2f sec'%(t / 60, t % 60)}
for iph in range(Y.shape[0]):
    for jph in range(Y.shape[0]):
        if SP.bitwise_and(phase_vec[iph] == phase_vec[jph], phase_vec[iph] == 3):
            KG2M[iph, jph] = 1

# intra-phase variations in cell size
sfCellSize = SP.log10(f['ratioEndo'][:])
sfCellSize -= sfCellSize.mean()
sfCellSize = sfCellSize.reshape(1, sfCellSize.shape[0])
Ksize = SP.dot(sfCellSize.transpose(), sfCellSize)
Ksize /= Ksize.diagonal().mean()

# filter cell cycle genes
idx_cell_cycle = SP.union1d(cellcyclegenes_filter, cellcyclegenes_filterCB600)
Ymean2 = Y.mean(0) ** 2 > 0
idx_cell_cycle_noise_filtered = SP.intersect1d(idx_cell_cycle, SP.array(SP.where(Ymean2.ravel() > 0)))
Ycc = Y[:, idx_cell_cycle_noise_filtered]

# Fit GPLVM to data
k = 1                              # number of latent factors
file_name = CFG['panama_file']     # name of the cache file
recalc = True                      # recalculate X and Kconf
sclvm = scLVM(Y)
pdb.set_trace()
X, Kcc, varGPLVM = sclvm.fitGPLVM(idx=idx_cell_cycle_noise_filtered, k=1, out_dir='./cache', file_name=file_name, recalc=recalc)

# 3. load relevant dataset for analysis
genes_het = SP.array(SP.where(f['genes_heterogen'][:].ravel() == 1))
tech_noise = f['LogVar_techMmus'][:]   # considers only heterogeneous genes
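
# Sketch of the cell-size covariance built above: the log size factors are
# mean-centred and their outer product gives a rank-one cell-by-cell
# similarity matrix, normalised so its diagonal averages to one. Toy size
# factors stand in for the endogenous-read ratios.
import numpy as np

sf = np.log10(np.array([0.8, 1.0, 1.3, 0.9]))   # per-cell size factors
sf -= sf.mean()
sf = sf.reshape(1, -1)
Ksize = np.dot(sf.T, sf)                        # cells x cells, rank one
Ksize /= Ksize.diagonal().mean()
print(Ksize.shape, np.isclose(Ksize.diagonal().mean(), 1.0))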
def main(): figs = dict() figs['stats'] = plt.figure(figsize=(12, 8)) figs['stats_log'] = plt.figure(figsize=(12, 8)) figs['stats_full'] = plt.figure(figsize=(12, 8)) figs['stats_full_log'] = plt.figure(figsize=(12, 8)) gss = dict() gss['stats'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0) gss['stats_log'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0) gss['stats_full'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0) gss['stats_full_log'] = gridspec.GridSpec(2, 3) #, wspace=0.0, hspace=0.0) for e, event_type in enumerate(event_types): print('Handling %s' % event_type, file=sys.stderr) ### load events detected in annotation only anno = pickle.load(open(os.path.join(BASEDIR_ANNO, 'merge_graphs_%s_C%i.pickle' % (event_type, CONF)), 'r')) if isinstance(anno, tuple): anno = anno[0] ### load annotation index is_anno_gtex = pickle.load(open(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.anno_only.pickle' % (event_type, CONF)), 'r')) is_anno_icgc_t = pickle.load(open(os.path.join(BASEDIR_ICGC_T, 'merge_graphs_%s_C%i.anno_only.pickle' % (event_type, CONF)), 'r')) is_anno_icgc_n = pickle.load(open(os.path.join(BASEDIR_ICGC_N, 'merge_graphs_%s_C%i.anno_only.pickle' % (event_type, CONF)), 'r')) ### load confident events IN = h5py.File(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r') idx_conf_gtex = IN['conf_idx'][:] IN.close() IN = h5py.File(os.path.join(BASEDIR_ICGC_T, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r') idx_conf_icgc_t = IN['conf_idx'][:] IN.close() IN = h5py.File(os.path.join(BASEDIR_ICGC_N, 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CONF)), 'r') idx_conf_icgc_n = IN['conf_idx'][:] IN.close() ### load filtered events #IN = h5py.File(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.counts.r10_s50_V10.hdf5' % (event_type, CONF)), 'r') #idx_filt_gtex = IN['filter_idx'][:] #IN.close() #IN = h5py.File(os.path.join(BASEDIR_ICGC, 'merge_graphs_%s_C%i.counts.r10_s50_V10.hdf5' % (event_type, CONF)), 'r') #idx_filt_icgc = IN['filter_idx'][:] #IN.close() ### load psi filtered events idx_psi_gtex = pickle.load(open(os.path.join(BASEDIR_GTEX, 'merge_graphs_%s_C%i.counts.hdf5.psi_filt.pickle' % (event_type, CONF)), 'r'))[1] idx_psi_icgc_t = pickle.load(open(os.path.join(BASEDIR_ICGC_T, 'merge_graphs_%s_C%i.counts.hdf5.psi_filt.pickle' % (event_type, CONF)), 'r'))[1] idx_psi_icgc_n = pickle.load(open(os.path.join(BASEDIR_ICGC_N, 'merge_graphs_%s_C%i.counts.hdf5.psi_filt.pickle' % (event_type, CONF)), 'r'))[1] ### plot stats for normal counts (FULL) ax = figs['stats_full'].add_subplot(gss['stats_full'][e / 3, e % 3]) xlabels_full = ['detected', 'confident'] xlabels_part = ['confident'] xlabels_full.extend(['dpsi > %.1f' % _ for _ in sorted(idx_psi_gtex.keys())]) xlabels_part.extend(['dpsi > %.1f' % _ for _ in sorted(idx_psi_gtex.keys())]) # all confirmed events, further filtered by PSI - GTEX data1_gtex = [is_anno_gtex.shape[0], idx_conf_gtex.shape[0]] data1_gtex.extend([sp.intersect1d(idx_conf_gtex, idx_psi_gtex[_]).shape[0] for _ in sorted(idx_psi_gtex.keys())]) data1_gtex = sp.array(data1_gtex) lg, = ax.plot(sp.arange(data1_gtex.shape[0]), data1_gtex, '-b', label='GTEx') # all annotated confirmed events, further filtered by PSI - GTEX data2_gtex = [sp.sum(is_anno_gtex), sp.sum(is_anno_gtex[idx_conf_gtex])] data2_gtex.extend([sp.sum(is_anno_gtex[sp.intersect1d(idx_conf_gtex, idx_psi_gtex[_])]) for _ in sorted(idx_psi_gtex.keys())]) data2_gtex = sp.array(data2_gtex) lga, = ax.plot(sp.arange(data2_gtex.shape[0]), data2_gtex, 
'--b', label='GTEx (anno)') # all confirmed events, further filtered by PSI - ICGC_T data1_icgc_t = [is_anno_icgc_t.shape[0], idx_conf_icgc_t.shape[0]] data1_icgc_t.extend([sp.intersect1d(idx_conf_icgc_t, idx_psi_icgc_t[_]).shape[0] for _ in sorted(idx_psi_icgc_t.keys())]) data1_icgc_t = sp.array(data1_icgc_t) lit, = ax.plot(sp.arange(data1_icgc_t.shape[0]), data1_icgc_t, '-r', label='ICGC Tumor') # all annotated confirmed events, further filtered by PSI - ICGC data2_icgc_t = [sp.sum(is_anno_icgc_t), sp.sum(is_anno_icgc_t[idx_conf_icgc_t])] data2_icgc_t.extend([sp.sum(is_anno_icgc_t[sp.intersect1d(idx_conf_icgc_t, idx_psi_icgc_t[_])]) for _ in sorted(idx_psi_icgc_t.keys())]) data2_icgc_t = sp.array(data2_icgc_t) lita, = ax.plot(sp.arange(data2_icgc_t.shape[0]), data2_icgc_t, '--r', label='ICGC Tumor (anno)') # all confirmed events, further filtered by PSI - ICGC_T data1_icgc_n = [is_anno_icgc_n.shape[0], idx_conf_icgc_n.shape[0]] data1_icgc_n.extend([sp.intersect1d(idx_conf_icgc_n, idx_psi_icgc_n[_]).shape[0] for _ in sorted(idx_psi_icgc_n.keys())]) data1_icgc_n = sp.array(data1_icgc_n) lin, = ax.plot(sp.arange(data1_icgc_n.shape[0]), data1_icgc_n, '-g', label='ICGC Normal') # all annotated confirmed events, further filtered by PSI - ICGC data2_icgc_n = [sp.sum(is_anno_icgc_n), sp.sum(is_anno_icgc_n[idx_conf_icgc_n])] data2_icgc_n.extend([sp.sum(is_anno_icgc_n[sp.intersect1d(idx_conf_icgc_n, idx_psi_icgc_n[_])]) for _ in sorted(idx_psi_icgc_n.keys())]) data2_icgc_n = sp.array(data2_icgc_n) lina, = ax.plot(sp.arange(data2_icgc_n.shape[0]), data2_icgc_n, '--g', label='ICGC Normal (anno)') axs.set_ticks_outer(ax) axs.clean_axis(ax) if e == len(event_types) - 1: ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='upper right', frameon=False, fontsize=10) ax.set_xticks(list(range(len(xlabels_full)))) if e < len(event_types) - 3: ax.set_xticklabels([]) else: ax.set_xticklabels(xlabels_full, rotation=90, fontsize=10) ax.set_title(event_dict[event_type]) ax.xaxis.grid(True) ### plots stats for log10 counts (FULL) ax = figs['stats_full_log'].add_subplot(gss['stats_full_log'][e / 3, e % 3]) lg, = ax.plot(sp.arange(data1_gtex.shape[0]), sp.log10(data1_gtex + 1), '-b', label='GTEx') lga, = ax.plot(sp.arange(data2_gtex.shape[0]), sp.log10(data2_gtex + 1), '--b', label='GTEx (anno)') lit, = ax.plot(sp.arange(data1_icgc_t.shape[0]), sp.log10(data1_icgc_t + 1), '-r', label='ICGC Tumor') lita, = ax.plot(sp.arange(data2_icgc_t.shape[0]), sp.log10(data2_icgc_t + 1), '--r', label='ICGC Tumor (anno)') lin, = ax.plot(sp.arange(data1_icgc_n.shape[0]), sp.log10(data1_icgc_n + 1), '-g', label='ICGC Normal') lina, = ax.plot(sp.arange(data2_icgc_n.shape[0]), sp.log10(data2_icgc_n + 1), '--g', label='ICGC Normal (anno)') axs.set_ticks_outer(ax) axs.clean_axis(ax) if e == len(event_types) - 1: ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='lower left', frameon=False, fontsize=10) ax.set_xticks(list(range(len(xlabels_full)))) if e < len(event_types) - 3: ax.set_xticklabels([]) else: ax.set_xticklabels(xlabels_full, rotation=90, fontsize=10) ax.set_title(event_dict[event_type]) ax.xaxis.grid(True) ### plot stats for normal counts (only conf) ax = figs['stats'].add_subplot(gss['stats'][e / 3, e % 3]) lg, = ax.plot(sp.arange(data1_gtex.shape[0] - 1), data1_gtex[1:], '-b', label='GTEx') lga, = ax.plot(sp.arange(data2_gtex.shape[0] - 1), data2_gtex[1:], '--b', label='GTEx (anno)') lit, = ax.plot(sp.arange(data1_icgc_t.shape[0] - 1), data1_icgc_t[1:], '-r', label='ICGC Tumor') lita, = 
ax.plot(sp.arange(data2_icgc_t.shape[0] - 1), data2_icgc_t[1:], '--r', label='ICGC Tumor (anno)') lin, = ax.plot(sp.arange(data1_icgc_n.shape[0] - 1), data1_icgc_n[1:], '-g', label='ICGC Normal') lina, = ax.plot(sp.arange(data2_icgc_n.shape[0] - 1), data2_icgc_n[1:], '--g', label='ICGC Normal (anno)') axs.set_ticks_outer(ax) axs.clean_axis(ax) if e == len(event_types) - 1: ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='upper right', frameon=False, fontsize=10) ax.set_xticks(list(range(len(xlabels_part)))) if e < len(event_types) - 3: ax.set_xticklabels([]) else: ax.set_xticklabels(xlabels_part, rotation=90, fontsize=10) ax.set_title(event_dict[event_type]) ax.xaxis.grid(True) ### plots stats for log10 counts (only cony) ax = figs['stats_log'].add_subplot(gss['stats_log'][e / 3, e % 3]) lg, = ax.plot(sp.arange(data1_gtex.shape[0] - 1), sp.log10(data1_gtex[1:] + 1), '-b', label='GTEx') lga, = ax.plot(sp.arange(data2_gtex.shape[0] - 1), sp.log10(data2_gtex[1:] + 1), '--b', label='GTEx (anno)') lit, = ax.plot(sp.arange(data1_icgc_t.shape[0] - 1), sp.log10(data1_icgc_t[1:] + 1), '-r', label='ICGC Tumor') lita, = ax.plot(sp.arange(data2_icgc_t.shape[0] - 1), sp.log10(data2_icgc_t[1:] + 1), '--r', label='ICGC Tumor (anno)') lin, = ax.plot(sp.arange(data1_icgc_n.shape[0] - 1), sp.log10(data1_icgc_n[1:] + 1), '-g', label='ICGC Normal') lina, = ax.plot(sp.arange(data2_icgc_n.shape[0] - 1), sp.log10(data2_icgc_n[1:] + 1), '--g', label='ICGC Normal (anno)') axs.set_ticks_outer(ax) axs.clean_axis(ax) if e == len(event_types) - 1: ax.legend(handles=[lit, lita, lin, lina, lg, lga], loc='lower left', frameon=False, fontsize=10) ax.set_xticks(list(range(len(xlabels_part)))) if e < len(event_types) - 3: ax.set_xticklabels([]) else: ax.set_xticklabels(xlabels_part, rotation=90, fontsize=10) ax.set_title(event_dict[event_type]) ax.xaxis.grid(True) for p in figs: figs[p].tight_layout() figs[p].savefig(os.path.join(PLOTDIR, 'event_overview_cumm_Liver_C%i_%s.pdf' % (CONF, p)), format='pdf', bbox_inches='tight') figs[p].savefig(os.path.join(PLOTDIR, 'event_overview_cumm_Liver_C%i_%s.png' % (CONF, p)), format='png', bbox_inches='tight') plt.close(figs[p])
directory = "." #Directory of the simulation data #list_fields=['Ex','Ey','Ez','Bz_m','Rho_electron1','Rho_proton','Jx','Jy','Jz'] #List of the fields you want to extract (Ei,Bi,Ji_sn,Rho,rho_sn, where n is the number of the species and i the direction (x,y,z)) list_fields=['Ey','Ex','Rho_electron'] first_cycle = 0 last_cycle = 11000 cycle_step = 5000 # step between two displayed cycles (minimum is given by outputcyle of the inputfile) plot_on_axis = 0 # Also plot 1D graph of the quantities on axis if ==1. suffix = "" #Suffix to be added in produced files name. #################################################################################### filename = directory + "/Fields_folded.h5" # Proc file name h5file = tables.openFile(filename, mode = "r", title = "Fields_file") existing_files = scipy.arange(0,last_cycle,cycle_step) asked_files = scipy.arange(first_cycle, last_cycle) list_files=scipy.intersect1d(existing_files,asked_files) nb_files=len(list_files) Filebyproc=nb_files/p #Minimum Nomber of files you have to read reste=nb_files-Filebyproc*p if (rank<reste): Filerange=scipy.arange(rank*(Filebyproc+1),(rank+1)*(Filebyproc+1)) else: Filerange=scipy.arange((reste*(Filebyproc+1)+(rank-reste)*Filebyproc),(reste*(Filebyproc+1)+(rank-reste+1)*Filebyproc)) #print nb_files, 'filerange =', Filerange, rank list_cycle = list(list_files[Filerange]) print "cycle", list_cycle print h5file.root._f_listNodes if (rank==0): print h5file.root._f_getChild(repr(list_cycle[0]).rjust(10,"0"))._f_listNodes
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None, reference_genotype_file=None, hdf5_file=None, genetic_map_dir=None, check_mafs=False, min_maf=0.01): # recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding.. print 'Coordinating things w genotype file: %s \nref. genot. file: %s' % ( genotype_file, reference_genotype_file) plinkf = plinkfile.PlinkFile(genotype_file) #Loads only the individuals... (I think?) samples = plinkf.get_samples() num_individs = len(samples) Y = [s.phenotype for s in samples] fids = [s.fid for s in samples] iids = [s.iid for s in samples] unique_phens = sp.unique(Y) if len(unique_phens) == 1: print 'Unable to find phenotype values.' has_phenotype = False elif len(unique_phens) == 2: cc_bins = sp.bincount(Y) assert len(cc_bins) == 2, 'Problems with loading phenotype' print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]) has_phenotype = True else: print 'Found quantitative phenotype values' has_phenotype = True #Figure out chromosomes and positions. print 'Parsing validation genotype bim file' loci = plinkf.get_loci() plinkf.close() gf_chromosomes = [l.chromosome for l in loci] chromosomes = sp.unique(gf_chromosomes) chromosomes.sort() chr_dict = _get_chrom_dict_(loci, chromosomes) print 'Parsing LD reference genotype bim file' plinkf_ref = plinkfile.PlinkFile(reference_genotype_file) loci_ref = plinkf_ref.get_loci() plinkf_ref.close() chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes) # chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes) #Open HDF5 file and prepare out data assert not 'iids' in hdf5_file.keys( ), 'Something is wrong with the HDF5 file?' if has_phenotype: hdf5_file.create_dataset('y', data=Y) hdf5_file.create_dataset('fids', data=fids) hdf5_file.create_dataset('iids', data=iids) ssf = hdf5_file['sum_stats'] cord_data_g = hdf5_file.create_group('cord_data') maf_adj_risk_scores = sp.zeros(num_individs) num_common_snps = 0 #corr_list = [] tot_g_ss_nt_concord_count = 0 tot_rg_ss_nt_concord_count = 0 tot_g_rg_nt_concord_count = 0 tot_num_non_matching_nts = 0 #Now iterate over chromosomes for chrom in chromosomes: ok_indices = {'g': [], 'rg': [], 'ss': []} chr_str = 'chrom_%d' % chrom print 'Working on chromsome: %s' % chr_str chrom_d = chr_dict[chr_str] chrom_d_ref = chr_dict_ref[chr_str] try: ssg = ssf['chrom_%d' % chrom] except Exception, err_str: print err_str print 'Did not find chromsome in SS dataset.' print 'Continuing.' continue ssg = ssf['chrom_%d' % chrom] g_sids = chrom_d['sids'] rg_sids = chrom_d_ref['sids'] ss_sids = ssg['sids'][...] print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' 
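
# Toy version of the phenotype check at the top of the function above: one
# unique value means no usable phenotype, two values are treated as
# case/control (counted with bincount), anything else as a quantitative trait.
import numpy as np

Y = np.array([0, 1, 1, 0, 1])
unique_phens = np.unique(Y)
if len(unique_phens) == 1:
    print('Unable to find phenotype values.')
elif len(unique_phens) == 2:
    cc_bins = np.bincount(Y)
    print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
else:
    print('Found quantitative phenotype values')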
% ( len(g_sids), len(rg_sids), len(ss_sids)) common_sids = sp.intersect1d(ss_sids, g_sids) common_sids = sp.intersect1d(common_sids, rg_sids) print 'Found %d SNPs on chrom %d that were common across all datasets' % ( len(common_sids), chrom) ss_snp_map = [] g_snp_map = [] rg_snp_map = [] ss_sid_dict = {} for i, sid in enumerate(ss_sids): ss_sid_dict[sid] = i g_sid_dict = {} for i, sid in enumerate(g_sids): g_sid_dict[sid] = i rg_sid_dict = {} for i, sid in enumerate(rg_sids): rg_sid_dict[sid] = i for sid in common_sids: g_snp_map.append(g_sid_dict[sid]) #order by positions g_positions = sp.array(chrom_d['positions'])[g_snp_map] order = sp.argsort(g_positions) #order = order.tolist() g_snp_map = sp.array(g_snp_map)[order] g_snp_map = g_snp_map.tolist() common_sids = sp.array(common_sids)[order] #Get the other two maps for sid in common_sids: rg_snp_map.append(rg_sid_dict[sid]) for sid in common_sids: ss_snp_map.append(ss_sid_dict[sid]) g_nts = sp.array(chrom_d['nts']) rg_nts = sp.array(chrom_d_ref['nts']) rg_nts_ok = sp.array(rg_nts)[rg_snp_map] # rg_nts_l = [] # for nt in rg_nts_ok: # rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]]) # rg_nts_ok = sp.array(rg_nts_l) ss_nts = ssg['nts'][...] betas = ssg['betas'][...] log_odds = ssg['log_odds'][...] if 'freqs' in ssg.keys(): ss_freqs = ssg['freqs'][...] g_ss_nt_concord_count = sp.sum( g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0 rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0 g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0 print 'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % ( len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count) tot_g_ss_nt_concord_count += g_ss_nt_concord_count tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count tot_g_rg_nt_concord_count += g_rg_nt_concord_count num_non_matching_nts = 0 num_ambig_nts = 0 #Identifying which SNPs have nucleotides that are ok.. ok_nts = [] for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map): #To make sure, is the SNP id the same? assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] rg_nt = rg_nts[rg_i] # rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]] ss_nt = ss_nts[ss_i] #Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in ambig_nts: num_ambig_nts += 1 tot_num_non_matching_nts += 1 continue #First check if nucleotide is sane? if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts): num_non_matching_nts += 1 tot_num_non_matching_nts += 1 continue os_g_nt = sp.array( [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]]) flip_nts = False if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))): if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) #Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg.keys(): ss_freqs[ss_i] = 1 - ss_freqs[ss_i] else: print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)) num_non_matching_nts += 1 tot_num_non_matching_nts += 1 continue else: num_non_matching_nts += 1 tot_num_non_matching_nts += 1 continue # Opposite strand nucleotides # everything seems ok. 
ok_indices['g'].append(g_i) ok_indices['rg'].append(rg_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) # if flip_nts: # ok_nts.append([ss_nt[1],ss_nt[0]]) # else: # ok_nts.append(ss_nt) #print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0) print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts print '%d SNPs were retained on chromosome %d.' % (len( ok_indices['g']), chrom) #Resorting by position positions = sp.array(chrom_d['positions'])[ok_indices['g']] # order = sp.argsort(positions) # sorted_positions = positions[order] # assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?' # ok_indices['g'] = list(sp.array(ok_indices['g'])[order]) # ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order]) #Now parse SNPs .. snp_indices = sp.array(chrom_d['snp_indices']) snp_indices = snp_indices[ ok_indices['g']] #Pinpoint where the SNPs are in the file. raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices) snp_indices_ref = sp.array(chrom_d_ref['snp_indices']) snp_indices_ref = snp_indices_ref[ ok_indices['rg']] #Pinpoint where the SNPs are in the file. raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref) snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref)) snp_means_ref = freqs_ref * 2 snp_stds = sp.sqrt(2 * freqs * (1 - freqs)) snp_means = freqs * 2 betas = betas[ok_indices['ss']] # * sp.sqrt(freqs * (1 - freqs)) log_odds = log_odds[ok_indices['ss']] # * sp.sqrt(freqs * (1 - freqs)) ps = ssg['ps'][...][ok_indices['ss']] nts = sp.array(ok_nts) #[order] sids = ssg['sids'][...][ok_indices['ss']] #For debugging... # g_sids = sp.array(chrom_d['sids'])[ok_indices['g']] # rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']] # ss_sids = ssg['sids'][...][ok_indices['ss']] # assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?' #Check SNP frequencies.. if check_mafs and 'freqs' in ssg.keys(): ss_freqs = ss_freqs[ok_indices['ss']] freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15 if sp.any(freq_discrepancy_snp): print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum( freq_discrepancy_snp) # print freqs[freq_discrepancy_snp] # print ss_freqs[freq_discrepancy_snp] #Filter freq_discrepancy_snps ok_freq_snps = sp.negative(freq_discrepancy_snp) raw_snps = raw_snps[ok_freq_snps] snp_stds = snp_stds[ok_freq_snps] snp_means = snp_means[ok_freq_snps] raw_ref_snps = raw_ref_snps[ok_freq_snps] snp_stds_ref = snp_stds_ref[ok_freq_snps] snp_means_ref = snp_means_ref[ok_freq_snps] freqs = freqs[ok_freq_snps] freqs_ref = freqs_ref[ok_freq_snps] ps = ps[ok_freq_snps] positions = positions[ok_freq_snps] nts = nts[ok_freq_snps] sids = sids[ok_freq_snps] betas = betas[ok_freq_snps] log_odds = log_odds[ok_freq_snps] #For debugging... # if sp.any(freq_discrepancy_snp): # g_sids = g_sids[ok_freq_snps] # rg_sids = rg_sids[ok_freq_snps] # ss_sids = ss_sids[ok_freq_snps] # assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?' #Filter minor allele frequency SNPs. maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf)) maf_filter_sum = sp.sum(maf_filter) n_snps = len(maf_filter) assert maf_filter_sum <= n_snps, "WTF?" 
if sp.sum(maf_filter) < n_snps: raw_snps = raw_snps[maf_filter] snp_stds = snp_stds[maf_filter] snp_means = snp_means[maf_filter] raw_ref_snps = raw_ref_snps[maf_filter] snp_stds_ref = snp_stds_ref[maf_filter] snp_means_ref = snp_means_ref[maf_filter] freqs = freqs[maf_filter] freqs_ref = freqs_ref[maf_filter] ps = ps[maf_filter] positions = positions[maf_filter] nts = nts[maf_filter] sids = sids[maf_filter] betas = betas[maf_filter] log_odds = log_odds[maf_filter] # if sp.sum(maf_filter)<n_snps: # g_sids = g_sids[maf_filter] # rg_sids = rg_sids[maf_filter] # ss_sids = ss_sids[maf_filter] # assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?' maf_adj_prs = sp.dot(log_odds, raw_snps) if has_phenotype: maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1] print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % ( chrom, maf_adj_corr) genetic_map = [] if genetic_map_dir is not None: with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f: for line in f: l = line.split() if l[0] in sid_set: genetic_map.append(l[0]) print 'Now storing coordinated data to HDF5 file.' ofg = cord_data_g.create_group('chrom_%d' % chrom) ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf') ofg.create_dataset('snp_stds_val', data=snp_stds) ofg.create_dataset('snp_means_val', data=snp_means) ofg.create_dataset('freqs_val', data=freqs) ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf') ofg.create_dataset('snp_stds_ref', data=snp_stds_ref) ofg.create_dataset('snp_means_ref', data=snp_means_ref) ofg.create_dataset('freqs_ref', data=freqs_ref) ofg.create_dataset('nts', data=nts) ofg.create_dataset('ps', data=ps) ofg.create_dataset('positions', data=positions) ofg.create_dataset('sids', data=sids) if genetic_map_dir is not None: ofg.create_dataset('genetic_map', data=genetic_map) ofg.create_dataset('betas', data=betas) ofg.create_dataset('log_odds', data=log_odds) ofg.create_dataset('log_odds_prs', data=maf_adj_prs) # print 'Sum betas', sp.sum(betas ** 2) #ofg.create_dataset('prs', data=prs) #risk_scores += prs maf_adj_risk_scores += maf_adj_prs num_common_snps += len(betas)
def varianceDecomposition(self, K=None, tech_noise=None, idx=None, i0=None, i1=None, max_iter=10, verbose=False, cache=True): """ Args: K: list of random effects to be considered in the analysis idx: indices of the genes to be considered in the analysis i0: gene index from which the anlysis starts i1: gene index to which the analysis stops max_iter: maximum number of random restarts verbose: if True, print progresses """ if tech_noise != None: self.set_tech_noise(tech_noise) assert self.tech_noise != None, 'scLVM:: specify technical noise' assert K != None, 'scLVM:: specify K' if type(K) != list: K = [K] for k in K: assert k.shape[0] == self.N, 'scLVM:: K dimension dismatch' assert k.shape[1] == self.N, 'scLVM:: K dimension dismatch' if idx == None: if i0 == None or i1 == None: i0 = 0 i1 = self.G idx = SP.arange(i0, i1) elif type(idx) != SP.ndarray: idx = SP.array(idx) idx = SP.intersect1d( SP.array(idx), SP.where(self.Y.std(0) > 0) [0]) #only makes sense if gene is expressed in at least one cell _G = len(idx) var = SP.zeros((_G, len(K) + 2)) _idx = SP.zeros(_G) geneID = SP.zeros(_G, dtype=str) conv = SP.zeros(_G) == 1 Ystar = [SP.zeros((self.N, _G)) for i in range(len(K))] count = 0 Yidx = self.Y[:, idx] Ystd = Yidx - Yidx.mean(0) Ystd /= Yidx.std(0) #delta optimization might be more efficient tech_noise = self.tech_noise[idx] / SP.array(Yidx.std(0))**2 for ids in range(_G): if verbose: print '.. fitting gene %d' % ids # extract a single gene y = Ystd[:, ids:ids + 1] # build and fit variance decomposition model vc = VAR.VarianceDecomposition(y) vc.addFixedEffect() for k in K: vc.addRandomEffect(k) vc.addRandomEffect(SP.eye(self.N)) vc.addRandomEffect(SP.eye(self.N)) vc.vd.getTerm(len(K) + 1).getKcf().setParamMask(SP.zeros(1)) for iter_i in range(max_iter): scales0 = y.std() * SP.randn(len(K) + 2) scales0[len(K) + 1] = SP.sqrt(tech_noise[ids]) _conv = vc.optimize(scales0=scales0) if _conv: break conv[count] = _conv if not _conv: var[count, -2] = SP.maximum(0, y.var() - tech_noise[ids]) var[count, -1] = tech_noise[ids] count += 1 continue _var = vc.getVarianceComps()[0, :] KiY = vc.gp.agetKEffInvYCache().ravel() for ki in range(len(K)): Ystar[ki][:, count] = _var[ki] * SP.dot(K[ki], KiY) var[count, :] = _var count += 1 if self.geneID != None: geneID = SP.array(self.geneID)[idx] col_header = ['hidden_%d' % i for i in range(len(K))] col_header.append('biol_noise') col_header.append('tech_noise') col_header = SP.array(col_header) #annotate column and rows of var and Ystar var_info = {'gene_idx': idx, 'col_header': col_header, 'conv': conv} if geneID != None: var_info['geneID'] = SP.array(geneID) Ystar_info = {'gene_idx': idx, 'conv': conv} if geneID != None: Ystar_info['geneID'] = SP.array(geneID) # cache stuff if cache == True: self.var = var self.Ystar = Ystar self.var_info = var_info self.Ystar_info = Ystar_info else: return var, var_info
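
# Minimal sketch of the per-gene preprocessing in varianceDecomposition:
# expression is standardised per gene, and the technical-noise variances are
# rescaled by the same per-gene variance so they stay on the standardised
# scale. Toy matrices; the limix variance-component fit itself is omitted.
import numpy as np

rng = np.random.default_rng(1)
Y = rng.normal(2.0, 1.5, size=(50, 4))        # cells x genes
tech_noise = np.array([0.2, 0.5, 0.1, 0.3])   # per-gene technical variance

idx = np.intersect1d(np.arange(Y.shape[1]),
                     np.where(Y.std(0) > 0)[0])   # only expressed genes
Yidx = Y[:, idx]
Ystd = (Yidx - Yidx.mean(0)) / Yidx.std(0)
tech_std = tech_noise[idx] / Yidx.std(0) ** 2     # rescaled technical noise
print(Ystd.mean(0).round(6), tech_std)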
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict, validation_genotype_file=None, genetic_map_dir=None, min_maf=0.01, skip_coordination=False, max_freq_discrep = 0.15, debug=False): summary_dict[3.9]={'name':'dash', 'value':'Coordination'} t0 = time.time() if validation_genotype_file is not None: print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).') else: print('Coordinating datasets (Summary statistics and LD reference genotypes).') plinkf = plinkfile.PlinkFile(reference_genotype_file) # Figure out chromosomes and positions. if debug: print('Parsing plinkf_dict_val reference genotypes') loci = plinkf.get_loci() plinkf.close() summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)} summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)} gf_chromosomes = [l.chromosome for l in loci] chromosomes = sp.unique(gf_chromosomes) chromosomes.sort() chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes) if validation_genotype_file is not None: if debug: print('Parsing LD validation bim file') plinkf_val = plinkfile.PlinkFile(validation_genotype_file) # Loads only the individuals... plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val) loci_val = plinkf_val.get_loci() plinkf_val.close() summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)} chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes) # Open HDF5 file and prepare out data assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.' if plinkf_dict_val['has_phenotype']: hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes']) summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']} hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype)) hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype)) maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs']) # Now summary statistics ssf = hdf5_file['sum_stats'] cord_data_g = hdf5_file.create_group('cord_data') num_common_snps = 0 # corr_list = [] chromosomes_found = set() num_snps_common_before_filtering =0 num_snps_common_after_filtering =0 tot_num_non_matching_nts = 0 tot_num_non_supported_nts = 0 tot_num_ambig_nts = 0 tot_num_freq_discrep_filtered_snps = 0 tot_num_maf_filtered_snps = 0 tot_g_ss_nt_concord_count = 0 if validation_genotype_file is not None: tot_g_vg_nt_concord_count = 0 tot_vg_ss_nt_concord_count = 0 # Now iterate over chromosomes chrom_i = 0 for chrom in chromosomes: chrom_i +=1 if not debug: sys.stdout.write('\r%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1)))) sys.stdout.flush() try: chr_str = 'chrom_%d' % chrom ssg = ssf[chr_str] except Exception as err_str: print(err_str) print('Did not find chromosome %d in SS dataset.'%chrom) print('Continuing.') continue if debug: print('Coordinating data for chromosome %s' % chr_str) chromosomes_found.add(chrom) #Get summary statistics chromosome group ssg = ssf['chrom_%d' % chrom] ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype) if validation_genotype_file is not None: chrom_d_val = chr_dict_val[chr_str] vg_sids = chrom_d_val['sids'] common_sids = sp.intersect1d(ss_sids, vg_sids) # A map from sid to index for validation data vg_sid_dict = {} for i, sid in enumerate(vg_sids): vg_sid_dict[sid] = i else: common_sids = ss_sids # A map from sid to index for summary stats 
ss_sid_dict = {} for i, sid in enumerate(ss_sids): ss_sid_dict[sid] = i #The indices to retain for the LD reference genotypes chrom_d = chr_dict[chr_str] g_sids = chrom_d['sids'] common_sids = sp.intersect1d(common_sids, g_sids) # A map from sid to index for LD reference data g_sid_dict = {} for i, sid in enumerate(g_sids): g_sid_dict[sid] = i if debug: print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom)) print('Ordering SNPs by genomic positions (based on LD reference genotypes).') g_snp_map = [] for sid in common_sids: g_snp_map.append(g_sid_dict[sid]) # order by positions (based on LD reference file) g_positions = sp.array(chrom_d['positions'])[g_snp_map] order = sp.argsort(g_positions) g_snp_map = sp.array(g_snp_map)[order] g_snp_map = g_snp_map.tolist() common_sids = sp.array(common_sids)[order] # Get the ordered sum stats SNPs indices. ss_snp_map = [] for sid in common_sids: ss_snp_map.append(ss_sid_dict[sid]) # Get the ordered validation SNPs indices if validation_genotype_file is not None: vg_snp_map = [] for sid in common_sids: vg_snp_map.append(vg_sid_dict[sid]) vg_nts = sp.array(chrom_d_val['nts']) vg_nts_ok = sp.array(vg_nts)[vg_snp_map] g_nts = sp.array(chrom_d['nts']) ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype) betas = ssg['betas'][...] log_odds = ssg['log_odds'][...] if 'freqs' in ssg: ss_freqs = ssg['freqs'][...] g_ss_nt_concord_count = sp.sum( g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0 if validation_genotype_file is not None: vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0 g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0 if debug: print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count)) tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count tot_g_vg_nt_concord_count += g_vg_nt_concord_count tot_g_ss_nt_concord_count += g_ss_nt_concord_count if debug: print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count)) num_freq_discrep_filtered_snps = 0 num_non_matching_nts = 0 num_non_supported_nts = 0 num_ambig_nts = 0 # Identifying which SNPs have nucleotides that are ok.. ok_nts = [] ok_indices = {'g': [], 'ss': []} if validation_genotype_file is not None: ok_indices['vg']=[] #Now loop over SNPs to coordinate nucleotides. if validation_genotype_file is not None: for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: vg_nt = vg_nts[vg_i] ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_supported_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. 
if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))): if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg: ss_freqs[ss_i] = 1 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue else: num_non_matching_nts += 1 continue # Opposite strand nucleotides # everything seems ok. ok_indices['g'].append(g_i) ok_indices['vg'].append(vg_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) else: for g_i, ss_i in zip(g_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_matching_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. if not sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg and ss_freqs[ss_i]>0: ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue # everything seems ok. ok_indices['g'].append(g_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) if debug: print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts) print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts) # Resorting by position positions = sp.array(chrom_d['positions'])[ok_indices['g']] # Now parse SNPs .. snp_indices = sp.array(chrom_d['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices = snp_indices[ok_indices['g']] raw_snps, freqs = plinkfiles.parse_plink_snps( reference_genotype_file, snp_indices) snp_stds = sp.sqrt(2 * freqs * (1 - freqs)) snp_means = freqs * 2 betas = betas[ok_indices['ss']] log_odds = log_odds[ok_indices['ss']] ns = ssg['ns'][...][ok_indices['ss']] ps = ssg['ps'][...][ok_indices['ss']] nts = sp.array(ok_nts) sids = (ssg['sids'][...]).astype(util.sids_u_dtype) sids = sids[ok_indices['ss']] #Parse validation genotypes, if available if validation_genotype_file is not None: snp_indices_val = sp.array(chrom_d_val['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices_val = snp_indices_val[ok_indices['vg']] raw_snps_val, freqs_val = plinkfiles.parse_plink_snps( validation_genotype_file, snp_indices_val) snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val)) snp_means_val = freqs_val * 2 # Check SNP frequencies, screen for possible problems.. 
        if max_freq_discrep < 1 and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,
                                         sp.absolute(ss_freqs + freqs - 1) < max_freq_discrep)  # Array of np.bool values
            ok_freq_snps = sp.logical_or(ok_freq_snps, ss_freqs <= 0)  # Only consider SNPs that actually have frequencies
            num_freq_discrep_filtered_snps = len(ok_freq_snps) - sp.sum(ok_freq_snps)
            assert num_freq_discrep_filtered_snps >= 0, "Problems when filtering SNPs with frequency discrepancies"
            if num_freq_discrep_filtered_snps > 0:
                # Filter freq_discrepancy_snps
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                ns = ns[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                if validation_genotype_file is not None:
                    raw_snps_val = raw_snps_val[ok_freq_snps]
                    snp_stds_val = snp_stds_val[ok_freq_snps]
                    snp_means_val = snp_means_val[ok_freq_snps]
                    freqs_val = freqs_val[ok_freq_snps]
                if debug:
                    print('Filtered %d SNPs due to frequency discrepancies' % num_freq_discrep_filtered_snps)

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        num_maf_filtered_snps = len(maf_filter) - sp.sum(maf_filter)
        assert num_maf_filtered_snps >= 0, "Problems when filtering SNPs with low minor allele frequencies"
        if num_maf_filtered_snps > 0:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            ns = ns[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            if validation_genotype_file is not None:
                raw_snps_val = raw_snps_val[maf_filter]
                snp_stds_val = snp_stds_val[maf_filter]
                snp_means_val = snp_means_val[maf_filter]
                freqs_val = freqs_val[maf_filter]
            if debug:
                print('Filtered %d SNPs due to low MAF' % num_maf_filtered_snps)

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    # if l[0] in sid_set:
                    #     genetic_map.append(l[0])
        else:
            genetic_map = None

        coord_data_dict = {'chrom': 'chrom_%d' % chrom,
                           'raw_snps_ref': raw_snps,
                           'snp_stds_ref': snp_stds,
                           'snp_means_ref': snp_means,
                           'freqs_ref': freqs,
                           'ps': ps,
                           'ns': ns,
                           'positions': positions,
                           'nts': nts,
                           'sids': sids,
                           'genetic_map': genetic_map,
                           'betas': betas,
                           'log_odds': log_odds}
        if validation_genotype_file is not None:
            maf_adj_prs = sp.dot(log_odds, raw_snps_val)
            if debug and plinkf_dict_val['has_phenotype']:
                maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1]
                print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))
            coord_data_dict['raw_snps_val'] = raw_snps_val
            coord_data_dict['snp_stds_val'] = snp_stds_val
            coord_data_dict['snp_means_val'] = snp_means_val
            coord_data_dict['freqs_val'] = freqs_val
            coord_data_dict['log_odds_prs'] = maf_adj_prs
            maf_adj_risk_scores += maf_adj_prs

        write_coord_data(cord_data_g, coord_data_dict, debug=debug)
        if debug:
            print('%d SNPs were retained on chromosome %d.' % (len(sids), chrom))

        num_snps_common_before_filtering += len(common_sids)
        num_snps_common_after_filtering += len(sids)
        tot_num_ambig_nts += num_ambig_nts
        tot_num_non_supported_nts += num_non_supported_nts
        tot_num_non_matching_nts += num_non_matching_nts
        tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps
        tot_num_maf_filtered_snps += num_maf_filtered_snps

        if not debug:
            sys.stdout.write('\r%0.2f%%\n' % (100.0))
            sys.stdout.flush()

    # Now calculate the prediction r^2
    if validation_genotype_file:
        if debug and plinkf_dict_val['has_phenotype']:
            maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1]
            print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2))
        print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' %
              (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count))
    else:
        if debug:
            print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count))

    summary_dict[7] = {'name': 'Num chromosomes used:', 'value': len(chromosomes_found)}
    summary_dict[8] = {'name': 'SNPs common across datasets:', 'value': num_snps_common_before_filtering}
    summary_dict[9] = {'name': 'SNPs retained after filtering:', 'value': num_snps_common_after_filtering}
    if tot_num_ambig_nts > 0:
        summary_dict[10] = {'name': 'SNPs w ambiguous nucleotides filtered:', 'value': tot_num_ambig_nts}
    if tot_num_non_supported_nts > 0:
        summary_dict[10.1] = {'name': 'SNPs w unknown/unsupported nucleotides filtered:', 'value': tot_num_non_supported_nts}
    if tot_num_non_matching_nts > 0:
        summary_dict[11] = {'name': 'SNPs w other nucleotide discrepancies filtered:', 'value': tot_num_non_matching_nts}
    if min_maf > 0:
        summary_dict[12] = {'name': 'SNPs w MAF<%0.3f filtered:' % min_maf, 'value': tot_num_maf_filtered_snps}
    if max_freq_discrep < 0.5:
        summary_dict[13] = {'name': 'SNPs w allele freq discrepancy > %0.3f filtered:' % max_freq_discrep, 'value': tot_num_freq_discrep_filtered_snps}

    t1 = time.time()
    t = (t1 - t0)
    summary_dict[13.9] = {'name': 'dash', 'value': 'Running times'}
    summary_dict[15] = {'name': 'Run time for coordinating datasets:', 'value': '%d min and %0.2f sec' % (t / 60, t % 60)}
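# A minimal, self-contained sketch of the allele-coordination rule applied
# above (illustrative only; OPP_STRAND and AMBIG_NTS are hypothetical
# stand-ins for util.opp_strand_dict and util.ambig_nts): a summary-statistics
# allele pair is kept if it matches the genotype pair directly or on the
# opposite strand; if it only matches with the two alleles swapped, the
# effect sign is flipped.
import numpy as np

OPP_STRAND = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
AMBIG_NTS = {('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C')}

def coordinate_alleles(g_nt, ss_nt, beta):
    """Return (keep, beta) after strand/order coordination of one SNP."""
    if tuple(g_nt) in AMBIG_NTS:
        return False, beta  # A/T and C/G SNPs cannot be strand-resolved
    os_g_nt = [OPP_STRAND[g_nt[0]], OPP_STRAND[g_nt[1]]]
    if list(ss_nt) in (list(g_nt), os_g_nt):
        return True, beta  # direct or opposite-strand match
    if list(ss_nt) in ([g_nt[1], g_nt[0]], [os_g_nt[1], os_g_nt[0]]):
        return True, -beta  # alleles swapped: flip the effect sign
    return False, beta  # irreconcilable nucleotides

print(coordinate_alleles(('A', 'G'), ('G', 'A'), 0.2))  # (True, -0.2)
print(coordinate_alleles(('A', 'G'), ('T', 'C'), 0.2))  # (True, 0.2)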
def fitLMM(self, expr=None, K=None, tech_noise=None, idx=None, i0=None, i1=None,
           verbose=False, recalc=True, standardize=True):
    """
    Args:
        K: list of random effects to be considered in the analysis
            if K is none, it does not consider any random effect
        expr: correlations are calculated between the gene expression data (self.Y)
            and these measures provided in expr. If None, self.Y is used
        idx: indices of the genes to be considered in the analysis
        i0: gene index from which the analysis starts
        i1: gene index to which the analysis stops
        verbose: if True, print progress
        recalc: if True, re-do variance decomposition
        standardize: if True, standardize also expression
    Returns:
        pv: matrix of pvalues
        beta: matrix of correlations
        info: dictionary annotates pv and beta rows and columns, containing
            gene_idx_row: index of the genes in rows
            conv: boolean vector marking genes for which variance decomposition has converged
            gene_row: annotate rows of matrices
    """
    if idx == None:
        if i0 == None or i1 == None:
            i0 = 0
            i1 = self.G
        idx = SP.arange(i0, i1)
    elif type(idx) != SP.ndarray:
        idx = SP.array(idx)
    idx = SP.intersect1d(
        idx, SP.where(self.Y.std(0) > 0)[0])  # only makes sense if gene is expressed in at least one cell

    if K != None:
        if type(K) != list:
            K = [K]
        if (recalc == True and len(K) > 1) or (recalc == True and self.var == None):
            print 'performing variance decomposition first...'
            var_raw, var_info = self.varianceDecomposition(K=K, idx=idx, cache=False)
            var = var_raw / var_raw.sum(1)[:, SP.newaxis]
        elif recalc == False and len(K) > 1:
            assert self.var != None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition must be run prior to this method'
            warnings.warn(
                'scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decomposition was performed earlier.')
            var_raw = self.var
            var_info = self.var_info
            var = var_raw / var_raw.sum(1)[:, SP.newaxis]

    lmm_params = {
        'covs': SP.ones([self.N, 1]),
        'NumIntervalsDeltaAlt': 100,
        'NumIntervalsDelta0': 100,
        'searchDelta': True
    }

    Yidx = self.Y[:, idx]
    Ystd = Yidx - Yidx.mean(0)
    Ystd /= Yidx.std(0)  # delta optimization might be more efficient
    if expr == None:
        expr = Ystd
    elif standardize == True:
        exprStd = expr
        exprStd = expr - expr.mean(0)
        exprStd /= expr.std(0)
        expr = exprStd

    _G1 = idx.shape[0]
    _G2 = expr.shape[1]
    geneID = SP.zeros(_G1, dtype=str)
    beta = SP.zeros((_G1, _G2))
    pv = SP.zeros((_G1, _G2))
    count = 0
    for ids in range(_G1):
        if verbose:
            print '.. fitting gene %d' % ids
        # extract a single gene
        if K != None:
            if len(K) > 1:
                if var_info['conv'][count] == True:
                    _K = SP.sum([var[count, i] * K[i] for i in range(len(K))], 0)
                    _K /= _K.diagonal().mean()
                else:
                    _K = None
            else:
                _K = K[0]
        else:
            _K = None
        lm = QTL.test_lmm(expr, Ystd[:, ids:ids + 1], K=_K, verbose=False, **lmm_params)
        pv[count, :] = lm.getPv()[0, :]
        beta[count, :] = lm.getBetaSNP()[0, :]
        count += 1

    if self.geneID != None:
        geneID = SP.array(self.geneID)[idx]
    if recalc == True and K != None and len(K) > 1:
        info = {'conv': var_info['conv'], 'gene_idx_row': idx}
    else:
        info = {'gene_idx_row': idx}
    if geneID != None:
        info['gene_row'] = geneID

    return pv, beta, info
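# A minimal sketch of the per-gene covariance construction used in the loop
# above: a variance-weighted sum of the random-effect matrices, rescaled to a
# unit mean diagonal. K1/K2 and the weights are toy values, not scLVM output.
import numpy as np

N = 50
rng = np.random.RandomState(0)
A = rng.randn(N, 5)
K1 = A.dot(A.T)        # e.g. a cell-cycle covariance
K2 = np.eye(N)         # e.g. an iid component
weights = [0.7, 0.3]   # variance fractions for one gene (var[count, :len(K)])

K_combined = sum(w * Ki for w, Ki in zip(weights, [K1, K2]))
K_combined /= K_combined.diagonal().mean()  # normalize to unit mean diagonal
print(np.isclose(K_combined.diagonal().mean(), 1.0))  # True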
QueryGOIDs = []
NQueryGeneEntrezIDs = 0
for i in xrange(len(QueryGeneEntrezIDs)):
    if QueryGeneEntrezIDs[i] == 'None':  # was: QueryGeneEntrezIDs=='None', which never matched an entry
        continue
    NQueryGeneEntrezIDs += 1
    Ind = scipy.where(BGSData[BGSHeader.index('GeneEntrezID')] == QueryGeneEntrezIDs[i])[0]
    QueryGOIDs.extend(BGSData[BGSHeader.index('GOId'), Ind].tolist())
QueryGOIDs = scipy.unique(scipy.array(QueryGOIDs))

for g in xrange(len(QueryGOIDs)):
    Indices = scipy.where(BGSData[BGSHeader.index('GOId')] == QueryGOIDs[g])[0]
    EntrezGenes = BGSData[BGSHeader.index('GeneEntrezID'), Indices]
    NGenesWithThisGOID = len(Indices)
    QueryGenesWithThisGOID = scipy.intersect1d(EntrezGenes, QueryGeneEntrezIDs)
    NQueryGenesWithThisGOID = len(QueryGenesWithThisGOID)
    QueryGSymbols = []
    for EID in QueryGenesWithThisGOID:
        QueryGSymbols.append(QueryGeneSymbols[QueryGeneEntrezIDs.index(EID)])
    EnrichmentFactor = (float(NQueryGenesWithThisGOID) / float(len(QueryGeneEntrezIDs))) / \
                       (float(NGenesWithThisGOID) / float(NGenesWithGOEntry))
    GODescr = BGSData[BGSHeader.index('GODescr'), Indices[0]]
    # sf(k) gives P(X > k); the enrichment tail P(X >= k) needs sf(k - 1)
    PValue = scipy.stats.hypergeom.sf(NQueryGenesWithThisGOID - 1, NGenesWithGOEntry, NQueryGeneEntrezIDs, NGenesWithThisGOID)
    print g,\
        QueryGOIDs[g],\
        GODescr,\
        NGenesWithThisGOID,\
        NGenesWithGOEntry,\
        NQueryGenesWithThisGOID,\
        len(QueryGeneEntrezIDs),\
        ','.join(QueryGenesWithThisGOID),\
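# Why the p-value above uses sf(k - 1): scipy's hypergeom.sf(k, M, n, N)
# returns P(X > k), so the enrichment tail P(X >= k) requires sf(k - 1).
# A quick self-check with made-up counts (not from the data above):
from scipy.stats import hypergeom

M, n, N, k = 5000, 400, 120, 18  # background, query genes, GO-term genes, overlap
p_ge = hypergeom.sf(k - 1, M, n, N)                    # P(X >= k)
p_sum = hypergeom.pmf(range(k, N + 1), M, n, N).sum()  # same tail, summed directly
print(abs(p_ge - p_sum) < 1e-10)                       # True: the two tails agree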
    if ((not os.path.exists(count_file_GRCH37)) or
            (not os.path.exists(count_file_SNP_maternal)) or
            (not os.path.exists(count_file_SNP_paternal)) or
            (not os.path.exists(count_file_SV_maternal)) or
            (not os.path.exists(count_file_SV_paternal))):
        print "skip: %s" % element_id
        RV_file_exist.append([element_id,
                              os.path.exists(count_file_GRCH37),
                              os.path.exists(count_file_SNP_maternal),
                              os.path.exists(count_file_SNP_paternal),
                              os.path.exists(count_file_SV_maternal),
                              os.path.exists(count_file_SV_paternal)])
        RV_file.append([element_id, count_file_GRCH37, count_file_SNP_maternal,
                        count_file_SNP_paternal, count_file_SV_maternal, count_file_SV_paternal])
        continue

    # 1. load lists
    count_GRCH37 = cPickle.load(open(count_file_GRCH37, 'rb'))
    count_SNP_maternal = cPickle.load(open(count_file_SNP_maternal, 'rb'))
    count_SNP_paternal = cPickle.load(open(count_file_SNP_paternal, 'rb'))
    count_SV_maternal = cPickle.load(open(count_file_SV_maternal, 'rb'))
    count_SV_paternal = cPickle.load(open(count_file_SV_paternal, 'rb'))

    count_SNP = SP.union1d(count_SNP_maternal, count_SNP_paternal)
    count_SV = SP.union1d(count_SV_maternal, count_SV_paternal)

    count_intersect_GRCH37_SNP = SP.intersect1d(count_SNP, count_GRCH37)
    count_intersect_GRCH37_SV = SP.intersect1d(count_SV, count_GRCH37)
    count_intersect_SNP_SV = SP.intersect1d(count_SNP, count_SV)
    count_ex_GRCH37_SNP = SP.setdiff1d(count_GRCH37, count_SNP)
    count_ex_GRCH37_SV = SP.setdiff1d(count_GRCH37, count_SV)
    count_ex_SNP_GRCH37 = SP.setdiff1d(count_SNP, count_GRCH37)
    count_ex_SV_GRCH37 = SP.setdiff1d(count_SV, count_GRCH37)
    count_ex_SNP_SV = SP.setdiff1d(count_SNP, count_SV)
    count_ex_SV_SNP = SP.setdiff1d(count_SV, count_SNP)

    # store a couple of things
    rv = {'element_id': element_id,
          'count_ref': len(count_GRCH37),
          'count_SNP_maternal': len(count_SNP_maternal),
          'count_SNP_paternal': len(count_SNP_paternal),
          'count_SV_maternal': len(count_SV_maternal),
          'count_SV_paternal': len(count_SV_paternal),
          'count_SNP': len(count_SNP),
          'count_SV': len(count_SV),
          'count_intersect_GRCH37_SNP': len(count_intersect_GRCH37_SNP),
          'count_intersect_GRCH37_SV': len(count_intersect_GRCH37_SV),
          'count_intersect_SNP_SV': len(count_intersect_SNP_SV),
          'count_ex_GRCH37_SNP': len(count_ex_GRCH37_SNP),
          'count_ex_GRCH37_SV': len(count_ex_GRCH37_SV),
          'count_ex_SNP_GRCH37': len(count_ex_SNP_GRCH37),
          'count_ex_SV_GRCH37': len(count_ex_SV_GRCH37),
          'count_ex_SNP_SV': len(count_ex_SNP_SV),
          'count_ex_SV_SNP': len(count_ex_SV_SNP)}
    RV.append(rv)
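# The read-ID bookkeeping above reduces to set algebra on 1-d arrays. A minimal
# sketch with toy read IDs; numpy stands in for the SP alias used above:
import numpy as np

ref = np.array([1, 2, 3, 4, 5])   # reads recovered on the reference
alt_m = np.array([2, 3, 6])       # maternal alternative haplotype
alt_p = np.array([3, 4, 7])       # paternal alternative haplotype

alt = np.union1d(alt_m, alt_p)         # recovered on either haplotype
both = np.intersect1d(ref, alt)        # recovered by reference and alternative
ref_only = np.setdiff1d(ref, alt)      # lost when mapping to the alternative
alt_only = np.setdiff1d(alt, ref)      # gained by the alternative
print(len(both), len(ref_only), len(alt_only))  # 3 2 2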
def verify_alt_prime(event, gene, counts_segments, counts_edges, CFG):
    # [verified, info] = verify_exon_skip(event, fn_bam, cfg)

    # (0) valid, (1) exon_diff_cov, (2) exon_const_cov
    # (3) intron1_conf, (4) intron2_conf
    info = [1, 0, 0, 0, 0]
    verified = [0, 0]

    ### check validity of exon coordinates (>=0)
    if sp.any(event.exons1 < 0) or sp.any(event.exons2 < 0):
        info[0] = 0
        return (verified, info)

    ### check validity of intron coordinates (only one side is differing)
    if (event.exons1[0, 1] != event.exons2[0, 1]) and (event.exons1[1, 0] != event.exons2[1, 0]):
        info[0] = 0
        return (verified, info)

    sg = gene.splicegraph
    segs = gene.segmentgraph

    ### find exons corresponding to event
    idx_exon11 = sp.where((sg.vertices[0, :] == event.exons1[0, 0]) & (sg.vertices[1, :] == event.exons1[0, 1]))[0]
    if idx_exon11.shape[0] == 0:
        segs_exon11 = sp.where((segs.segments[0, :] >= event.exons1[0, 0]) & (segs.segments[1, :] <= event.exons1[0, 1]))[0]
    else:
        segs_exon11 = sp.where(segs.seg_match[idx_exon11, :])[1]
    idx_exon12 = sp.where((sg.vertices[0, :] == event.exons1[1, 0]) & (sg.vertices[1, :] == event.exons1[1, 1]))[0]
    if idx_exon12.shape[0] == 0:
        segs_exon12 = sp.where((segs.segments[0, :] >= event.exons1[1, 0]) & (segs.segments[1, :] <= event.exons1[1, 1]))[0]
    else:
        segs_exon12 = sp.where(segs.seg_match[idx_exon12, :])[1]
    idx_exon21 = sp.where((sg.vertices[0, :] == event.exons2[0, 0]) & (sg.vertices[1, :] == event.exons2[0, 1]))[0]
    if idx_exon21.shape[0] == 0:
        segs_exon21 = sp.where((segs.segments[0, :] >= event.exons2[0, 0]) & (segs.segments[1, :] <= event.exons2[0, 1]))[0]
    else:
        segs_exon21 = sp.where(segs.seg_match[idx_exon21, :])[1]
    idx_exon22 = sp.where((sg.vertices[0, :] == event.exons2[1, 0]) & (sg.vertices[1, :] == event.exons2[1, 1]))[0]
    if idx_exon22.shape[0] == 0:
        segs_exon22 = sp.where((segs.segments[0, :] >= event.exons2[1, 0]) & (segs.segments[1, :] <= event.exons2[1, 1]))[0]
    else:
        segs_exon22 = sp.where(segs.seg_match[idx_exon22, :] > 0)[1]

    assert (segs_exon11.shape[0] > 0)
    assert (segs_exon12.shape[0] > 0)
    assert (segs_exon21.shape[0] > 0)
    assert (segs_exon22.shape[0] > 0)

    if sp.all(segs_exon11 == segs_exon21):
        seg_exon_const = segs_exon11
        seg_diff = sp.setdiff1d(segs_exon12, segs_exon22)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon22, segs_exon12)
        seg_const = sp.intersect1d(segs_exon12, segs_exon22)
    elif sp.all(segs_exon12 == segs_exon22):
        seg_exon_const = segs_exon12
        seg_diff = sp.setdiff1d(segs_exon11, segs_exon21)
        if seg_diff.shape[0] == 0:
            seg_diff = sp.setdiff1d(segs_exon21, segs_exon11)
        seg_const = sp.intersect1d(segs_exon21, segs_exon11)
    else:
        print >> sys.stderr, "ERROR: both exons differ in alt prime event in verify_alt_prime"
        sys.exit(1)
    seg_const = sp.r_[seg_exon_const, seg_const]

    seg_lens = segs.segments[1, :] - segs.segments[0, :]

    # exon_diff_cov
    info[1] = sp.sum(counts_segments[seg_diff] * seg_lens[seg_diff]) / sp.sum(seg_lens[seg_diff])
    # exon_const_cov
    info[2] = sp.sum(counts_segments[seg_const] * seg_lens[seg_const]) / sp.sum(seg_lens[seg_const])

    if info[1] >= CFG['alt_prime']['min_diff_rel_cov'] * info[2]:
        verified[0] = 1

    ### check intron confirmations as sum of valid intron scores
    ### intron score is the number of reads confirming this intron
    # intron1_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon11[-1], segs_exon12[0]], segs.seg_edges.shape))[0]
    assert (idx.shape[0] > 0)
    info[3] = counts_edges[idx, 1]
    # intron2_conf
    idx = sp.where(counts_edges[:, 0] == sp.ravel_multi_index([segs_exon21[-1], segs_exon22[0]], segs.seg_edges.shape))[0]
    assert (idx.shape[0] > 0)
    info[4] = counts_edges[idx, 1]

    if min(info[3], info[4]) >= CFG['alt_prime']['min_intron_count']:
        verified[1] = 1

    return (verified, info)
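# exon_diff_cov / exon_const_cov above are length-weighted means of per-segment
# coverage. A minimal sketch with toy segment coordinates and counts (the real
# values come from gene.segmentgraph and the count files):
import numpy as np

segments = np.array([[100, 150, 180],   # segment starts
                     [150, 180, 260]])  # segment ends
counts = np.array([10.0, 4.0, 6.0])     # mean coverage per segment
seg_lens = segments[1, :] - segments[0, :]

idx = np.array([0, 2])  # segments belonging to one exon set
cov = np.sum(counts[idx] * seg_lens[idx]) / np.sum(seg_lens[idx])
print(cov)  # (10*50 + 6*80) / 130 ~= 7.54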
def varianceDecomposition(self, K=None, tech_noise=None, idx=None, i0=None, i1=None,
                          max_iter=10, verbose=False, cache=True):
    """
    Args:
        K: list of random effects to be considered in the analysis
        idx: indices of the genes to be considered in the analysis
        i0: gene index from which the analysis starts
        i1: gene index to which the analysis stops
        max_iter: maximum number of random restarts
        verbose: if True, print progress
    """
    if tech_noise != None:
        self.set_tech_noise(tech_noise)
    assert self.tech_noise != None, 'scLVM:: specify technical noise'
    assert K != None, 'scLVM:: specify K'

    if type(K) != list:
        K = [K]
    for k in K:
        assert k.shape[0] == self.N, 'scLVM:: K dimension mismatch'
        assert k.shape[1] == self.N, 'scLVM:: K dimension mismatch'

    if idx == None:
        if i0 == None or i1 == None:
            i0 = 0; i1 = self.G
        idx = SP.arange(i0, i1)
    elif type(idx) != SP.ndarray:
        idx = SP.array(idx)
    idx = SP.intersect1d(SP.array(idx), SP.where(self.Y.std(0) > 0)[0])  # only makes sense if gene is expressed in at least one cell

    _G = len(idx)
    var = SP.zeros((_G, len(K) + 2))
    _idx = SP.zeros(_G)
    geneID = SP.zeros(_G, dtype=str)
    conv = SP.zeros(_G) == 1
    Ystar = [SP.zeros((self.N, _G)) for i in range(len(K))]
    count = 0
    Yidx = self.Y[:, idx]
    Ystd = Yidx - Yidx.mean(0)
    Ystd /= Yidx.std(0)  # delta optimization might be more efficient
    tech_noise = self.tech_noise[idx] / SP.array(Yidx.std(0))**2
    for ids in range(_G):
        if verbose:
            print '.. fitting gene %d' % ids
        # extract a single gene
        y = Ystd[:, ids:ids + 1]
        # build and fit variance decomposition model
        vc = VAR.VarianceDecomposition(y)
        vc.addFixedEffect()
        for k in K:
            vc.addRandomEffect(k)
        vc.addRandomEffect(SP.eye(self.N))
        vc.addRandomEffect(SP.eye(self.N))
        vc.vd.getTerm(len(K) + 1).getKcf().setParamMask(SP.zeros(1))
        for iter_i in range(max_iter):
            scales0 = y.std() * SP.randn(len(K) + 2)
            scales0[len(K) + 1] = SP.sqrt(tech_noise[ids])
            _conv = vc.optimize(scales0=scales0)
            if _conv:
                break
        conv[count] = _conv
        if not _conv:
            var[count, -2] = SP.maximum(0, y.var() - tech_noise[ids])
            var[count, -1] = tech_noise[ids]
            count += 1
            continue
        _var = vc.getVarianceComps()[0, :]
        KiY = vc.gp.agetKEffInvYCache().ravel()
        for ki in range(len(K)):
            Ystar[ki][:, count] = _var[ki] * SP.dot(K[ki], KiY)
        var[count, :] = _var
        count += 1

    if self.geneID != None:
        geneID = SP.array(self.geneID)[idx]
    col_header = ['hidden_%d' % i for i in range(len(K))]
    col_header.append('biol_noise')
    col_header.append('tech_noise')
    col_header = SP.array(col_header)

    # annotate column and rows of var and Ystar
    var_info = {'gene_idx': idx, 'col_header': col_header, 'conv': conv}
    if geneID != None:
        var_info['geneID'] = SP.array(geneID)
    Ystar_info = {'gene_idx': idx, 'conv': conv}
    if geneID != None:
        Ystar_info['geneID'] = SP.array(geneID)

    # cache stuff
    if cache == True:
        self.var = var
        self.Ystar = Ystar
        self.var_info = var_info
        self.Ystar_info = Ystar_info
    else:
        return var, var_info
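# Two scalings from varianceDecomposition, in isolation: technical noise is
# divided by the per-gene variance because genes are standardized before
# fitting, and raw variance components become fractions by row-normalization.
# Toy numbers only; the real inputs are self.tech_noise and the fitted var.
import numpy as np

Y = np.random.RandomState(0).randn(100, 3) * np.array([1.0, 2.0, 0.5])
tech_noise = np.array([0.2, 0.8, 0.05])
tech_noise_std = tech_noise / Y.std(0) ** 2    # noise on the standardized scale

var_raw = np.array([[0.5, 0.3, 0.2],
                    [0.1, 0.1, 0.8]])          # [hidden, biol_noise, tech_noise]
var = var_raw / var_raw.sum(1)[:, np.newaxis]  # fractions per gene
print(var.sum(1))  # [1. 1.]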
def fitLMM(self, expr=None, K=None, tech_noise=None, idx=None, i0=None, i1=None, verbose=False, recalc=True, standardize=True):
    """
    Args:
        K: list of random effects to be considered in the analysis
            if K is none, it does not consider any random effect
        expr: correlations are calculated between the gene expression data (self.Y) and these measures provided in expr. If None, self.Y is used
        idx: indices of the genes to be considered in the analysis
        i0: gene index from which the analysis starts
        i1: gene index to which the analysis stops
        verbose: if True, print progress
        recalc: if True, re-do variance decomposition
        standardize: if True, standardize also expression
    Returns:
        pv: matrix of pvalues
        beta: matrix of correlations
        info: dictionary annotates pv and beta rows and columns, containing
            gene_idx_row: index of the genes in rows
            conv: boolean vector marking genes for which variance decomposition has converged
            gene_row: annotate rows of matrices
    """
    if idx==None:
        if i0==None or i1==None:
            i0 = 0; i1 = self.G
        idx = SP.arange(i0,i1)
    elif type(idx)!=SP.ndarray:
        idx = SP.array(idx)
    idx = SP.intersect1d(idx,SP.where(self.Y.std(0)>0)[0])  #only makes sense if gene is expressed in at least one cell

    if K!=None:
        if type(K)!=list:
            K = [K]
        if (recalc==True and len(K)>1) or (recalc==True and self.var==None):
            print 'performing variance decomposition first...'
            var_raw,var_info = self.varianceDecomposition(K=K,idx=idx, cache=False)
            var = var_raw/var_raw.sum(1)[:,SP.newaxis]
        elif recalc==False and len(K)>1:
            assert self.var!=None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition must be run prior to this method'
            warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decomposition was performed earlier.')
            var_raw = self.var
            var_info = self.var_info
            var = var_raw/var_raw.sum(1)[:,SP.newaxis]

    lmm_params = {'covs':SP.ones([self.N,1]),'NumIntervalsDeltaAlt':100,'NumIntervalsDelta0':100,'searchDelta':True}

    Yidx = self.Y[:,idx]
    Ystd = Yidx-Yidx.mean(0)
    Ystd/= Yidx.std(0)  #delta optimization might be more efficient
    if expr==None:
        expr = Ystd
    elif standardize==True:
        exprStd = expr
        exprStd = expr-expr.mean(0)
        exprStd/= expr.std(0)
        expr = exprStd

    _G1 = idx.shape[0]
    _G2 = expr.shape[1]
    geneID = SP.zeros(_G1,dtype=str)
    beta = SP.zeros((_G1,_G2))
    pv = SP.zeros((_G1,_G2))
    count = 0
    for ids in range(_G1):
        if verbose:
            print '.. fitting gene %d'%ids
        # extract a single gene
        if K!=None:
            if len(K)>1:
                if var_info['conv'][count]==True:
                    _K = SP.sum([var[count,i]*K[i] for i in range(len(K))],0)
                    _K/= _K.diagonal().mean()
                else:
                    _K = None
            else:
                _K = K[0]
        else:
            _K = None
        lm = QTL.test_lmm(expr,Ystd[:,ids:ids+1],K=_K,**lmm_params)
        pv[count,:] = lm.getPv()[0,:]
        beta[count,:] = lm.getBetaSNP()[0,:]
        count+=1

    if self.geneID!=None:
        geneID = SP.array(self.geneID)[idx]
    if recalc==True and K!=None and len(K)>1:
        info = {'conv':var_info['conv'],'gene_idx_row':idx}
    else:
        info = {'gene_idx_row':idx}
    if geneID!=None:
        info['gene_row'] = geneID

    return pv, beta, info
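# The expressed-gene filter shared by fitLMM and varianceDecomposition, in
# isolation: keep only requested indices whose column has nonzero variance.
# Toy data; note that constant genes drop out too, since their std is 0.
import numpy as np

Y = np.array([[1.0, 0.0, 2.0],
              [3.0, 0.0, 2.0]])  # gene 1 is all-zero, gene 2 is constant
idx = np.arange(3)
idx = np.intersect1d(idx, np.where(Y.std(0) > 0)[0])
print(idx)  # [0]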
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01,
                                     skip_coordination=False,
                                     debug=False):
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' %
          (genotype_file, reference_genotype_file))

    from plinkio import plinkfile
    plinkf = plinkfile.PlinkFile(genotype_file)

    # Loads only the individuals...
    plinkf_dict = plinkfiles.get_phenotypes(plinkf)

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing validation bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    if debug:
        print('Parsing LD reference bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    chr_dict_ref = plinkfiles.get_chrom_dict(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    # (this assert fires if the file was already populated)
    assert 'iids' not in hdf5_file, 'Something is wrong with the HDF5 file: individual IDs are already present.'
    if plinkf_dict['has_phenotype']:
        hdf5_file.create_dataset('y', data=plinkf_dict['phenotypes'])

    hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict['fids'], dtype=util.fids_dtype))
    hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict['iids'], dtype=util.iids_dtype))
    ssf = hdf5_file['sum_stats']

    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(plinkf_dict['num_individs'])
    num_common_snps = 0
    # corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print('Coordinating data for chromosome %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]

        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            print(err_str)
            print('Did not find chromosome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if debug:
            print('Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' %
                  (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        if debug:
            print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i
        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i
        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])

        # order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        # order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])

        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        if debug:
            print('Nucleotide concordance counts out of %d genotypes: g-rg: %d, g-ss: %d, rg-ss: %d' %
                  (len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in zip(g_snp_map, rg_snp_map, ss_snp_map):

            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            if not skip_coordination:

                rg_nt = rg_nts[rg_i]
                ss_nt = ss_nts[ss_i]

                # Is the nucleotide ambiguous.
                g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                # First check if nucleotide is sane?
                if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                os_g_nt = sp.array([util.opp_strand_dict[g_nt[0]],
                                    util.opp_strand_dict[g_nt[1]]])

                flip_nts = False
                if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and
                        (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                    if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                        flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                            os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                        # Try flipping the SS nt
                        if flip_nts:
                            betas[ss_i] = -betas[ss_i]
                            log_odds[ss_i] = -log_odds[ss_i]
                            if 'freqs' in ssg:
                                ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                        else:
                            if debug:
                                print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                      (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                            num_non_matching_nts += 1
                            tot_num_non_matching_nts += 1
                            continue
                    else:
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue
                        # Opposite strand nucleotides

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)

        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)
            print('%d SNPs were retained on chromosome %d.' % (len(ok_indices['g']), chrom))

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices_ref = snp_indices_ref[ok_indices['rg']]
        raw_ref_snps, freqs_ref = plinkfiles.parse_plink_snps(reference_genotype_file, snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15  # Array of np.bool values
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' %
                      sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "Problems when filtering SNPs with low minor allele frequencies"
        if maf_filter_sum < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if debug and plinkf_dict['has_phenotype']:
            maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'], maf_adj_prs)[0, 1]
            print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    # if l[0] in sid_set:
                    #     genetic_map.append(l[0])
        else:
            genetic_map = None

        coord_data_dict = {'chrom': 'chrom_%d' % chrom,
                           'raw_snps_ref': raw_ref_snps,
                           'snp_stds_ref': snp_stds_ref,
                           'snp_means_ref': snp_means_ref,
                           'freqs_ref': freqs_ref,
                           'ps': ps,
                           'positions': positions,
                           'nts': nts,
                           'sids': sids,
                           'genetic_map': genetic_map,
                           'betas': betas,
                           'log_odds': log_odds,
                           'log_odds_prs': maf_adj_prs,
                           'raw_snps_val': raw_snps,
                           'snp_stds_val': snp_stds,
                           'snp_means_val': snp_means,
                           'freqs_val': freqs}

        write_coord_data(cord_data_g, coord_data_dict)

        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)

    # Now calculate the prediction r^2
    if debug and plinkf_dict['has_phenotype']:
        maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'], maf_adj_risk_scores)[0, 1]
        print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' %
              (maf_adj_corr, maf_adj_corr ** 2))
    print('Overall nucleotide concordance counts: g_rg: %d, g_ss: %d, rg_ss: %d' %
          (tot_g_rg_nt_concord_count, tot_g_ss_nt_concord_count, tot_rg_ss_nt_concord_count))
    print('There were %d SNPs in common' % num_common_snps)
    print('In all, %d SNPs were excluded due to nucleotide issues.' % tot_num_non_matching_nts)
    print('Done!')
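# The MAF filter above keeps SNPs with min_maf < freq < 1 - min_maf; the
# boolean multiplication is an elementwise AND. Toy frequencies:
import numpy as np

freqs = np.array([0.005, 0.02, 0.5, 0.995])
min_maf = 0.01
maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
print(maf_filter)         # [False  True  True False]
print(freqs[maf_filter])  # 0.02 and 0.5 survive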