def plot_median_errors(RefinementLevels): for i in RefinementLevels[0].cases: x =[]; y =[]; print "Analyzing median error on: ", i ; for r in RefinementLevels: x.append(r.LUT.D_dim*r.LUT.P_dim) r.get_REL_ERR_SU2(i) y.append(r.SU2[i].median_ERR*100) x = sp.array(x) y = sp.array(y) y = y[sp.argsort(x)] x = x[sp.argsort(x)] LHM = sp.ones((len(x),2)) RHS = sp.ones((len(x),1)) LHM[:,1] = sp.log10(x) RHS[:,0] = sp.log10(y) sols = sp.linalg.lstsq(LHM,RHS) b = -sols[0][1] plt.loglog(x,y, label='%s, %s'%(i,r'$O(\frac{1}{N})^{%s}$'%str(sp.around(b,2))), basex=10, basey=10, \ subsy=sp.linspace(10**(-5), 10**(-2),20),\ subsx=sp.linspace(10**(2), 10**(5),50)) #for r in RefinementLevels: # x.append(r.LUT.D_dim*r.LUT.P_dim) # r.get_REL_ERR_SciPy(i) # y.append(r.SciPy[i].median_ERR*100) #plt.plot(x,y, label='SciPy: %s'%i) plt.grid(which='both') plt.xlabel('Grid Nodes (N)') plt.ylabel('Median relative error [%]') return;
def my_bh_fdr(p_val_vec): index = scipy.argsort(p_val_vec) exp_err = scipy.vstack((float(len(p_val_vec))/scipy.arange(1,len(p_val_vec) + 1)*p_val_vec[index], scipy.tile(1, [1, len(p_val_vec)]))).min(axis = 0) exp_err = scipy.vstack((exp_err,exp_err[scipy.r_[0,scipy.arange(len(exp_err)-1)]])).max(axis=0) #scipy.r_[index[0], index[range(len(index)-1)] resort_index = scipy.argsort(index) return exp_err[resort_index]
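# Usage sketch for my_bh_fdr (illustration only; the p-values are made up). The
# function itself relies on the legacy module-level `scipy` namespace used above;
# plain numpy arrays are valid input.
import numpy as np
example_p = np.array([0.001, 0.04, 0.03, 0.2, 0.45])
example_q = my_bh_fdr(example_p)      # adjusted p-values, same order as input
example_rejected = example_q < 0.05   # hypotheses rejected at a 5% FDR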
def outputTargetSimPairs(self, pairFile): pairList = [] pairFilehandle = open(pairFile) for line in pairFilehandle: words = (line.strip().strip('\n').strip()).split() pairList.append(words) pairFilehandle.close() print "..Outputting similarities" outputFilename = "simPairs.txt" outputFilehandle = open(outputFilename, "w") outputFilehandle.write("word1 word2 sim | zsim1 zsim2 | psim1 psim2 | nIn1 nIn2\n") numTargets = len(self.similarityMatrix[0]) for pair in pairList: if ((pair[0] in self.targetDict) and (pair[1] in self.targetDict)): i = self.targetDict[pair[0]] j = self.targetDict[pair[1]] sim = self.similarityMatrix[i,j] word0Sims = self.similarityMatrix[i] word1Sims = self.similarityMatrix[j] z0Sim = (sim - word0Sims.mean()) / word0Sims.std() z1Sim = (sim - word1Sims.mean()) / word1Sims.std() sim0min = np.amin(word0Sims) sim1min = np.amin(word1Sims) adjSim0 = sim + abs(sim0min) adjSim1 = sim + abs(sim1min) adjSimVector0 = word0Sims + abs(sim0min) adjSimVector1 = word1Sims + abs(sim1min) sim0Sum = adjSimVector0.sum() sim1Sum = adjSimVector1.sum() p0Sim = adjSim0 / sim0Sum p1Sim = adjSim1 / sim1Sum sortedIndexes0 = scipy.argsort(word0Sims) sortedIndexes1 = scipy.argsort(word1Sims) for k in range(numTargets): if sortedIndexes0[k] == j: nIn0 = numTargets - k break for k in range(numTargets): if sortedIndexes1[k] == i: nIn1 = numTargets - k break outputFilehandle.write("%s %s %0.3f | %0.3f %0.3f | %0.5f %0.5f | %0.0f %0.0f\n" % (pair[0], pair[1], sim, z0Sim, z1Sim, p0Sim, p1Sim, nIn0, nIn1)) else: outputFilehandle.write("%s %s NA NA NA NA NA NA NA\n" % (pair[0], pair[1]))
def plot_overlap_ps(result_file, ss_file='/Users/bjarnivilhjalmsson/data/GIANT/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', fig_filename='/Users/bjarnivilhjalmsson/data/tmp/manhattan_combPC_HGT.png', method='combPC', ylabel='Comb. PC (HIP,WC,HGT,BMI) $-log_{10}(P$-value$)$', xlabel='Height $-log_{10}(P$-value$)$', p_thres=0.00001): # Parse results ans SS file res_table = pandas.read_table(result_file) ss_table = pandas.read_table(ss_file) # Parse res_sids = sp.array(res_table['SNPid']) if method == 'MVT': comb_ps = sp.array(res_table['pval']) elif method == 'combPC': comb_ps = sp.array(res_table['combPC']) if 'MarkerName' in ss_table.keys(): ss_sids = sp.array(ss_table['MarkerName']) elif 'SNP' in ss_table.keys(): ss_sids = sp.array(ss_table['SNP']) else: raise Exception("Don't know where to look for rs IDs") marg_ps = sp.array(ss_table['p']) # Filtering boring p-values res_p_filter = comb_ps < p_thres res_sids = res_sids[res_p_filter] comb_ps = comb_ps[res_p_filter] # ss_p_filter = marg_ps<p_thres # ss_sids = ss_sids[ss_p_filter] # marg_ps = marg_ps[ss_p_filter] common_sids = sp.intersect1d(res_sids, ss_sids) print 'Found %d SNPs in common' % (len(common_sids)) ss_filter = sp.in1d(ss_sids, common_sids) res_filter = sp.in1d(res_sids, common_sids) ss_sids = ss_sids[ss_filter] res_sids = res_sids[res_filter] marg_ps = marg_ps[ss_filter] comb_ps = comb_ps[res_filter] print 'Now sorting' ss_index = sp.argsort(ss_sids) res_index = sp.argsort(res_sids) marg_ps = -sp.log10(marg_ps[ss_index]) comb_ps = -sp.log10(comb_ps[res_index]) with plt.style.context('fivethirtyeight'): plt.plot(marg_ps, comb_ps, 'b.', alpha=0.2) (x_min, x_max) = plt.xlim() (y_min, y_max) = plt.ylim() plt.plot([x_min, x_max], [y_min, y_max], 'k--', alpha=0.2) plt.ylabel(ylabel) plt.xlabel(xlabel) plt.tight_layout() plt.savefig(fig_filename) plt.clf()
def plotBias(vals, fn_plot, myidx, logScale = False, refname = 'TCGA'): iqr = ( (sp.percentile(vals[~myidx],75) - sp.percentile(vals[~myidx],25) ) * 1.5) iqr2 = ( (sp.percentile(vals[myidx],75) - sp.percentile(vals[myidx],25) ) * 1.5) sidx = sp.argsort(vals) vals = vals[sidx] myidx = myidx[sidx] fig = plt.figure(figsize=(12,10)) ax = fig.add_subplot(111) ax_c = ax.twinx() ax.vlines(sp.array(sp.arange(sp.sum(vals.shape[0])))[myidx],[0], vals[myidx], label = '%s Reference'%refname) ax.vlines(sp.array(sp.arange(sp.sum(vals.shape[0])))[~myidx],[0], vals[~myidx], color = 'r', label = 'Your Samples') ax.plot([0,vals.shape[0]],[3,3], '--', color = 'green') ax.plot([0,vals.shape[0]],[5,5] , '--',color = 'green') ax.plot([0,vals.shape[0]],[iqr + sp.percentile(vals[~myidx], 75),iqr + sp.percentile(vals[~myidx], 75)], '--',color = 'green') ax.plot([0,vals.shape[0]],[iqr2 + sp.percentile(vals[myidx], 75),iqr2 + sp.percentile(vals[myidx], 75)], '--',color = 'green') # ax.plot([0,vals.shape[0]],[6.25,6.25],'--', color = 'green') ax.plot([0,vals.shape[0]],[10,10] , '--',color = 'green') ax.set_ylabel('Median 3\'/5\' Bias') ax.set_xlim(0,vals.shape[0]) if logScale: ax.set_yscale('log') ax_c.set_yscale('log') ax_c.set_ylim(ax.get_ylim()) ### add right side ticks if logScale: tick_thresholds = sp.array([3,5,iqr+sp.percentile(vals[~myidx],75),iqr2 + sp.percentile(vals[myidx], 75), 10])#sp.array(sp.log([3,5,iqr+sp.percentile(vals,75), 10, 50])) else: tick_thresholds = sp.array([3,5,iqr+sp.percentile(vals[~myidx],75),iqr2 + sp.percentile(vals[myidx], 75), 10]) tick_idx = sp.argsort(tick_thresholds) tick_thresholds = tick_thresholds[tick_idx] tick_thresholds = sp.around(tick_thresholds, decimals = 2) ax_c.set_yticks(tick_thresholds) tick_thresholds = tick_thresholds.astype('|S4') tick_thresholds = tick_thresholds.astype('|S50') tick_thresholds[tick_idx == 2] = tick_thresholds[tick_idx == 2][0] + ' (Your Filter)' # tick_thresholds[tick_idx == 3] = tick_thresholds[tick_idx == 3][0] + ' (PRAD Filter)' tick_thresholds[tick_idx == 3] = tick_thresholds[tick_idx == 3][0] + ' (%s Filter)'%(refname) ax_c.set_yticklabels(tick_thresholds) ax.grid() ax.legend(loc=2) plt.tight_layout() plt.savefig(fn_plot, dpi = 300) plt.clf()
def _query(self,lv,k=None): if (k==None): k=self.k if (type(lv)!=numpy.ndarray): lv=numpy.array(lv) if (lv.ndim==1): lv=lv.reshape(1,lv.shape[0]) if (lv.shape[0]==1): dt=abs(self.va.reshape(self.va.shape[0],1)-lv).T dr=scipy.argsort(dt)[0,:k] return numpy.vectorize(lambda x:self.va[x])(dr).reshape(1,k) else: dt=scipy.spatial.distance.cdist(lv,self.va.reshape(self.va.shape[0],1)) dr=scipy.argsort(dt)[:,:k] return numpy.vectorize(lambda x:self.va[x])(dr)
def remove_isolated_clusters(conns, nonzero_locs, num_to_keep): r""" Identifies and removes all disconnected clusters except the number of groups specified by "num_to_keep". num_to_keep=N retains the N largest clusters """ # adj_mat = generate_adjacency_matrix(conns, nonzero_locs) # logger.info('determining connected components...') cs_ids = csgraph.connected_components(csgraph=adj_mat, directed=False)[1] groups, counts = sp.unique(cs_ids, return_counts=True) order = sp.argsort(counts)[::-1] groups = groups[order] counts = counts[order] # msg = ' {} component groups for {} total nodes' logger.debug(msg.format(groups.size, cs_ids.size)) msg = ' largest group number: {}, size {}' logger.debug(msg.format(groups[0], counts[0])) msg = ' {} % of nodes contained in largest group' logger.debug(msg.format(counts[0]/cs_ids.size*100)) msg = ' {} % of nodes contained in {} retained groups' num = sp.sum(counts[0:num_to_keep])/cs_ids.size*100 logger.debug(msg.format(num, num_to_keep)) # inds = sp.where(sp.in1d(cs_ids, groups[0:num_to_keep]))[0] num = nonzero_locs.size nonzero_locs = nonzero_locs[inds] msg = ' removed {} disconnected nodes' logger.debug(msg.format(num - nonzero_locs.size)) # return nonzero_locs
def writeTopXGenes2File(filename,sqlfile,outdir,top=1000): f = h5py.File(filename,'r') chromosomes = f['chromosomes'][:] positions = f['positions'][:] p_values = f['p_values'][:].flatten() name = f['phenotype_name'].value.replace(" ","_").replace("<i>","").replace("</i>","") ind = sp.argsort(p_values)[:-1] chromosomes = chromosomes[ind] positions = positions[ind] p_values = p_values[ind] chromosomes = chromosomes[0:top] positions = positions[0:top] p_values = p_values[0:top] f.close() sqlite = sqlite3.connect(sqlfile) sqlite_cursor = sqlite.cursor() out = open(os.path.join(outdir,name + ".csv"),"w") out.write("Chr,Pos,PVal,GeneID (closest),Distance (bp)\n") for i in xrange(chromosomes.shape[0]): sqlite_cursor.execute("SELECT * FROM geneannotation WHERE chromosome_id=? ORDER BY ABS(annotation_start - ?) LIMIT 1",(str(chromosomes[i]),int(positions[i]))) annotation = sqlite_cursor.fetchall() #print annotation if len(annotation)==1: if positions[i] >= annotation[0][3] and positions[i] <= annotation[0][4]: distance = 0 elif positions[i] > annotation[0][4]: distance = abs(positions[i]-annotation[0][4]) else: distance = abs(positions[i]-annotation[0][3]) out.write(chromosomes[i] + "," + str(int(positions[i])) + ",%.2e"%(p_values[i]) + "," + annotation[0][1] + "," + str(int(distance)) + "\n") sqlite.close()
def eigsort(eigresult):
    """ Sort the output of scipy.linalg.eig() in terms of eigenvalue magnitude """
    ix = sp.argsort(abs(eigresult[0]))
    return (eigresult[0][ix], eigresult[1][:, ix])
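# Usage sketch: sort an eigendecomposition by eigenvalue magnitude. Assumes `sp`
# above is the usual `import scipy as sp` alias; the toy matrix is made up.
import numpy as np
from scipy import linalg
A = np.array([[2.0, 1.0], [1.0, 3.0]])
vals, vecs = eigsort(linalg.eig(A))   # eigenvalues ordered by ascending magnitude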
def loadData(self):
    '''Load data from files'''
    Tabs = ( ('tab_2', 'tab_3','tab_4'), ('tab_3', 'tab_2','tab_4'))
    uiObj = ('XColumn', 'YColumn', 'MColumn', 'MCheck')
    senderName = self.sender().objectName()
    key = senderName[0]
    active = [self.Types[key]] + self.findUi( [key + i for i in uiObj])
    data = []
    XY = sp.zeros((0,2))
    path = self.Path[active[0]]
    if os.path.exists(path):
        try:
            data = sp.loadtxt(path)
            '''
            activeFilt = self.findChilds(QtGui.QLineEdit, FiltersKeys[active[0]])
            filtNames = ''
            if activeFilt[0].isEnabled() and activeFilt[1].isEnabled():
                self.filtersDict = self.getFilters(length = self.LENGTH)
                for i in (0,1):
                    filtNames = activeFilt[i].text().strip().replace(" ","").upper()
                    temp = 1.
                    if filtNames:
                        temp = self.resFilters(filtNames)
                    self.filtList[active[0]][i] = temp
            else:
                self.filtList[active[0]][:] = [1., 1.]
            print("Filters [X,Y]:",self.filtList[active[0]])
            '''
            xc = active[1].value()
            yc = active[2].value()
            mc = active[3].value()
            if active[4].checkState():
                XY = sp.array( [data[:,xc], data[:,yc] ]).T / sp.array([data[:,mc], data[:,mc]]).T
            else:
                XY = sp.array( [data[:,xc], data[:,yc] ]).T
            XY = XY[XY[:,0] > 0]
            XY = XY[XY[:,1] > 0]
            if getattr(self.ui,senderName[0]+'CutForward').isChecked():
                p = sp.where( XY[:,0] == XY[:,0].max())[0][0]
                print(p)
                XY = XY[:p,:]
            XY = XY[sp.argsort(XY[:,0])]
            '''
            XY[:,0] = XY[:,0]/self.filtList[active[0]][0]
            XY[:,1] = XY[:,1]/self.filtList[active[0]][1]
            '''
            self.updateData(array = Array(XY,Type = active[0]), action = 0)
            tabs = self.findUi(Tabs[active[0]])
            tabs[0].setEnabled(True)
            if tabs[1].isEnabled():
                tabs[2].setEnabled(True)
        except (ValueError, IOError, IndexError):
            self.mprint("loadData: readError")
    else:
        self.mprint('loadData: pathError')
def gettimes(ionocontlist):
    """ This static method will take a list of files, or a single string, and determine
        the time ordering and give the sort order for the files to be in.
        Inputs
            ionocontlist - A list of IonoContainer h5 files. Can also be a single string of a file name.
        Outputs
            sortlist - A numpy array of integers that will chronologically order the files.
            outtime - A Nt x 2 numpy array of all of the times.
            fileslist - The file index associated with each time, in chronological order.
            timebeg - A list of beginning times.
            timelist_s - The per-file time arrays, in chronological order."""
    if isinstance(ionocontlist,string_types):
        ionocontlist=[ionocontlist]
    timelist=[]
    fileslist = []
    for ifilenum,ifile in enumerate(ionocontlist):
        with tables.open_file(str(ifile)) as f:
            times = f.root.Time_Vector.read()
        timelist.append(times)
        fileslist.append(ifilenum*sp.ones(len(times)))
    times_file = sp.array([i[:,0].min() for i in timelist])
    sortlist = sp.argsort(times_file)
    timelist_s = [timelist[i] for i in sortlist]
    timebeg = times_file[sortlist]
    fileslist = sp.vstack([fileslist[i][0] for i in sortlist]).flatten().astype('int64')
    outtime = sp.vstack(timelist_s)
    return (sortlist,outtime,fileslist,timebeg,timelist_s)
def find(x, v, next_largest=1, indices=None): """Returns the index into the 1D array x corresponding to the element of x that is either equal to v or the nearest to v. x is assumed to contain unique elements. if v is outside the range of values in x then the index of the smallest or largest element of x is returned. If next_largest == 1 then the nearest element taken is the next largest, otherwise if next_largest == 0 then the next smallest is taken. The optional argument indices speeds up multiple calls to this function if you pre-calculate indices=argsort(x). """ if indices is None: indices=argsort(x) xs=take(x, indices) assert next_largest in [0,1], "next_largest must be 0 or 1" eqmask=(xs==v).tolist() try: ix = eqmask.index(1) except ValueError: if next_largest: mask=(xs<v).tolist() else: mask=(xs>v).tolist() try: ix=min([max([0,mask.index(1-next_largest)+next_largest-1]),len(mask)-1]) except ValueError: ix = 0+next_largest-1 return indices[ix]
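# Usage sketch for find (toy values). The function relies on module-level
# argsort/take, assumed here to be the numpy functions.
import numpy as np
x = np.array([0.1, 1.5, 0.7, 3.2])
order = np.argsort(x)                                # precompute once, reuse for many lookups
idx = find(x, 0.8, next_largest=1, indices=order)    # -> 1, the index of 1.5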
def readAnnotationFile(fn, format='gaf'): ### get list of overlapping genes overlapgenes = getOverlapGenes(fn, format) ### reading in gaf data = readinganno(fn, overlapgenes, format) uqgid = data.keys() ### unique gene ids newdata = [] for gid in uqgid: ### process transcripts if len(data[gid]) == 1: temp = processSingleTranscriptGenes(data[gid]) else: temp = processMultiTranscriptGenes(data[gid]) ### make sure it has been processed correctly if temp is None: continue else: temp.extend([gid]) newdata.append(temp) newdata = sp.array(newdata) sidx = sp.argsort(newdata[:,5]) newdata = newdata[sidx,:] ### filter gene with no name return sp.array(newdata)
def apply_flow(self,flowrate): r''' Convert the invaded sequence into an invaded time for a given flow rate considering the volume of invaded pores and throats. Parameters ---------- flowrate : float The flow rate of the injected fluid Returns ------- Creates a throat array called 'invasion_time' in the Algorithm dictionary ''' P12 = self._net['throat.conns'] # List of throats conns a = self['throat.invasion_sequence'] # Invasion sequence b = sp.argsort(self['throat.invasion_sequence']) P12_inv = self['pore.invasion_sequence'][P12] # Pore invasion sequence # Find if the connected pores were invaded with or before each throat P1_inv = P12_inv[:,0] == a P2_inv = P12_inv[:,1] == a c = sp.column_stack((P1_inv,P2_inv)) d = sp.sum(c,axis=1,dtype=bool) # List of Pores invaded with each throat # Find volume of these pores P12_vol = sp.zeros((self.Nt,)) P12_vol[d] = self._net['pore.volume'][P12[c]] # Add invaded throat volume to pore volume (if invaded) T_vol = P12_vol + self._net['throat.volume'] # Cumulative sum on the sorted throats gives cumulated inject volume e = sp.cumsum(T_vol[b]/flowrate) t = sp.zeros((self.Nt,)) t[b] = e # Convert back to original order self._phase['throat.invasion_time'] = t
def eigensigma(self): from scipy.linalg import eig from scipy.sparse import lil_matrix,bmat,eye from scipy import argsort,where #from scipy.sparse.linalg import eigen transverseH = lil_matrix((self.wafer.shape[1],self.wafer.shape[1])) transverseH.setdiag([2*self.t0]*self.wafer.shape[1]) transverseH.setdiag([-self.t0]*self.wafer.shape[1],1) transverseH.setdiag([-self.t0]*self.wafer.shape[1],-1) #following is wrong #SO=eye(self.wafer.shape[1],self.wafer.shape[1],1)*self.tso-eye(self.wafer.shape[1],self.wafer.shape[1],-1)*self.tso #transverseHspin = bmat([[transverseH, SO],[SO,transverseH]]) #self.HH = transverseHspin #from pudb import set_trace; set_trace() v,d = eig(transverseH.todense()) ndx = argsort(v) d=d[:,ndx] v=v[ndx] self.v = v self.d = d try: self.maxmode = where(self.v < self.Efermi-self.band_bottom)[0].max()+1 except ValueError: print "- ValueError probably no modes will fit at that energy" if v.max() > self.Efermi-self.band_bottom: print 'Some mode energies larger than fermi energy, only up to mode {0} will fit'.format(self.maxmode) print 'Argument num_modes="all" takes only modes low enough' print ''
def query(self,lv,k=None): """ returns distance and element index""" if (k==None): k=self.k if (type(lv)!=numpy.ndarray): lv=numpy.array(lv) if (lv.ndim==1): lv=lv.reshape(1,lv.shape[0]) if (lv.shape[0]==1): dt=abs(self.va.reshape(self.va.shape[0],1)-lv).T dr=scipy.argsort(dt)[0,:k] return dt.take(dr),dr.reshape(k) else: dt=scipy.spatial.distance.cdist(lv,self.va.reshape(self.va.shape[0],1)) dr=scipy.argsort(dt)[:,:k] return dt.take(dr),dr
def benjamini_hochberg_yekutieli(p_values=None,q_value=0.05,sort_idx=None,return_sort_idx=False): p_values = p_values.ravel() if sort_idx is None: sort_idx = sp.argsort(p_values) p_values = p_values[sort_idx] else: sort_idx = sort_idx.ravel() p_values = p_values[sort_idx] m = p_values.shape[0] idx_line = sp.arange(1,m+1) cV = (1.0/idx_line).sum() thr_line = (idx_line*q_value*cV)/float(m); thr_ind = sp.where(p_values<=thr_line)[0] if thr_ind.shape[0]==0: thr = 0.0; else: thr = p_values[thr_ind.max()] #adjust p_values p_values_adjusted = sp.ones(m) prev = 1.0 for i in range(m,0,-1): p_values_adjusted[i-1] = sp.minimum(prev,p_values[i-1]*float(m)*cV/float(i)) if p_values_adjusted[i-1]>1: p_values_adjusted[i-1]=1 prev = p_values_adjusted[i-1] #resort pvalues p_tmp = p_values_adjusted.copy() p_values_adjusted[sort_idx] = p_tmp if return_sort_idx==True: return [thr,p_values_adjusted,sort_idx] else: return [thr,p_values_adjusted]
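# Usage sketch (made-up p-values): Benjamini-Hochberg-Yekutieli threshold and
# adjusted p-values at q=0.05. `sp` is assumed to be the scipy namespace used above.
import numpy as np
raw_p = np.array([0.001, 0.008, 0.039, 0.041, 0.27, 0.6])
thr, p_adj = benjamini_hochberg_yekutieli(p_values=raw_p, q_value=0.05)
significant = raw_p <= thr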
def setup(self, phase, throat_prop='throat.capillary_pressure', **kwargs): r""" Set up the required parameters for the algorithm Parameters ---------- phase : OpenPNM Phase object The phase to be injected into the Network. The Phase must have the capillary entry pressure values for the system. throat_prop : string The name of the throat property containing the capillary entry pressure. The default is 'throat.capillary_pressure'. """ self._phase = phase # Setup arrays and info self['throat.entry_pressure'] = phase[throat_prop] # Indices into t_entry giving a sorted list self['throat.sorted'] = sp.argsort(self['throat.entry_pressure'], axis=0) self['throat.order'] = sp.zeros_like(self['throat.sorted']) self['throat.order'][self['throat.sorted']] = sp.arange(0, self._net.Nt) self['throat.invaded'] = -sp.ones((self._net.Nt,)) self['pore.invaded'] = -sp.ones((self._net.Np,)) self._tcount = 0
def nms(boxes, T = 0.5): if len(boxes) == 0: return [] boxes = boxes.astype("float") pick = [] x1 = boxes[:,0] y1 = boxes[:,1] x2 = boxes[:,2] y2 = boxes[:,3] area = (x2 - x1 + 1) * (y2 - y1 + 1) idxs = sp.argsort(y2) while len(idxs) > 0: last = len(idxs) - 1 i = idxs[last] pick.append(i) xx1 = sp.maximum(x1[i], x1[idxs[:last]]) yy1 = sp.maximum(y1[i], y1[idxs[:last]]) xx2 = sp.minimum(x2[i], x2[idxs[:last]]) yy2 = sp.minimum(y2[i], y2[idxs[:last]]) w = sp.maximum(0, xx2 - xx1 + 1) h = sp.maximum(0, yy2 - yy1 + 1) I = w * h #overlap_ratio = I / area[idxs[:last]] overlap_ratio = I /(area[i] + area[idxs[:last]] - I) idxs = sp.delete(idxs, sp.concatenate(([last], sp.where(overlap_ratio > T)[0]))) return boxes[pick].astype("int")
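# Usage sketch for nms: rows are [x1, y1, x2, y2] boxes; boxes whose overlap with a
# higher-ranked box exceeds T are suppressed. Coordinates below are made up.
import numpy as np
boxes = np.array([[10, 10, 50, 50],
                  [12, 12, 52, 52],      # heavy overlap with the first box
                  [100, 100, 140, 140]])
kept = nms(boxes, T=0.5)                 # two boxes survive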
def _get_model_cv_preds(self, model, X_train, y_train, cache_file): """ Return cross-validation predictions on the training set, using cache if possible. This is used if stacking is enabled (ie. a second model is used to combine the stage 0 predictions). """ stack_preds = load_from_cache( "models/%s/cv_preds/%s.pkl" % (self.cache_dir, cache_file), self.use_cached_models) if stack_preds is None: kfold = cross_validation.StratifiedKFold(y_train, 4) stack_preds = [] indexes_cv = [] for stage0, stack in kfold: model.fit(X_train[stage0], y_train[stage0]) stack_preds.extend(list(model.predict_proba( X_train[stack])[:, 1])) indexes_cv.extend(list(stack)) stack_preds = np.array(stack_preds)[sp.argsort(indexes_cv)] with open("cache/models/%s/cv_preds/%s%d.pkl" % ( self.cache_dir, cache_file), 'wb') as f: pickle.dump(stack_preds, f, pickle.HIGHEST_PROTOCOL) return stack_preds
def CreateEnergyGrid(self,ParticlesPerBin=1000): v2 = self.Snapshot.vx*self.Snapshot.vx+self.Snapshot.vy*self.Snapshot.vy+self.Snapshot.vz*self.Snapshot.vz E = 0.5*v2 + self.Snapshot.V index = scipy.argsort(E) tmpE = [] tmpMass = [] N = len(index) BinNo = 0 self.EGrid = EnergyGrid() TotalMass = self.Snapshot.m.sum() while (BinNo+1)*ParticlesPerBin < N: Particles = index[ range(BinNo*ParticlesPerBin,(BinNo+1)*ParticlesPerBin) ] Max = E[Particles].max() Min = E[Particles].min() Mean = E[Particles].mean() tmpE.append( Mean ) tmpMass.append( self.Snapshot.m[Particles].sum() / ( Max - Min ) ) BinNo += 1 self.EGrid.Mass = scipy.array(tmpMass) self.EGrid.E = scipy.array(tmpE) return self.EGrid
def add_times(self,self2):
    """This method will combine the times and content of two instances of the GeoData class.
    The first object will be extended in time."""
    datakeys = self.data.keys()
    assert set(datakeys) ==set(self2.data.keys()),'Data must have the same names.'
    # Look at the coordinate names
    assert self.coordnames==self2.coordnames,'Coordinate names must be the same.'
    # Look at the data location
    a = np.ma.array(self.dataloc,mask=np.isnan(self.dataloc))
    blah = np.ma.array(self2.dataloc,mask=np.isnan(self2.dataloc))
    assert np.ma.allequal(a,blah),'Location points must be the same'
    # Look at the sensor location
    a = np.ma.array(self.sensorloc,mask=np.isnan(self.sensorloc))
    blah = np.ma.array(self2.sensorloc,mask=np.isnan(self2.sensorloc))
    assert np.ma.allequal(a,blah),'Sensor Locations must be the same'
    alltimes = sp.vstack((timerepair(self.times),timerepair(self2.times)))
    #sort based off of start times
    s_ind = sp.argsort(alltimes[:,0])
    self.times = alltimes[s_ind]
    if self.issatellite():
        for ikey in self.datanames():
            outarr = sp.concatenate((self.data[ikey],self2.data[ikey]),0)
            self.data[ikey] = outarr[s_ind]
    else:
        for ikey in self.datanames():
            outarr = sp.hstack((self.data[ikey],self2.data[ikey]))
            self.data[ikey] = outarr[:,s_ind]
def roc(labels, predictions):
    """roc - calculate receiver operator curve
    labels: true labels (>0 : True, else False)
    predictions: the ranking generated from whatever predictor is used"""
    #1. convert to arrays
    labels = S.array(labels).reshape([-1])
    predictions = S.array(predictions).reshape([-1])
    #threshold
    t = labels>0
    #sort predictions in descending order
    #get order implied by predictor (descending)
    Ix = S.argsort(predictions)[::-1]
    #reorder truth
    t = t[Ix]
    #compute true positive and false positive rates
    tp = S.double(N.cumsum(t))/t.sum()
    fp = S.double(N.cumsum(~t))/(~t).sum()
    #add end points
    tp = S.concatenate(([0],tp,[1]))
    fp = S.concatenate(([0],fp,[1]))
    return [tp,fp]
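# Usage sketch for roc (toy labels/scores). The function expects the module-level
# aliases S (scipy) and N (numpy) used in its body.
import numpy as np
labels = np.array([1, 0, 1, 1, 0])
scores = np.array([0.9, 0.1, 0.8, 0.3, 0.4])
tp, fp = roc(labels, scores)   # ROC curve points, padded with (0,0) and (1,1)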
def _get_model_cv_preds(self, model, X_train, y_train): """ Return cross-validation predictions on the training set """ fname = self._get_model_cv_fname(model, X_train, y_train, self.n_folds_stack) try: logger.debug("trying to load cv_pred from %s", fname) with open(fname,"rb") as f: stack_preds = pickle.load(f) except IOError: logger.debug("not found: %s", fname) stack_preds = None if stack_preds is None: kfold = cross_validation.StratifiedKFold(y_train, self.n_folds_stack) stack_preds = [] indexes_cv = [] for stage0, stack in kfold: model.fit(X_train[stage0], y_train[stage0]) stack_preds.extend(list(model.predict_proba( X_train[stack])[:, 1])) indexes_cv.extend(list(stack)) stack_preds = np.array(stack_preds)[sp.argsort(indexes_cv)] with open(fname,"wb") as f: pickle.dump(stack_preds,f) if self.use_logit and self.gnrl=='LR': logger.debug('transform stack_preds(%s) using logit',stack_preds.shape) stack_preds = logit(stack_preds) return stack_preds
def toplines(self,n_lines=5): """ This function is given. """ lines = sp.zeros((self.n_topics,n_lines)) for i in xrange(self.n_topics): args = sp.argsort(self._theta[:,i]).tolist() args.reverse() lines[i,:] = sp.array(args)[0:n_lines] + 1 return lines
def precision_and_recall(actual,predicted,cls): c = (actual == cls) si = sp.argsort(-c) tp = sp.cumsum(sp.single(predicted[si] == cls)) fp = sp.cumsum(sp.single(predicted[si] != cls)) rec = tp /sp.sum(predicted == cls) prec = tp / (fp + tp) return prec,rec
def segmented(): radius = 5 sigmaI = 0.02 sigmaX = 3.0 height = img.shape[0] width = img.shape[1] flatImg = img.flatten() darkImg = flatImg brightImg = flatImg nodes = img.flatten() W = spar.lil_matrix((nodes.size, nodes.size),dtype=float) D = sp.zeros((1,nodes.size)) for row in range(height): for col in range(width): for k in range(row-radius,row+radius): for l in range(col-radius,col+radius): try: w = weight(row,col,k,l) W[row*width+col,k*width+l] = w D[0,row*width+col] += w except: continue D = spar.spdiags(D, 0, nodes.size, nodes.size) Q = D - W D1 = D.todense() Q1 = Q.todense() diags = sp.diag(D1) DminusHalf = sp.diag(diags**-0.5) segQ = sp.dot(sp.dot(DminusHalf, Q1),DminusHalf) vals, vecs = la.eig(segQ) vecind = sp.argsort(vals)[1] theVec = vecs[vecind] for i in range(0,height**2): if theVec[i] < 0: darkImg[i] = 0.0 else: brightImg[i] = 0.0 darkImg = sp.reshape(darkImg, (height,height)) brightImg = sp.reshape(brightImg, (height,height)) return darkImg, flatImg, brightImg
def Experimento(db):
    # figure names
    name_arr = scipy.array(db.keys())
    # another dictionary: figure names x class labels
    cl = dict(zip(name_arr,[int(db[i][0]) for i in name_arr]))
    # Build an N_Samples x N_Features matrix from the input database
    # Discard the first column (class labels)
    data = scipy.array([db[nome][1:] for nome in name_arr])
    # distancia: the dissimilarity measure to use
    #distancias = ['braycurtis','canberra','chebyshev','cityblock','correlation',
    #              'cosine','dice','euclidean','hamming','jaccard',
    #              'kulsinski','mahalanobis','matching','minkowski',
    #              'rogerstanimoto','russelrao','seuclidean','sokalmichener',
    #              'sokalsneath','sqeuclidean','yule']
    distancia = 'euclidean'
    # Number of samples
    Nobj = data.shape[0]
    # Total number of classes
    Nclasses = max(cl.values())
    # Number of samples per class
    # assuming the database is balanced!!!!
    Nac = Nobj/Nclasses
    # Number of retrievals
    Nretr = Nac
    # Compute the distance matrix
    md = squareform(pdist(data,distancia))
    # To tally the confusion matrix
    l = scipy.zeros((Nclasses,Nac),dtype = int)
    for i,nome in zip(scipy.arange(Nobj),name_arr):
        # For each row of md build the retrieval ranking
        # The first element of each row corresponds to the query shape
        # Get the class of the retrieved objects in increasing order of distance
        idx = scipy.argsort(md[i])
        # get the classes of the query pattern and of the retrieved images
        classe_padrao = cl[nome]
        name_retr = name_arr[idx]
        aux = scipy.array([cl[j] for j in name_retr])
        # we are only interested in the Nretr subsequent results
        classe_retrs = aux[1:Nretr]
        n = scipy.nonzero(classe_retrs == classe_padrao)
        # Tally the results
        for i in n[0]:
            l[classe_padrao-1,i] = l[classe_padrao-1,i] + 1
    return l,Nac
def __init__(self, N, vectors, coverage_ratio=0.2): """ Performs exact nearest neighbour search on the data set. vectors can either be a numpy matrix with all the vectors as columns OR a python array containing the individual numpy vectors. """ # We need a dict from vector string representation to index self.vector_dict = {} self.N = N self.coverage_ratio = coverage_ratio # Get numpy array representation of input self.vectors = numpy_array_from_list_or_numpy_array(vectors) # Build map from vector string representation to vector for index in range(self.vectors.shape[1]): self.vector_dict[self.__vector_to_string( self.vectors[:, index])] = index # Get transposed version of vector matrix, so that the rows # are the vectors (needed by cdist) vectors_t = numpy.transpose(self.vectors) # Determine the indices of query vectors used for comparance # with approximated search. query_count = numpy.floor(self.coverage_ratio * self.vectors.shape[1]) self.query_indices = [] for k in range(int(query_count)): index = numpy.floor(k*(self.vectors.shape[1]/query_count)) index = min(index, self.vectors.shape[1]-1) self.query_indices.append(int(index)) print('\nStarting exact search (query set size=%d)...\n' % query_count) # For each query vector get the closest N neighbours self.closest = {} self.exact_search_time_per_vector = 0.0 for index in self.query_indices: v = vectors_t[index, :].reshape(1, self.vectors.shape[0]) exact_search_start_time = time.time() D = cdist(v, vectors_t, 'euclidean') self.closest[index] = scipy.argsort(D)[0, 1:N+1] # Save time needed for exact search exact_search_time = time.time() - exact_search_start_time self.exact_search_time_per_vector += exact_search_time print('\Done with exact search...\n') # Normalize search time self.exact_search_time_per_vector /= float(len(self.query_indices))
def topterms(self,n_terms=10): """ This function is given. """ vec = sp.atleast_2d(sp.arange(0,self.n_words)) topics = [] for k in xrange(self.n_topics): probs = sp.atleast_2d(self._phi[k,:]) mat = sp.append(probs,vec,0) sind = sp.array([mat[:,i] for i in sp.argsort(mat[0])]).T topics.append([self.vocab[int(sind[1,self.n_words - 1 - i])] for i in xrange(n_terms)]) return topics
def test_gaussian_multiple_populations_adpative_population_size( db_path, sampler): sigma_x = 1 sigma_y = .5 y_observed = 2 def model(args): return {"y": st.norm(args['x'], sigma_y).rvs()} models = [model] models = list(map(SimpleModel, models)) nr_populations = 4 population_size = AdaptivePopulationSize(600) parameter_given_model_prior_distribution = [ Distribution(x=st.norm(0, sigma_x)) ] abc = ABCSMC(models, parameter_given_model_prior_distribution, MinMaxDistance(measures_to_use=["y"]), population_size, eps=MedianEpsilon(.2), sampler=sampler) abc.new(db_path, {"y": y_observed}) minimum_epsilon = -1 abc.do_not_stop_when_only_single_model_alive() history = abc.run(minimum_epsilon, max_nr_populations=nr_populations) posterior_x, posterior_weight = history.get_distribution(0, None) posterior_x = posterior_x["x"].values sort_indices = sp.argsort(posterior_x) f_empirical = sp.interpolate.interp1d( sp.hstack((-200, posterior_x[sort_indices], 200)), sp.hstack((0, sp.cumsum(posterior_weight[sort_indices]), 1))) sigma_x_given_y = 1 / sp.sqrt(1 / sigma_x**2 + 1 / sigma_y**2) mu_x_given_y = sigma_x_given_y**2 * y_observed / sigma_y**2 expected_posterior_x = st.norm(mu_x_given_y, sigma_x_given_y) x = sp.linspace(-8, 8) max_distribution_difference = sp.absolute( f_empirical(x) - expected_posterior_x.cdf(x)).max() assert max_distribution_difference < 0.15 assert history.max_t == nr_populations - 1 mean_emp, std_emp = mean_and_std(posterior_x, posterior_weight) assert abs(mean_emp - mu_x_given_y) < .07 assert abs(std_emp - sigma_x_given_y) < .12
def __init__(self, hash_name, projection_count, training_set): """ Computes principal components for training vector set. Uses first projection_count principal components for projections. Training set must be either a numpy matrix or a list of numpy vectors. """ super(PCABinaryProjections, self).__init__(hash_name) self.projection_count = projection_count # Only do training if training set was specified if not training_set is None: # Get numpy array representation of input training_set = numpy_array_from_list_or_numpy_array(training_set) # Get subspace size from training matrix self.dim = training_set.shape[0] # Get transposed training set matrix for PCA training_set_t = numpy.transpose(training_set) # Compute principal components (eigenvalues, eigenvectors) = perform_pca(training_set_t) # Get largest N eigenvalue/eigenvector indices largest_eigenvalue_indices = numpy.flipud( scipy.argsort(eigenvalues))[:projection_count] # Create matrix for first N principal components self.components = numpy.zeros( (self.dim, len(largest_eigenvalue_indices))) # Put first N principal components into matrix for index in range(len(largest_eigenvalue_indices)): self.components[:, index] = \ eigenvectors[:, largest_eigenvalue_indices[index]] # We need the component vectors to be in the rows self.components = numpy.transpose(self.components) else: self.dim = None self.components = None # This is only used in case we need to process sparse vectors self.components_csr = None
def sorted_csr_from_coo(shape, row_idx, col_idx, val, only_topk=None): m = (sp.absolute(val).sum() + 1) * 3 sorted_idx = sp.argsort(row_idx * m - val) row_idx[:] = row_idx[sorted_idx] col_idx[:] = col_idx[sorted_idx] val[:] = val[sorted_idx] indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1))) if only_topk is not None and isinstance(only_topk, int): only_topk = max(min(1, only_topk), only_topk) selected_idx = (sp.arange(len(val)) - indptr[row_idx]) < only_topk row_idx = row_idx[selected_idx] col_idx = col_idx[selected_idx] val = val[selected_idx] indptr = sp.cumsum(sp.bincount(row_idx + 1, minlength=(shape[0] + 1))) return smat.csr_matrix((val, col_idx, indptr), shape=shape, dtype=val.dtype)
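# Usage sketch for sorted_csr_from_coo (made-up triplets): keep only the largest
# value per row. Note the input index/value arrays are reordered in place;
# `smat` is assumed to be scipy.sparse as imported by this module.
import numpy as np
rows = np.array([0, 0, 1, 1])
cols = np.array([2, 0, 1, 3])
vals = np.array([0.5, 0.9, 0.1, 0.7])
csr = sorted_csr_from_coo((2, 4), rows, cols, vals, only_topk=1)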
def weighted_quantile(points, weights=None, alpha=0.5): """ Weighted alpha-quantile. E.g. alpha = 0.5 -> median. """ # sort input and set weights sorted_indices = sp.argsort(points) points = points[sorted_indices] if weights is None: len_points = len(points) weights = sp.ones(len_points) / len_points else: weights = weights[sorted_indices] cs = sp.cumsum(weights) quantile = sp.interp(alpha, cs - 0.5 * weights, points) return quantile
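# Usage sketch: a weighted median (alpha=0.5) of made-up points. `sp` is assumed
# to be the scipy namespace imported by this module.
import numpy as np
pts = np.array([1.0, 2.0, 3.0, 10.0])
wts = np.array([0.1, 0.2, 0.4, 0.3])
med = weighted_quantile(pts, weights=wts, alpha=0.5)   # -> 3.0 for these weights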
def setup(self, phase, entry_pressure='', pore_volume='', throat_volume=''): r""" Set up the required parameters for the algorithm Parameters ---------- phase : OpenPNM Phase object The phase to be injected into the Network. The Phase must have the capillary entry pressure values for the system. entry_pressure : string The dictionary key to the capillary entry pressure. If none is supplied then the current value is retained. The default is 'throat.capillary_pressure'. pore_volume : string The dictionary key to the pore volume. If none is supplied then the current value is retained. The default is 'pore.volume'. throat_volume : string The dictionary key to the throat volume. If none is supplied then the current value is retained. The default is 'throat.volume'. """ self.settings['phase'] = phase.name if pore_volume: self.settings['pore_volume'] = pore_volume if throat_volume: self.settings['throat_volume'] = throat_volume if entry_pressure: self.settings['entry_pressure'] = entry_pressure # Setup arrays and info self['throat.entry_pressure'] = phase[self.settings['entry_pressure']] # Indices into t_entry giving a sorted list self['throat.sorted'] = sp.argsort(self['throat.entry_pressure'], axis=0) self['throat.order'] = 0 self['throat.order'][self['throat.sorted']] = sp.arange(0, self.Nt) self['throat.invasion_sequence'] = -1 self['pore.invasion_sequence'] = -1
def parse_plink_snps(genotype_file, snp_indices): plinkf = plinkfile.PlinkFile(genotype_file) samples = plinkf.get_samples() num_individs = len(samples) num_snps = len(snp_indices) raw_snps = sp.empty((num_snps, num_individs), dtype='int8') # If these indices are not in order then we place them in the right place while parsing SNPs. snp_order = sp.argsort(snp_indices) # print(snp_indices) ordered_snp_indices = list(snp_indices[snp_order]) ordered_snp_indices.reverse() print('Iterating over file to load SNPs') snp_i = 0 next_i = ordered_snp_indices.pop() line_i = 0 max_i = ordered_snp_indices[0] while line_i <= max_i: if line_i < next_i: next(plinkf) elif line_i == next_i: line = next(plinkf) snp = sp.array(line, dtype='int8') bin_counts = line.allele_counts() if bin_counts[-1] > 0: mode_v = sp.argmax(bin_counts[:2]) snp[snp == 3] = mode_v s_i = snp_order[snp_i] ## fixed buggy code ## wrong encoding of genotype (A1 should be encoded as 1 instead of A2. It is different from plinkio default) ## original code: # raw_snps[s_i] = snp ## new code raw_snps[s_i] = 2 - snp ## fix finish if line_i < max_i: next_i = ordered_snp_indices.pop() snp_i += 1 line_i += 1 plinkf.close() assert snp_i == len(raw_snps), 'Failed to parse SNPs?' num_indivs = len(raw_snps[0]) freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_indivs)) return raw_snps, freqs
def compute_MI(seqs, batches, emat): # preliminaries n_seqs = len(batches) n_batches = int(batches.max()) + 1 # assumes zero indexed batches n_bins = 1000 #energies = sp.zeros(n_seqs) f = sp.zeros((n_batches, n_seqs)) # compute energies # for i in range(n_seqs): # energies[i] = sp.sum(seqs[:,:,i]*emat) # alternate way dot = emat[:, :, sp.newaxis] * seqs energies = dot.sum(0).sum(0) # sort energies inds = sp.argsort(energies) for i, ind in enumerate(inds): f[batches[ind], i] = 1.0 / n_seqs # batches aren't zero indexed # bin and convolve with Gaussian f_binned = sp.zeros((n_batches, n_bins)) for i in range(n_batches): f_binned[i, :] = sp.histogram(f[i, :].nonzero()[0], bins=n_bins, range=(0, n_seqs))[0] #f_binned = f_binned/f_binned.sum() f_reg = scipy.ndimage.gaussian_filter1d(f_binned, 0.04 * n_bins, axis=1) f_reg = f_reg / f_reg.sum() # compute marginal probabilities p_b = sp.sum(f_reg, axis=1) p_s = sp.sum(f_reg, axis=0) # finally sum to compute the MI MI = 0 for i in range(n_batches): for j in range(n_bins): if f_reg[i, j] != 0: MI = MI + f_reg[i, j] * sp.log2(f_reg[i, j] / (p_b[i] * p_s[j])) print MI return MI, f_reg
def _goodK(self, cutoff=None):
    if cutoff is None:
        cutoff = 1e-10 * self.X.max()

    powers = self.Et * self.Ew.max(0) * self.Eh.max(1)
    sorted_powers = sp.flipud(sp.argsort(powers))
    idx = sp.where(powers[sorted_powers] > cutoff * powers.max())[0]
    goodk = sorted_powers[:(idx[-1] + 1)]
    if powers[goodk[-1]] < cutoff:
        goodk = sp.delete(goodk, -1)
    goodk = sp.sort(goodk)
    # remove any index where Et exceeds 1
    # too_large_k = sp.where(self.Et > 1.0)[0]
    # goodk = sp.array(list(set(goodk) - set(too_large_k)))
    return goodk
def _get_model_cv_preds(self, model, X_train, y_train): """ Return cross-validation predictions on the training set. This is used if stacking is enabled (ie. a second model is used to combine the stage 0 predictions). """ kfold = cross_validation.StratifiedKFold(y_train, 4) stack_preds = [] indexes_cv = [] for stage0, stack in kfold: model.fit(X_train[stage0], y_train[stage0]) stack_preds.extend(list(model.predict_proba(X_train[stack])[:, 1])) indexes_cv.extend(list(stack)) stack_preds = np.array(stack_preds)[sp.argsort(indexes_cv)] return stack_preds
def coo_to_csr(coo): nr_rows, nr_cols, nnz, row, col, val = ( coo.shape[0], coo.shape[1], coo.data.shape[0], coo.row, coo.col, coo.data, ) indptr = sp.cumsum(sp.bincount(row + 1, minlength=(nr_rows + 1)), dtype=sp.uint64) indices = sp.zeros(nnz, dtype=sp.uint32) data = sp.zeros(nnz, dtype=dtype) sorted_idx = sp.argsort(row * sp.float64(nr_cols) + col) indices[:] = col[sorted_idx] data[:] = val[sorted_idx] return indptr, indices, data
def plotLoadings(FA, term, n_genes=10):
    """Plot highest loadings of a factor

    Args:
        FA (:class:`fscLVM.CSparseFA`): Factor analysis object, usually generated using `initFA` function
        term (str): Name of factor for which loadings are to be plotted
        n_genes (int): Number of loadings to be shown
    """
    Zchanged = FA.getZchanged([term])[:, 0]
    W = FA.getW([term])[:, 0]
    Z = FA.getZ([term])[:, 0]
    gene_labels = SP.array(FA.gene_ids)

    #plot weights
    Wabs = SP.absolute(W) * SP.absolute(Z)
    gene_index = SP.argsort(-Wabs)[:n_genes]
    Igain = (Zchanged[gene_index] == 1)
    Ielse = (Zchanged[gene_index] == 0)

    fig = plt.figure(figsize=(5, 5))
    y = SP.arange(len(gene_index))
    if Ielse.any():
        plt.plot(abs(W[gene_index][Ielse] * Z[gene_index][Ielse]), y[Ielse], 'k.', label='pre annotated')
    if Igain.any():
        plt.plot(abs(W[gene_index][Igain] * Z[gene_index][Igain]), y[Igain], 'r.', label='gains')
    plt.xlabel('Abs. weight', fontsize=14)
    plt.ylabel('Genes', fontsize=14)
    plt.yticks(y, gene_labels[gene_index], fontsize=14)
    plt.xticks(fontsize=13)
    plt.legend()
    plt.show()
    return fig
def Pk2Mp(ar,k,pk,ell_max=None): """ Implementation of FFTLog from A.J.S. Hamilton (2000) assumes log(k) are equally spaced """ muk = model.muk dmuk = model.dmuk k0 = k[0] l=np.log(k.max()/k0) r0=1. N=len(k) emm=N*np.fft.fftfreq(N) r=r0*sp.exp(-emm*l/N) dr=abs(np.log(r[1]/r[0])) s=sp.argsort(r) r=r[s] xi=np.zeros([ell_max//2+1,len(ar)]) for ell in range(0,ell_max+1,2): pk_ell=np.sum(dmuk*L(muk,ell)*pk,axis=0)*(2*ell+1)*(-1)**(ell//2) mu=ell+0.5 n=2. q=2-n-0.5 x=q+2*sp.pi*1j*emm/l lg1=myGamma.LogGammaLanczos((mu+1+x)/2) lg2=myGamma.LogGammaLanczos((mu+1-x)/2) um=(k0*r0)**(-2*sp.pi*1j*emm/l)*2**x*sp.exp(lg1-lg2) um[0]=sp.real(um[0]) an=np.fft.fft(pk_ell*k**n/2/sp.pi**2*np.sqrt(sp.pi/2)) an*=um xi_loc=np.fft.ifft(an) xi_loc=xi_loc[s] xi_loc/=r**(3-n) xi_loc[-1]=0 spline=sp.interpolate.splrep(np.log(r)-dr/2,sp.real(xi_loc),k=3,s=0) xi[ell//2,:]=sp.interpolate.splev(np.log(ar),spline) return xi
def classify(self, test_data, n_neighbors=5): if test_data.shape[1] != self.training_data.shape[1]: raise ValueError( 'Training data and test data do not have the same dimensions.') n_train = self.training_data.shape[0] n_test = test_data.shape[0] dists = sp.zeros((n_test, n_train)) labels = [] for i in xrange(n_test): for j in xrange(n_train): dists[i, j] = la.norm(self.training_data[j, :] - test_data[i, :]) inds = sp.argsort(dists[i, :])[0:n_neighbors] votes = sp.array([ sum(self.training_labels[inds] == self.classes[k]) for k in xrange(self.n_classes) ]) labels.append(self.classes[sp.copy(votes).argmax()]) return labels
def translate(tr_pairs, first_only=False): '''Takes a list of pairs with keys that should be translated into each other Returns a list of dictionaries providing the requested translation ''' metapickle = 'metadata.pickle.gz' if not os.path.exists(metapickle): (header_rel, data_rel) = _parse_metatable(META_REL) (header_rna, data_rna) = _parse_metatable(META_RNA) data_rel = _handle_multi_entries(header_rel, data_rel) ### merge the two tables into a single one (header, data) = _merge_tables(header_rel, header_rna, data_rel, data_rna) cPickle.dump((header, data), gzip.open(metapickle, 'w'), -1) else: (header, data) = cPickle.load(gzip.open(metapickle, 'r')) dicts = [] for source, target in tr_pairs: curr_dict = dict() idx1 = sp.where(header == source)[0][0] idx2 = sp.where(header == target)[0][0] s_idx = sp.argsort(data[:, idx1]) data = data[s_idx, :] _, cnt = sp.unique(data[:, idx1], return_counts=True) cum = 0 for c in cnt: if c == 1 or first_only: curr_dict[data[cum, idx1]] = data[cum, idx2] else: if sp.unique(data[cum:(cum + c), idx2]).shape[0] == 1: curr_dict[data[cum, idx1]] = data[cum, idx2] else: curr_dict[data[cum, idx1]] = ','.join( sp.unique(data[cum:(cum + c), idx2])) cum += c dicts.append(curr_dict) return dicts
def desi_convert_DLA(inPath, outPath): """ Convert a catalog of DLA from a DESI format to the format used by picca """ fromDESIkey2piccaKey = { 'RA': 'RA', 'DEC': 'DEC', 'Z': 'Z_DLA_RSD', 'ZQSO': 'Z_QSO_RSD', 'NHI': 'N_HI_DLA', 'THING_ID': 'MOCKID', 'DLAID': 'DLAID', 'PLATE': 'MOCKID', 'MJD': 'MOCKID', 'FIBERID': 'MOCKID' } cat = {} h = fitsio.FITS(inPath) for k, v in fromDESIkey2piccaKey.items(): cat[k] = h['DLACAT'][v][:] h.close() print('INFO: Found {} DLA from {} quasars'.format( cat['Z'].size, sp.unique(cat['THING_ID']).size)) w = sp.argsort(cat['THING_ID']) for k in cat.keys(): cat[k] = cat[k][w] for k in ['RA', 'DEC']: cat[k] = cat[k].astype('float64') ### Save out = fitsio.FITS(outPath, 'rw', clobber=True) cols = [v for v in cat.values()] names = [k for k in cat.keys()] out.write(cols, names=names, extname='DLACAT') out.close() return
def RBFKernelPCA(matrix=None, gamma=1, n_components=2): #1. Compute RBF Kernel K = np.exp(-gamma * distance.squareform(distance.pdist(matrix, 'sqeuclidean'))) #2. Center kernel matrix N = K.shape[0] one_n = np.ones((N, N)) / N cen_K = (np.eye(N) - one_n).dot(K.dot(np.eye(N) - one_n)) #3. Compute eigenvalues and eigenvactors [eigen_values, eigen_vectors] = linalg.eig(cen_K) #4. sort eigen vectors in decreasing order based on eigen values indices = sp.argsort(-eigen_values) [eigen_values, eigen_vectors ] = [sp.real(eigen_values[indices]), eigen_vectors[:, indices]] #5. Return transformed data for the first n_components A = (eigen_vectors[:, 0:n_components]) * sp.sqrt( 1 / eigen_values[0:n_components]) transformed = transformData(A, cen_K) return transformed
def main(self):
    print 'reading image'
    filename = os.path.join(self.main_path,self.img_file_path)
    img = Image.open(filename)
    img = img.resize(self.img_size, Image.ANTIALIAS)
    arr = scipy.misc.fromimage(img)
    ar = arr.reshape((scipy.product(arr.shape[:2]), arr.shape[2]))
    print 'img_reshaped to size:', ar.shape

    print 'finding clusters'
    codes, dist = scipy.cluster.vq.kmeans(ar, self.num_clusters)
    print 'cluster centres:\n', codes

    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
    index_max = scipy.argsort(counts)[::-1]             # find most frequent in desc order

    for i in codes[index_max]:
        colour = ''.join(chr(c) for c in i).encode('hex')
        print 'most frequent is %s (#%s)' % (i, colour)
def remove_isolated_clusters(conns, nonzero_locs, num_to_keep, **kwargs): r""" Identifies and removes all disconnected clusters except the number of groups specified by "num_to_keep". num_to_keep=N retains the N largest clusters """ # adj_mat = generate_adjacency_matrix(conns, nonzero_locs) # logger.info('determining connected components...') cs_ids = csgraph.connected_components(csgraph=adj_mat, directed=False)[1] groups, counts = sp.unique(cs_ids, return_counts=True) order = sp.argsort(counts)[::-1] groups = groups[order] counts = counts[order] del adj_mat, order num_to_keep = min(num_to_keep, groups.size) # msg = '\t{} component groups for {} total nodes' logger.debug(msg.format(groups.size, cs_ids.size)) msg = '\tlargest group number: {}, size {}' logger.debug(msg.format(groups[0], counts[0])) msg = '\t{} % of nodes contained in largest group' logger.debug(msg.format(counts[0] / cs_ids.size * 100)) msg = '\t{} % of nodes contained in {} retained groups' num = sp.sum(counts[0:num_to_keep]) / cs_ids.size * 100 logger.debug(msg.format(num, num_to_keep)) # # creating image colored by clusters if desired if kwargs.get('output_img', False): save_cluster_image(cs_ids, groups, counts, nonzero_locs, kwargs.get('img_shape'), kwargs.get('img_name')) # inds = sp.where(sp.in1d(cs_ids, groups[0:num_to_keep]))[0] del cs_ids, groups, counts # num = nonzero_locs.size nonzero_locs = nonzero_locs[inds] msg = '\tremoved {} disconnected nodes' logger.debug(msg.format(num - nonzero_locs.size)) # return nonzero_locs
def storey_tibishirani(p_values=None, sort_idx=None, return_sort_idx=False): p_values = p_values.ravel() if sort_idx is None: sort_idx = sp.argsort(p_values) p_values = p_values[sort_idx] else: sort_idx = sort_idx.ravel() p_values = p_values[sort_idx] m = p_values.shape[0] if m < 100: #if number if tests is too small use pi0=1 pi0 = 1.0 else: # otherwise estimate pi0 using a natural cubic spline #evaluate pi0 for a set of lambdas pi0 = [] lambdas = sp.arange(0.01, 0.96, 0.01) counts = [] for __lambda in lambdas: counts.append((p_values > __lambda).sum()) counts = sp.array(counts) for i in xrange(lambdas.shape[0]): pi0.append(counts[i] / float(m * (1.0 - lambdas[i]))) pi0 = sp.array(pi0) splrep = interpolate.splrep(lambdas, pi0, k=3) pi0 = interpolate.splev(lambdas[-1], splrep) if pi0 > 1.0: pi0 = 1.0 q_values = pi0 * p_values #q_values[-1] = sp.minimum(q_values[-1],1.0) for i in xrange(m - 2, -1, -1): q_values[i] = sp.minimum(pi0 * m * p_values[i] / float(i + 1.0), q_values[i + 1]) #resort q_values q_tmp = q_values.copy() q_values[sort_idx] = q_tmp if return_sort_idx == True: return [q_values, sort_idx] else: return q_values
def call_dfa(chrom, xdata, DFs, mask, data): """Runs DFA on subset of variables from "xdata" as defined by "chrom" and returns a vector of fitness scores to be fed back into the GA """ Y = [] for x in range(len(chrom)): if _remdup(chrom[x]) == 0: #extract vars from xdata slice = meancent(_slice(xdata, chrom[x])) collate = 0 for nF in range(mask.shape[1]): #split in to training and test tr_slice, cv_slice, ts_slice, tr_grp, cv_grp, ts_grp, tr_nm, cv_nm, ts_nm = _split( slice, data['class'][:, 0], mask[:, nF].tolist(), data['label']) try: u, v, eigs, dummy = cva(tr_slice, tr_grp, DFs) projU = scipy.dot(cv_slice, v) u = scipy.concatenate((u, projU), 0) group2 = scipy.concatenate((tr_grp, cv_grp), 0) B, W = _BW(u, group2) L, A = scipy.linalg.eig(B, W) order = _flip( scipy.argsort(scipy.reshape(L.real, (len(L), )))) Ls = _flip(scipy.sort(L.real)) eigval = Ls[0:DFs] collate += sum(eigval) except: continue if collate != 0: Y.append(float(mask.shape[1]) / collate) else: Y.append(10.0**5) else: Y.append(10.0**5) return scipy.array(Y)[:, nA]
def qqplot(pv, distr = 'log10', alphaLevel = 0.05): """ This script makes a Quantile-Quantile plot of the observed negative log P-value distribution against the theoretical one under the null. Input: pv pvalues (numpy array) distr scale of the distribution (log10 or chi2) alphaLevel significance bounds """ shape_ok = (len(pv.shape)==1) or ((len(pv.shape)==2) and pv.shape[1]==1) assert shape_ok, 'qqplot requires a 1D array of p-values' tests = pv.shape[0] pnull = (0.5 + sp.arange(tests))/tests # pnull = np.sort(np.random.uniform(size = tests)) Ipv = sp.argsort(pv) if distr == 'chi2': qnull = sp.stats.chi2.isf(pnull, 1) qemp = (sp.stats.chi2.isf(pv[Ipv],1)) xl = 'LOD scores' yl = '$\chi^2$ quantiles' if distr == 'log10': qnull = -sp.log10(pnull) qemp = -sp.log10(pv[Ipv]) xl = '-log10(P) observed' yl = '-log10(P) expected' plt.plot(qnull, qemp, '.') #plt.plot([0,qemp.m0x()], [0,qemp.max()],'r') plt.plot([0,qnull.max()], [0,qnull.max()],'r') plt.ylabel(xl) plt.xlabel(yl) if alphaLevel is not None: if distr == 'log10': betaUp, betaDown, theoreticalPvals = _qqplot_bar(M=tests,alphaLevel=alphaLevel,distr=distr) lower = -sp.log10(theoreticalPvals-betaDown) upper = -sp.log10(theoreticalPvals+betaUp) plt.fill_between(-sp.log10(theoreticalPvals),lower,upper,color='grey',alpha=0.5)
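# Usage sketch: QQ-plot of 1000 uniform (null) p-values. Assumes the module-level
# `sp` (scipy) and `plt` (matplotlib.pyplot) imports; alphaLevel=None skips the
# _qqplot_bar confidence-band helper.
import numpy as np
null_pv = np.random.uniform(size=1000)
qqplot(null_pv, distr='log10', alphaLevel=None)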
def mds(SM):
    """
    MDS (Multi Dimensional Scaling)
    @param SM Input similarity matrix
    @return V1: ndarray Dimension 1 of MDS
            V2: ndarray Dimension 2 of MDS
    """
    N = SM.shape[0]

    # Build the squared-distance matrix
    D = SM * SM

    # Centering matrix
    one = sp.eye(N) - sp.ones((N, N)) / N

    # Young-Householder transformation
    P = -0.5 * one * D * one
    # Doesn't this end up as an element-wise product rather than a matrix product?
    # P = -0.5 * sp.dot( sp.dot(one, D), one )

    # Eigendecomposition
    W, V = sp.linalg.eig(P)
    ind = sp.argsort(W)
    x1 = ind[-1]
    x2 = ind[-2]

    # Scale the data by the standard deviation
    # s = P.std(axis=0)
    # w1 = s[x1]
    # w2 = s[x2]
    # V1 = w1 * V[:, x1]
    # V2 = w2 * V[:, x2]
    V1 = V[:, x1]
    V2 = V[:, x2]

    # Convert to real values
    V1 = V1.astype('float')
    V2 = V2.astype('float')

    return V1, V2
def RBFKernelPCA(matrix=None, gamma=1, n_components=2):
    n = matrix.shape[0]
    #1. Compute RBF Kernel
    kernelmat = np.exp(-gamma * (distance.cdist(matrix, matrix, metric='euclidean')))
    #2. Center kernel matrix
    center = np.identity(n) - np.ones((n, n)) / n
    cen_kernelmat = center @ kernelmat @ center
    #3. Compute eigenvalues and eigenvectors
    [eigen_values, eigen_vectors] = linalg.eig(cen_kernelmat)
    #4. sort eigen vectors in decreasing order based on eigen values
    indices = sp.argsort(-eigen_values)
    [eigen_values, eigen_vectors] = [sp.real(eigen_values[indices]), eigen_vectors[:, indices]]
    # should not be negative: make them unit length
    #first two PCs:
    A = np.sqrt(1 / eigen_values[:n_components]) * eigen_vectors[:, :n_components]
    #5. Return transformed data
    return sp.dot(A.T, cen_kernelmat.T).T
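# Usage sketch: project a small random 2-D data set with the RBF-kernel PCA above.
# np/sp/distance/linalg are assumed to be numpy, scipy, scipy.spatial.distance and
# scipy.linalg as imported by this module; the output may carry a complex dtype
# with negligible imaginary parts.
import numpy as np
X_toy = np.random.rand(20, 2)
X_kpca = RBFKernelPCA(matrix=X_toy, gamma=15, n_components=2)   # shape (20, 2)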
def test_continuous_non_gaussian(db_path, sampler): def model(args): return {"result": sp.rand() * args['u']} models = [model] models = list(map(SimpleModel, models)) population_size = ConstantPopulationSize(250) parameter_given_model_prior_distribution = [Distribution(u=RV("uniform", 0, 1))] abc = ABCSMC(models, parameter_given_model_prior_distribution, MinMaxDistanceFunction(measures_to_use=["result"]), population_size, eps=MedianEpsilon(.2), sampler=sampler) d_observed = .5 abc.new(db_path, {"result": d_observed}) abc.do_not_stop_when_only_single_model_alive() minimum_epsilon = -1 history = abc.run(minimum_epsilon, max_nr_populations=2) posterior_x, posterior_weight = history.get_distribution(0, None) posterior_x = posterior_x["u"].values sort_indices = sp.argsort(posterior_x) f_empirical = sp.interpolate.interp1d(sp.hstack((-200, posterior_x[sort_indices], 200)), sp.hstack((0, sp.cumsum( posterior_weight[ sort_indices]), 1))) @sp.vectorize def f_expected(u): return (sp.log(u)-sp.log(d_observed)) / (- sp.log(d_observed)) * \ (u > d_observed) x = sp.linspace(0.1, 1) max_distribution_difference = sp.absolute(f_empirical(x) - f_expected(x)).max() assert max_distribution_difference < 0.12
def auc(y, prob, w):
    if len(w) == 0:
        mindiff = scipy.amin(scipy.diff(scipy.unique(prob)))
        pert = scipy.random.uniform(0, mindiff / 3, prob.size)
        t, rprob = scipy.unique(prob + pert, return_inverse=True)
        n1 = scipy.sum(y, keepdims=True)
        n0 = y.shape[0] - n1
        u = scipy.sum(rprob[y == 1]) - n1 * (n1 + 1) / 2
        result = u / (n1 * n0)
    else:
        op = scipy.argsort(prob)
        y = y[op]
        w = w[op]
        cw = scipy.cumsum(w)
        w1 = w[y == 1]
        cw1 = scipy.cumsum(w1)
        wauc = scipy.sum(w1 * (cw[y == 1] - cw1))
        sumw = cw1[-1]
        # positive weight times negative weight; the original referenced an undefined
        # `c1`, assumed here to mean the full cumulative-weight vector `cw`
        sumw = sumw * (cw[-1] - sumw)
        result = wauc / sumw
    return (result)
def train(self, X):
    # Center the data
    self.X_mean = X.mean(0)
    X_centered = X - self.X_mean

    # Build the covariance matrix
    V = sp.cov(X_centered.T)

    # Compute the eigenvalues of V
    self.eigvals, self.eigvecs = linalg.eig(V)

    # Take the n_components largest eigenvalues and stack the corresponding
    # eigenvectors to form the basis
    eigvals_idx = sp.argsort(self.eigvals)
    eigvals_idx = eigvals_idx[len(eigvals_idx)::-1]
    self.U = self.eigvecs[eigvals_idx[:self.n_components]]

    # Project the points onto the basis vectors
    X_pca = sp.dot(self.U, X_centered.T)
    X_pca = X_pca.T

    return X_pca, self.U
def cubeIndex_RWGNumbers_computation(RWGNumber_cubeNumber, RWGNumber_cubeCentroidCoord): """each finest-level cube must somehow know which edges it contains. This function has the goal of establishing this list for every cube. Only the cubes containing edges will be retained. We also create a list of the cubes centroids, which will be ordered the same way as the cubes_lists_edges_numbers list.""" E = RWGNumber_cubeNumber.shape[0] # the number of RWGs involved ind_sorted_cubes_numbers = argsort(RWGNumber_cubeNumber, kind='mergesort') sorted_cubes_numbers = take(RWGNumber_cubeNumber, ind_sorted_cubes_numbers, axis=0) sorted_edges_numbers = take(arange(E), ind_sorted_cubes_numbers, axis=0) sorted_edges_numbers_cubes_centroids = take(RWGNumber_cubeCentroidCoord, ind_sorted_cubes_numbers, axis=0) cubes_lists_edges_numbers = {} # the desired dictionary, renewed for each cube cube_list_edges_numbers_tmp = [sorted_edges_numbers[0]] # the temporary list, renewed for each cube cubes_centroids = [sorted_edges_numbers_cubes_centroids[0]] cubeIndex = 0 for j in range(E-1): # we cannot go up to (E-1), since (j+1) will then be equal to E (out of bound index) if sorted_cubes_numbers[j+1] == sorted_cubes_numbers[j]: # if the next cube number is the same as the current one cube_list_edges_numbers_tmp.append(sorted_edges_numbers[j+1]) # add the next element to the temporary list else: # if not, we then add the temporary "per-cube" list to the complete list cubes_lists_edges_numbers[cubeIndex] = array(cube_list_edges_numbers_tmp) cubes_centroids.append(sorted_edges_numbers_cubes_centroids[j+1]) cube_list_edges_numbers_tmp = [sorted_edges_numbers[j+1]] # init of the temporary list for the next cube cubeIndex += 1 # we must append the last temporary list if cubeIndex in cubes_lists_edges_numbers: cubes_lists_edges_numbers[cubeIndex+1] = array(cube_list_edges_numbers_tmp) else: cubes_lists_edges_numbers[cubeIndex] = array(cube_list_edges_numbers_tmp) # we transform the "cubes_lists_edges_numbers" in a linear array, useful for the C++ code C = len(cubes_lists_edges_numbers) cubes_edges_numbers = zeros(E, 'i') cube_N_RWGs = zeros(C, 'i') startIndex = 0 for j in range(C): length = cubes_lists_edges_numbers[j].shape[0] cube_N_RWGs[j] = length cubes_edges_numbers[startIndex:startIndex + length] = cubes_lists_edges_numbers[j] startIndex += length return cubes_edges_numbers, cubes_lists_edges_numbers, cube_N_RWGs.astype('i'), (array(cubes_centroids)).astype('d')
def qqplot(pv, outPlot, color="#2c7fb8", label='unknown', alphaLevel=0.05): distr = 'log10' ax = plt.gca() #if (len(pv.shape) == 1) or ((len(pv.shape) == 2) and pv.shape[1] == 1): # die("qqplot requires a 1D array of p-values") tests = pv.shape[0] pnull = (0.5 + sp.arange(tests)) / tests Ipv = sp.argsort(pv) if distr == 'log10': qnull = -sp.log10(pnull) qemp = -sp.log10(pv[Ipv]) xl = '-log10(P) observed' yl = '-log10(P) expected' plt.plot(qnull, qemp, '.', color=color, label=label) # plt.plot([0,qemp.m0x()], [0,qemp.max()],'r') plt.plot([0, qnull.max()], [0, qnull.max()], 'r') plt.ylabel(xl) plt.xlabel(yl) if alphaLevel is not None: if distr == 'log10': betaUp, betaDown, theoreticalPvals = _qqplot_bar( M=tests, alphaLevel=alphaLevel, distr=distr) lower = -sp.log10(theoreticalPvals - betaDown) upper = -sp.log10(theoreticalPvals + betaUp) plt.fill_between(-sp.log10(theoreticalPvals), lower, upper, color='grey', alpha=0.5) # plt.plot(-sp.log10(theoreticalPvals),lower,'g-.') # plt.plot(-sp.log10(theoreticalPvals),upper,'g-.') ax.spines["right"].set_visible(False) ax.spines["top"].set_visible(False) ax.xaxis.set_ticks_position('bottom') ax.yaxis.set_ticks_position('left') fig = ax.get_figure() fig.savefig(outPlot)
def plot_pct_fwds(self): """ Plot the pct of forward motion for each genotype. """ Nn = len(self.dirs_to_plot) for iD, dir in enumerate(self.dirs_to_plot): filename = os.path.join(dir, 'pct_fwd.txt') tmp_data = sp.loadtxt(filename) if self.data is None: self.data = sp.zeros((Nn, len(tmp_data))) self.data[iD, :] = tmp_data # Get average fwd_pcts with error bars (1 sem) self.avgs = sp.average(self.data, axis=1)*100 self.stds = sp.std(self.data, axis=1)*100 sort_idxs = sp.argsort(self.avgs)[::-1] # Make empty zero index if not yet (hacky!!) if sort_idxs[0] != 0: zero_idx = sp.argwhere(sort_idxs == 0)[0] change_idx = sort_idxs[zero_idx] sort_idxs[zero_idx] = sort_idxs[0] sort_idxs[0] = 0 sort_labels = self.genotypes[sort_idxs] sort_avgs = self.avgs[sort_idxs] sort_stds = self.stds[sort_idxs] # Plot for each genotype fig = plt.figure() fig.set_size_inches(3, 4) plt.errorbar(range(Nn), sort_avgs, sort_stds, lw=0, elinewidth=1.5, capsize=5, color='k') plt.scatter(range(Nn), sort_avgs, c=sp.arange(Nn), cmap=plt.cm.winter, zorder=100, s=30) plt.ylim(0, 105) plt.xticks(rotation=90) plt.xticks(range(Nn), sort_labels)
def RBFKernelPCA(matrix=None, gamma=1, n_components=2): #1. Compute RBF Kernel #2. Center kernel matrix #3. Compute eigenvalues and eigenvactors #4. sort eigen vectors in decreasing order based on eigen values #5. Return transformed data for the first n_components d = distance.pdist(matrix, 'sqeuclidean') m1 = distance.squareform(d) K = np.exp(-gamma * m1) o1 = np.ones(K.shape) / K.shape[0] I = np.identity(K.shape[0]) Kp = np.dot(np.dot((I - o1), K), (I - o1)) eigen_values, eigen_vectors = computePCA_SVD(Kp) indices = sp.argsort(-eigen_values) seigen_values = sp.real(eigen_values[indices]) seigen_vectors = eigen_vectors[:, indices] for i in range(len(seigen_values)): sseigen_vectors = np.sqrt(1 / seigen_values[i]) * seigen_vectors transformed = transformData(sseigen_vectors[:, 0:n_components], Kp) return transformed