def _set_covmatrix(self, covmatrix=None):
    """Builds covmatrix from self.pars. If setting from an externally
    provided covariance matrix then updates self.pars for consistency"""
    # If covmatrix hasn't been provided, generate from self._pars
    # and set.
    if covmatrix is None:
        dx = self._pars[6]
        dv = self._pars[7]
        self._covmatrix = np.identity(6)
        self._covmatrix[:3, :3] *= dx ** 2
        self._covmatrix[3:, 3:] *= dv ** 2
    # If covmatrix has been provided, reverse engineer the most
    # suitable set of parameters and update self._pars accordingly
    # (e.g. take the geometric mean of the (square-rooted) velocity
    # eigenvalues as dv, as this at least ensures constant volume
    # in velocity space).
    else:
        self._covmatrix = np.copy(covmatrix)
        dx = gmean(np.sqrt(
            np.linalg.eigvalsh(self._covmatrix[:3, :3]))
        )
        dv = gmean(np.sqrt(
            np.linalg.eigvalsh(self._covmatrix[3:, 3:]))
        )
        self._pars[6] = dx
        self._pars[7] = dv
        self.set_sphere_stds()
def score(self, h, ref, postag=False, hpos=[], refpos=[], wts=[]):
    """ Weights are for ngram weights in the average """
    score = 0.0
    if len(h) > 0:
        ngram_precisions = self.ngram_precisions(h, ref)
        bp = self.brevity_penalty(h, ref)
        if postag:
            postag_ngram_precisions = self.ngram_precisions(hpos, refpos)
            if wts:
                score = bp * (1-self.beta)*self.wgmean(ngram_precisions, wts) + \
                    self.beta*self.wgmean(postag_ngram_precisions, wts)
            else:
                score = bp * (1-self.beta)*gmean(ngram_precisions) + \
                    self.beta*gmean(postag_ngram_precisions)
        else:
            if wts:
                score = bp * self.wgmean(ngram_precisions, wts)
            else:
                score = bp * gmean(ngram_precisions)
    return score
def impute_missing_total_reads(total_reads, missing_variant_confidence):
    # Change NaNs to masked values via SciPy.
    masked_total_reads = ma.fix_invalid(total_reads)

    # Going forward, suppose you have v variants and s samples in a v*s matrix of
    # read counts. Missing values are masked.

    # Calculate geometric mean of variant read depth in each sample. Result: s*1
    sample_means = gmean(masked_total_reads, axis=0)
    assert np.sum(sample_means <= 0) == np.sum(np.isnan(sample_means)) == 0
    # Divide every variant's read count by its mean sample read depth to get read
    # depth enrichment relative to other variants in sample. Result: v*s
    normalized_to_sample = np.dot(masked_total_reads, np.diag(1./sample_means))
    # For each variant, calculate geometric mean of its read depth enrichment
    # across samples. Result: v*1
    variant_mean_reads = gmean(normalized_to_sample, axis=1)
    assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0

    # Convert 1D arrays to vectors to permit matrix multiplication.
    imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)),
                            sample_means.reshape((1, -1)))
    nan_coords = np.where(np.isnan(total_reads))
    total_reads[nan_coords] = imputed_counts[nan_coords]
    assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0

    total_reads[nan_coords] *= missing_variant_confidence
    return np.floor(total_reads).astype(np.int)
def test_2D(self):
    a = ma.array(((1, 2, 3, 4), (1, 2, 3, 4), (1, 2, 3, 4)),
                 mask=((0, 0, 0, 0), (1, 0, 0, 1), (0, 1, 1, 0)))
    actual = mstats.gmean(a)
    desired = np.array((1, 2, 3, 4))
    assert_array_almost_equal(actual, desired, decimal=14)

    desired1 = mstats.gmean(a, axis=0)
    assert_array_almost_equal(actual, desired1, decimal=14)

    actual = mstats.gmean(a, -1)
    desired = ma.array((np.power(1 * 2 * 3 * 4, 1.0 / 4.0),
                        np.power(2 * 3, 1.0 / 2.0),
                        np.power(1 * 4, 1.0 / 2.0)))
    assert_array_almost_equal(actual, desired, decimal=14)
def set_sphere_stds(self):
    """
    Set the spherical standard deviations in position space and velocity
    space. Calculated in such a way so as to preserve volume in position
    space and velocity space respectively.

    Note that combined phase-space volume is not conserved by this
    implementation.
    """
    self._sphere_dx = gmean(np.sqrt(
        np.linalg.eigvalsh(self._covmatrix[:3, :3]))
    )
    self._sphere_dv = gmean(np.sqrt(
        np.linalg.eigvalsh(self._covmatrix[3:, 3:]))
    )
def make_histos(v, s, nIon, size, nbins, ion):
    '''
    Bins up the x and y data into nbins.
    Value in each bin is the geometric mean of column density.
    '''
    # Find column density
    column = np.zeros(len(size))
    for i, (n, l) in enumerate(zip(nIon, size)):
        # Take the cube root of the cell length and convert from kpc to cm
        length = l**(1./3.) * 3.086e21
        column[i] = (10**n) * length
    print min(column)
    print max(column)

    vmin = -100
    vmax = 100
    smin = -220
    smax = 220

    print 'Making histogram'
    H, xed, yed = np.histogram2d(v, s, bins=nbins,
                                 range=[[vmin, vmax], [smin, smax]])
    h = np.zeros_like(H)
    print 'Histogram done\n'

    for i in range(0, H.shape[0]):
        for j in range(0, H.shape[1]):
            # #rows = shape[0]
            # #cols = shape[1]
            vmin = xed[j]
            vmax = xed[j+1]
            smin = yed[i]
            smax = yed[i+1]
            val = []
            for k, (vel, sloc) in enumerate(zip(v, s)):
                if vel > vmin and vel < vmax and sloc > smin and sloc < smax:
                    val.append(column[k])
            print np.log10(min(val))
            print np.log10(max(val))
            print np.log10(np.mean(val))
            print np.log10(gmean(val))
            print vmin, vmax
            print smin, smax
            h[i, j] = gmean(val)
            print h[i, j]

    h = np.log10(h)
    np.savetxt('{0:s}_velHist.out'.format(ion), h)
    return h, xed, yed
def geoMean(vals):
    vals = array(vals)
    if len(unique(sign(vals))) != 1:
        raise ArithmeticError("Sequence of numbers for geometric mean must be all positive or all negative")
    vals = numpy.abs(vals)
    m = gmean(vals)
    return m
def compute_result_dist_prodll_allt(self, all_variables):
    '''
    Given outputs from FitExperimentAllT, will compute the geometric mean
    of the LL.

    UGLY HACK: in order to keep track of the minLL, we return it here.
    You should have a cma_iter_function that cleans it before
    cma_es.tell() is called...
    '''
    if 'result_ll_sum' in all_variables:
        repetitions_axis = all_variables.get('repetitions_axis', -1)

        # Shift to get LL > 0 always
        currMinLL = np.min(all_variables['result_ll_sum'])
        if currMinLL < all_variables['all_parameters']['shiftMinLL']:
            all_variables['all_parameters']['shiftMinLL'] = currMinLL

        # Remove the current minLL, to make sure fitness > 0
        print 'Before: ', all_variables['result_ll_sum']
        all_variables['result_ll_sum'] -= all_variables['all_parameters']['shiftMinLL']
        all_variables['result_ll_sum'] += 0.001
        print 'Shifted: ', all_variables['result_ll_sum']

        result_dist_nll_geom = -mstats.gmean(
            utils.nanmean(all_variables['result_ll_sum'],
                          axis=repetitions_axis),
            axis=-1)

        print result_dist_nll_geom

        return np.array([
            result_dist_nll_geom,
            all_variables['all_parameters']['shiftMinLL']
        ])
    else:
        raise ValueError('result_ll_sum was not found in the outputs')
def testCombo(paramCombo, use_datasets, numExamples, compute_mistakes=False,
              verbose=False, parallelize=True):
    paramsStr = getParamsString(paramCombo)
    if parallelize:
        sys.stdout = open(str(os.getpid()) + ".out", "a")

    # Create length penalty function from params
    if 'lengthPenaltyParams' in paramCombo:
        power, firstDenom, secondDenom = paramCombo['lengthPenaltyParams']
        paramCombo['lengthPenaltyFn'] = lambda x: x**power/firstDenom if x < 4 else x/secondDenom
        del paramCombo['lengthPenaltyParams']

    # Create ngram penalty and adjacency boost functions from params
    if 'ngramPenaltyParams' in paramCombo:
        constant = paramCombo['ngramPenaltyParams']
        paramCombo['ngramPenaltyFn'] = lambda length, count: constant * float(length) / np.sqrt(count)
        del paramCombo['ngramPenaltyParams']
    if 'ngramAdjacentBoostParams' in paramCombo:
        constant = paramCombo['ngramAdjacentBoostParams']
        paramCombo['ngramAdjacentBoostFn'] = lambda length, count: constant * np.sqrt(length * count)
        del paramCombo['ngramAdjacentBoostParams']

    constructor = paramCombo[MODEL_KEYWORD]
    del paramCombo[MODEL_KEYWORD]
    model = constructor(**paramCombo)
    paramCombo.update({MODEL_KEYWORD: constructor})

    results = model.evaluate(numExamples=numExamples, compute_mistakes=compute_mistakes,
                             verbose=verbose, use_datasets=use_datasets)
    score = gmean([results[dataset][0] for dataset in use_datasets])

    print "Parameters:\n%s" % (paramsStr)
    print "Score: {}\n\n\n".format(score)
    return score, paramsStr, paramCombo
def getRatio(self, parent_count_dict, child_count_dict, values):
    ratio_list = []
    parent_sum = 0
    parent_count = 0
    for value in values:
        if parent_count_dict[value] == 0:
            if child_count_dict[value] == 0:
                ratio_list.append(0)
            else:
                ratio_list.append(float("inf"))
        else:
            ratio_list.append(float(child_count_dict[value]) / parent_count_dict[value])
            parent_count += 1
            parent_sum += parent_count_dict[value]
    ## FOR
    if len(ratio_list) == 0:
        ratio_list.append(0)
    ## IF
    if parent_count > 0:
        parent_average = float(parent_sum) / parent_count
    else:
        parent_average = 0
    ## IF
    return mstats.gmean(ratio_list), parent_average
def fit_logistic_GLM(X, y, C_value=np.array([-4, 5]), num_cv=5, verbose=False,
                     intercept_scaling=10, penalty='l1', reg_strength=None,
                     plot_results=False):
    scores_to_return = []
    loss_score = []
    X = pp.scale(X)

    # If regularization strength isn't specified, CV to find it
    if reg_strength == None:
        kf = SKFold(y=y, n_folds=num_cv)
        C_values = np.logspace(C_value[0], C_value[1], 10)
        C_dict = {"C": C_values}
        best_param = []
        #------------------------------------------------------------------------------
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            # Do grid search for regularization parameter
            clf = GSCV(
                LR(C=1, penalty=penalty, dual=False, intercept_scaling=intercept_scaling),
                C_dict, cv=num_cv)
            # Fit model
            clf.fit(X_train, y_train)
            best_param.append(clf.best_params_['C'])
            if verbose:
                for params, mean_score, scores in clf.grid_scores_:
                    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))
            if verbose:
                print np.mean(np.asarray(scores))
        reg_strength = gmean(best_param)
    #------------------------------------------------------------------------------
    kf2 = SKFold(y=y, n_folds=num_cv)
    clf = []
    clf_temp = LR(penalty=penalty, dual=False, C=reg_strength,
                  intercept_scaling=intercept_scaling)
    for train, test in kf2:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        clf_temp.fit(X_train, y_train)
        scores_to_return.append(clf_temp.score(X_test, y_test))
        clf.append(clf_temp)
        pred = clf_temp.predict_proba(X_test)[:, 1]
        loss_score.append(lossFx(y_test, pred))
    #------------------------------------------------------------------------------
    # Plot results
    if plot_results:
        plot_logistic_fit(clf, X, kf2)

    # Returns model, scores of each CV, best C parameter, CV fold indices
    return clf, scores_to_return, loss_score, reg_strength, kf2
def test_1D(self):
    a = (1, 2, 3, 4)
    actual = mstats.gmean(a)
    desired = np.power(1*2*3*4, 1./4.)
    assert_almost_equal(actual, desired, decimal=14)

    desired1 = mstats.gmean(a, axis=-1)
    assert_almost_equal(actual, desired1, decimal=14)
    assert_(not isinstance(desired1, ma.MaskedArray))

    a = ma.array((1, 2, 3, 4), mask=(0, 0, 0, 1))
    actual = mstats.gmean(a)
    desired = np.power(1*2*3, 1./3.)
    assert_almost_equal(actual, desired, decimal=14)

    desired1 = mstats.gmean(a, axis=-1)
    assert_almost_equal(actual, desired1, decimal=14)
def main(argv):
    parsed = parse_args(argv)
    # check directory
    if not parsed.dir_fimo.endswith("/"):
        parsed.dir_fimo += "/"

    # get the lists of tfs, targets
    rids = numpy.loadtxt(parsed.fn_rids, dtype=str)
    gids = numpy.loadtxt(parsed.fn_gids, dtype=str)
    adjmtr = numpy.zeros([len(rids), len(gids)])

    # build the adjmtr
    lines = open(parsed.fn_infer, "r").readlines()
    for i in range(1, len(lines)):
        # get inferred tf and database motif
        linesplit = lines[i].strip().split('\t')
        infer_tf = linesplit[0]
        zscore = float(linesplit[3])
        if parsed.inference_method == 'cisbp':
            infer_motifs = linesplit[1].split(',')
        elif parsed.inference_method == 'fire':
            infer_motifs = linesplit[0].split(',')
        else:
            sys.exit("Inference method not specified.")

        if zscore >= parsed.zscore_thld:
            # get fimo scores for the inferred motif
            index = numpy.where(rids == infer_tf)[0]
            if len(index) > 0:
                if len(infer_motifs) > 1:
                    temp_mtr = numpy.zeros([len(infer_motifs), len(gids)])
                    for j in range(len(infer_motifs)):
                        fn_motif = parsed.dir_fimo + infer_motifs[j] + parsed.summary_suffix
                        # fn_motif = parsed.dir_fimo + infer_motifs[j] + ".summary"
                        # fn_motif = parsed.dir_fimo + infer_motifs[j] + ".summary_mask3_cons_thd_0.5"
                        if os.path.isfile(fn_motif):
                            dict_scores = get_fimo_scores(fn_motif)
                            for k in range(len(gids)):
                                t = gids[k]
                                temp_mtr[j, k] = dict_scores[t] if t in dict_scores else 0
                    adjmtr[index[0], :] = gmean(temp_mtr).data
                else:
                    fn_motif = parsed.dir_fimo + infer_motifs[0] + parsed.summary_suffix
                    # fn_motif = parsed.dir_fimo + infer_motifs[0] + ".summary"
                    # fn_motif = parsed.dir_fimo + infer_motifs[0] + ".summary_mask3_cons_thd_0.5"
                    if os.path.isfile(fn_motif):
                        dict_scores = get_fimo_scores(fn_motif)
                        for j in range(len(gids)):
                            t = gids[j]
                            adjmtr[index[0], j] = dict_scores[t] if t in dict_scores else 0

    # write adjmtr file
    write_adjmtr(adjmtr, parsed.fn_adjmtr)
def rotatecube(self, theta=0, phi=0, trim=0):
    "angles in degrees"
    if (int(theta) % 360) != 0:
        rho = rotate(self.rho, theta, (0, 2), mode="nearest", order=1)
        rhoN = rotate(self.rhoN, theta, (0, 2), mode="nearest", order=1)
        t = rotate(self.t, theta, (0, 2), mode="nearest", order=1)
        v = rotate(self.V, theta, (1, 3), mode="nearest", order=1)
        M = Ry(theta * pi / 180)
        # set numbers with abs value less than 10 times the floating point epsilon to 0
        M[abs(M) < (finfo(1.0).eps * 10)] = 0
        f = lambda x: (x * M).flat
        v = apply_along_axis(f, 0, v)
        if (int(phi) % 360) != 0:
            rho = rotate(rho, phi, (0, 1), mode="nearest", order=1)
            rhoN = rotate(rhoN, phi, (0, 1), mode="nearest", order=1)
            t = rotate(t, phi, (0, 1), mode="nearest", order=1)
            v = rotate(v, phi, (1, 2), mode="nearest", order=1)
            M = Rz(phi * pi / 180)
            M[abs(M) < (finfo(1.0).eps * 10)] = 0
            f = lambda x: (x * M).flat
            v = apply_along_axis(f, 0, v)
    elif (int(phi) % 360) != 0:
        rho = rotate(self.rho, phi, (0, 1), mode="nearest", order=1)
        rhoN = rotate(self.rhoN, phi, (0, 1), mode="nearest", order=1)
        t = rotate(self.t, phi, (0, 1), mode="nearest", order=1)
        v = rotate(self.V, phi, (1, 2), mode="nearest", order=1)
        M = Rz(phi * pi / 180)
        M[abs(M) < (finfo(1.0).eps * 10)] = 0
        f = lambda x: (x * M).flat
        v = apply_along_axis(f, 0, v)
    else:
        rho = self.rho.copy()
        t = self.t.copy()
        v = self.V.copy()

    rho[rho < 1e-30] = 1e-30
    t[t < 1] = 1
    thresh = rho
    if trim:
        for _ in rho.shape:
            thresh = gmean(thresh, axis=0)
        sl = trimCube(rho, thresh * 5)
        self.rho = rho[sl]
        self.rhoN = rhoN[sl]
        self.t = t[sl]
        sl = [slice(0, 3)] + sl
        self.V = v[sl]
    else:
        self.rho = rho
        self.rhoN = rhoN
        self.t = t
        self.V = v
    try:
        self.dt[...] = 0
    except AttributeError:
        None
def adaptive_bandwidths(self):
    """Computes the bandwidths for the adaptive KDE."""
    key = "adaptive_bandwidths"
    if key not in self.cache:
        KDE_list = self.KDE_of_training_list(fixed=True, approx=False)
        geom_mean = gmean(KDE_list)
        lambdas = np.power(KDE_list/geom_mean, -0.5)
        self.cache[key] = lambdas * self.h
    return self.cache[key]
def independent_variable_model_collapse(model, independent_column_name="Frequency", **options):
    """Returns a model with a single set of independent variables. Default is to average
    values together, but geometric mean, std, variance, rms, rss, mad and median are options.
    A geometric mean of an odd number of negative values fails."""
    if isinstance(model, pandas.DataFrame):
        model_1 = DataFrame_to_AsciiDataTable(model)
    defaults = {"method": "mean"}
    # load other options from model
    for option, value in model.options.items():
        if not re.search('begin_line|end_line', option):
            defaults[option] = value
    for element in model.elements:
        if model.__dict__[element]:
            if re.search("meta", element, re.IGNORECASE):
                defaults["metadata"] = model.metadata.copy()
            else:
                defaults[element] = model.__dict__[element][:]
    # We need to preserve the frequency column somehow
    collapse_options = {}
    for key, value in defaults.items():
        collapse_options[key] = value
    for key, value in options.items():
        collapse_options[key] = value

    unique_independent_variable_list = sorted(list(set(model[independent_column_name])))
    independent_variable_selector = model.column_names.index(independent_column_name)
    out_data = []
    for index, independent_variable in enumerate(unique_independent_variable_list):
        data_row = [x for x in model.data[:] if x[independent_variable_selector] == independent_variable]
        if re.search('mean|av', collapse_options["method"], re.IGNORECASE):
            new_row = np.mean(np.array(data_row), axis=0).tolist()
        elif re.search('median', collapse_options["method"], re.IGNORECASE):
            new_row = np.median(np.array(data_row), axis=0).tolist()
        elif re.search('geometric', collapse_options["method"], re.IGNORECASE):
            new_row = gmean(np.array(data_row), axis=0).tolist()
        elif re.search('st', collapse_options["method"], re.IGNORECASE):
            new_row = np.std(np.array(data_row), axis=0).tolist()
        elif re.search('var', collapse_options["method"], re.IGNORECASE):
            new_row = np.var(np.array(data_row), axis=0, dtype=np.float64).tolist()
        elif re.search('rms', collapse_options["method"], re.IGNORECASE):
            new_row = np.sqrt(np.mean(np.square(np.array(data_row)), axis=0, dtype=np.float64)).tolist()
        elif re.search('rss', collapse_options["method"], re.IGNORECASE):
            new_row = np.sqrt(np.sum(np.square(np.array(data_row)), axis=0, dtype=np.float64)).tolist()
        elif re.search('mad', collapse_options["method"], re.IGNORECASE):
            new_row = mad(np.array(data_row), axis=0).tolist()
        new_row[independent_variable_selector] = independent_variable
        out_data.append(new_row)

    collapse_options["data"] = out_data
    if collapse_options["specific_descriptor"]:
        collapse_options["specific_descriptor"] = collapse_options["method"] + "_" + \
                                                  collapse_options["specific_descriptor"]

    resulting_model = AsciiDataTable(None, **collapse_options)
    return resulting_model
def centeredLogRatio(otu_table, otu_table_m):
    from scipy.stats.mstats import gmean
    noZeros = otu_table.copy().replace(0, np.nan)
    geomeans = np.repeat(np.nan, repeats=noZeros.shape[0])
    for i in range(0, noZeros.shape[0]):
        geomeans[i] = gmean(noZeros.ix[i, :].dropna())
    clr_table = np.log(noZeros.divide(geomeans, axis=0))
    clr_table.replace(np.nan, 0, inplace=True)
    clr_table_m = otu_table_m.copy()
    clr_table_m.ix[:, otu_table.columns] = clr_table
    return clr_table, clr_table_m
def summarise_metric(ys):
    flat = []
    for s in SERIES:
        flat += ys[s]
    m = gmean(flat)
    result = [0.0] * (len(flat) / len(SERIES))
    for s in SERIES:
        for i in range(len(result)):
            result[i] += ys[s][i] / m
    for i in range(len(result)):
        result[i] /= float(len(SERIES))
    return result
def ratings(d, ks, rating_index):
    avgs = []
    for k in ks:
        if not k in d:
            if rating_index < 2:
                avgs.append(50.0)
            else:
                avgs.append(5.0)
        else:
            v = d[k][rating_index]
            avgs.append(ms.gmean(v))
    return np.average(avgs)
def clear_data(RFs, n):
    p = 25
    Z, T = [], []
    Noise = np.load('noise.npy').reshape(n*n, p, p)
    cRFs = np.zeros((n*n, p, p))
    for i in range(n):
        for j in range(n):
            RF = RFs[i, j, ...]

            # WARNING : Centering the RF
            s0, s1 = np.unravel_index(np.argmax(RF), RF.shape)
            RF = np.roll(RF, 13-s0, axis=0)
            RF = np.roll(RF, 13-s1, axis=1)
            # WARNING : Centering the RF

            # RF += Noise[i*n+j]
            # RF = gaussian_filter( RF, sigma=2.2 )
            RF += 1.5*Noise[i*n+j]
            RF = gaussian_filter(RF, sigma=1.5)

            abs_max = np.max(np.abs(RF))
            RF[np.where(((RF < +0.10*abs_max) & (RF > 0)) |
                        ((RF > -0.10*abs_max) & (RF < 0)))] = 0
            RF = locate_noise(RF)
            cRFs[i*n+j, ...] = RF

            exc = 50.0 * (RF > 0).sum()/(p * p)
            inh = 50.0 * (RF < 0).sum()/(p * p)
            Z.append([exc, inh])

    Z = np.array(Z)
    np.nan_to_num(Z)

    print '------ Excitatory ------- Inhibitory -------'
    print 'Minimum :', Z[:, 0].min(), Z[:, 1].min()
    print 'Maximum :', Z[:, 0].max(), Z[:, 1].max()
    print 'Mean :', np.mean(Z[:, 0]), np.mean(Z[:, 1])
    print 'Mean :', np.mean(np.log10(Z[:, 0])), np.mean(np.log10(Z[:, 1]))
    print 'SD : ', np.std(np.log10(Z[:, 0])), np.std(np.log10(Z[:, 1]))
    print 'GMean :', gmean(Z[:, 0]), gmean(Z[:, 1])
    print "Pearson cor: ", pearsonr(Z[:, 0], np.abs(Z[:, 1]))

    return Z, cRFs
def gmean_bin(x, y, nIon, size, nbins, xlims, ylims, ion):
    '''
    Bins up the data according to x and y.
    Value in each bin is the geometric mean of the column density
    contribution of cells in that bin, as determined by multiplying
    nIon by size**1/3
    '''
    # Calculate the column density
    column = np.zeros(len(size))
    f = open('{0:s}_column.out'.format(ion), 'w')
    for i, (n, l) in enumerate(zip(nIon, size)):
        # Take the cube root of the cell length and convert from kpc to cm
        length = l**(1./3.) * 3.086e21
        col = n * length
        column[i] = col
        f.write('{0:.4e}\n'.format(col))
    f.close()

    # Make the bins
    xbins = np.linspace(xlims[0], xlims[1], nbins+1)
    ybins = np.linspace(ylims[0], ylims[1], nbins+1)

    # Determine what cells go in what bins
    xdig = np.digitize(x, xbins)
    ydig = np.digitize(y, ybins)

    # Fix the edge effects
    maxBinNum = len(xbins)
    for i in range(len(xdig)):
        if xdig[i] == maxBinNum:
            xdig[i] -= 1
        if ydig[i] == maxBinNum:
            ydig[i] -= 1

    # Create empty array
    h = np.zeros((nbins, nbins))

    # Loop through array
    for i in range(nbins):
        for j in range(nbins):
            # Find the indices where x and y belong to this bin
            bits = np.bitwise_and(xdig == i+1, ydig == j+1)
            if True in bits:
                h[i, j] = np.log10(gmean(column[bits]))

    h = np.rot90(h)
    h = np.flipud(h)
    np.savetxt('{0:s}_velHist.out'.format(ion), h)
    print 'Max of h: ', np.max(h)
    print 'Mean of h: ', np.mean(h)
    return h, xbins, ybins
def get_annot_kpts_baseline_weights(ibs, aid_list, config2_=None, config={}):
    r"""
    Returns weights based on distinctiveness and/or features score / or ones.
    Customized based on config.

    Args:
        qreq_ (QueryRequest): query request object with hyper-parameters
        aid_list (int): list of annotation ids
        config (dict):

    Returns:
        list: weights_list

    CommandLine:
        python -m ibeis.algo.hots.scoring --test-get_annot_kpts_baseline_weights

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.scoring import *  # NOQA
        >>> qreq_, cm = plh.testdata_scoring('testdb1')
        >>> aid_list = cm.daid_list
        >>> config = qreq_.qparams
        >>> # execute function
        >>> config2_ = qreq_.qparams
        >>> kpts_list = qreq_.ibs.get_annot_kpts(aid_list, config2_=config2_)
        >>> weights_list = get_annot_kpts_baseline_weights(qreq_.ibs, aid_list, config2_, config)
        >>> # verify results
        >>> depth1 = ut.get_list_column(ut.depth_profile(kpts_list), 0)
        >>> depth2 = ut.depth_profile(weights_list)
        >>> assert depth1 == depth2
        >>> print(depth1)
        >>> result = str(depth2)
        >>> print(result)
    """
    # TODO: clip the fgweights? (dilation?)
    # TODO; normalize and paramatarize and clean
    dcvs_on = config.get('dcvs_on')
    fg_on = config.get('fg_on')
    weight_lists = []
    if dcvs_on:
        qdstncvs_list = get_kpts_distinctiveness(ibs, aid_list, config2_, config)
        weight_lists.append(qdstncvs_list)
    if fg_on:
        qfgweight_list = ibs.get_annot_fgweights(aid_list, ensure=True, config2_=config2_)
        weight_lists.append(qfgweight_list)
    if len(weight_lists) == 0:
        baseline_weights_list = [np.ones(num, np.float)
                                 for num in ibs.get_annot_num_feats(aid_list, config2_=config2_)]
        #baseline_weights_list = [None] * len(aid_list)
    else:
        # geometric mean of the selected weights
        baseline_weights_list = [spmstat.gmean(weight_tup)
                                 for weight_tup in zip(*weight_lists)]
    return baseline_weights_list
def get_probs(md5, average_type='gmean'):
    temp = []
    for position, row in enumerate(md5_2_ind_joined[md5]):
        temp += [ms[position][row]]
    if average_type == 'mean':
        temp = scipy.sparse.vstack(temp).mean(axis=0)
    elif average_type == 'gmean':
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)
    temp[temp < 1e-6] = 0
    return md5, csr_matrix(temp)
def calculate_nf(sample_frame, ref_targets, ref_sample):
    """Calculates a normalization factor from the geometric mean of the expression of all
    ref_targets, normalized to a reference sample.

    :param DataFrame sample_frame: A sample data frame.
    :param iterable ref_targets: A list or Series of target names.
    :param string ref_sample: The name of the sample to normalize against.
    :return: a Series indexed by sample name containing normalization factors for each sample.
    """
    grouped = sample_frame.groupby(['Target', 'Sample'])['Cq'].aggregate(average_cq)
    samples = sample_frame['Sample'].unique()
    nfs = gmean([pow(2, -grouped.ix[zip(repeat(ref_gene), samples)] + grouped.ix[ref_gene, ref_sample])
                 for ref_gene in ref_targets])
    return pd.Series(nfs, index=samples)
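# A minimal usage sketch for calculate_nf, assuming a long-format qPCR table with
# 'Target', 'Sample', and 'Cq' columns. The gene and sample names below are purely
# illustrative, not taken from the original project, and the actual call is left
# commented out because calculate_nf also relies on the module's average_cq helper,
# itertools.repeat, and the legacy pandas .ix indexer.
import pandas as pd

example = pd.DataFrame({
    'Target': ['GAPDH', 'GAPDH', 'ACTB', 'ACTB'],
    'Sample': ['ctrl', 'treated', 'ctrl', 'treated'],
    'Cq':     [18.1, 18.4, 17.2, 17.9],
})
# nf = calculate_nf(example, ref_targets=['GAPDH', 'ACTB'], ref_sample='ctrl')
# nf would be a Series of per-sample normalization factors relative to 'ctrl'.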
def main():
    infile = open('analysis/PI_DataSet.txt')
    header = infile.readline().rstrip().rsplit()
    tpvdict = {}
    drvdict = {}
    mutlist = ['10F', '32I', '46I', '47V', '50V', '54L', '54M',
               '74P', '76V', '82T', '82F', '84V', '90M']
    for line in infile:
        line = line.rstrip().rsplit()
        muts = []
        for i, mut in enumerate(line[9:]):
            if str(i + 1) + mut in mutlist:
                muts.append(str(i + 1) + mut)
        if len(muts) == 1:
            mut = muts[0]
            if mut not in tpvdict:
                tpvdict[mut] = []
                drvdict[mut] = []
            if line[7] != 'NA':
                tpvdict[mut].append(float(line[7]))
            if line[8] != 'NA':
                drvdict[mut].append(float(line[8]))
    print tpvdict
    print drvdict
    infile.close()

    for mut in tpvdict:
        tpvdict[mut] = gmean(tpvdict[mut])
        drvdict[mut] = gmean(drvdict[mut])

    outfile = open('analysis/resistance_single.txt', 'w')
    outfile.write('mutation\tTPV\tDRV\n')
    for mut in tpvdict:
        outfile.write(mut + '\t' + str(tpvdict[mut]) + '\t' + str(drvdict[mut]) + '\n')
    outfile.close()
def get_probs(i, average_type='gmean'):
    image_name = file_names.loc[i, 'file_name']
    temp = []
    for j, m in enumerate(ms):
        temp += [m[i]]
    if average_type == 'mean':
        temp = scipy.sparse.vstack(temp).mean(axis=0)
    elif average_type == 'gmean':
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)
    temp[temp < 1e-6] = 0
    return file_to_md5[image_name], csr_matrix(temp)
def compositional_transform(self, add_pseudocount: bool = False):
    """Calculates the three Aitchison geometry transforms for the Ab counts.
    - alr uses the IgG1 counts as universal reference
    - ilr contrasts are based on the SVD of clr

    If add_pseudocount is set to True, 1 is added to prevent zero division;
    otherwise cells with zero counts in the denominator create inf and have to
    be filtered before downstream analysis, e.g.:
        clr_filter = np.isfinite(clr).all(axis=1)
        clr = clr[clr_filter, :]"""
    if add_pseudocount:
        self.clr_data = np.log((1+self.andat_raw.X)/gmean(self.andat_raw.X+1, axis=1).reshape(-1, 1))
        self.alr_data = np.log((1+self.andat_raw.X)/(self.andat_raw.X[:, -1]+1).reshape(-1, 1))
        U, s, Vt = np.linalg.svd(self.clr_data, full_matrices=False)
        self.ilr_data = np.dot(U*s, helmert(len(s)).T)
    else:
        self.clr_data = np.log(self.andat_raw.X/gmean(self.andat_raw.X, axis=1).reshape(-1, 1))
        self.alr_data = np.log(self.andat_raw.X/(self.andat_raw.X[:, -1].reshape(-1, 1)))
        finite_clr = np.isfinite(self.clr_data).all(axis=1)
        U, s, Vt = np.linalg.svd(self.clr_data[finite_clr, :], full_matrices=False)
        self.ilr_data = np.dot(U*s, helmert(len(s)).T)
    return
def run_IterLinQuadReg_matrix(self, A, B, C, dist_info_sharing='AM', us_init=None):
    x_input, u_input = self.state_inputs()
    if np.ndim(A) != 2:
        if dist_info_sharing == 'GM':
            A = gmean(A, axis=0)
            B = gmean(B, axis=0)
            C = gmean(C, axis=0)
            print(A.shape, 'A', B.shape, 'B', C.shape, 'C')
        elif dist_info_sharing == 'AM':
            A = np.sum(A, axis=0) / A.shape[0]
            B = np.sum(B, axis=0, keepdims=True) / B.shape[0]
            B = B.T
            C = np.sum(C, axis=0) / C.shape[0]
        else:
            pass

    f = self.next_states_matrix(x_input, u_input, A, B, C)
    dynamics = AutoDiffDynamics(f, x_input, u_input)
    x_goal = self.augment_state(self.x_goal)

    if self.Q_terminal.all() == None:
        cost = QRCost(self.Q, self.R)
    else:
        cost = QRCost(self.Q, self.R, Q_terminal=self.Q_terminal, x_goal=x_goal)

    x0 = self.augment_state(self.x0)
    if us_init == None:
        us_init = np.random.uniform(-1, 1, (self.N, dynamics.action_size))
    ilqr = iLQR(dynamics, cost, self.N)
    xs, us = ilqr.fit(x0, us_init, on_iteration=self.on_iteration)
    return xs, us
def scale_parameter(self, phi, psi, rotamer, residues, concentration, alpha, quadtree):
    r"""
    Calculates query-dependent scale parameter at (``phi``, ``psi``).

    When calculating the mean χ angles of a rotamer at a particular (φ, ψ)
    point, each χ angle is calculated as a weighted mean:

    .. math::

        \mu(\chi|\phi, \psi, r) = \frac{%
            \sum_i^{N_r} K_m(\phi - \phi_i) K_m(\psi - \psi_i) \chi_i
        }{%
            \sum_i^{N_r} K_m(\phi - \phi_i) K_m(\psi - \psi_i)
        }.

    The scale factor for the kernels :math:`K_m` is evaluated once at each
    query (φ, ψ) point. We follow the method of Shapovalov and Dunbrack (2010)
    and ensure that the bandwidth of the kernel encompasses at least 25 points.
    """
    pilot_estimates = to_numpy_array(
        residues, lambda res: res["rotamer_pilot"][rotamer])
    phi_psi_list = to_numpy_array(
        residues, lambda res: np.deg2rad(np.array(res["torsion"])))

    # Get the non-adaptive kde at the query point
    query_estimator = prob_density_kde(np.array([phi, psi]), phi_psi_list, concentration)
    scale_param = np.power(gmean(pilot_estimates) / query_estimator, alpha)

    # We want to expand the scale parameter in sparse regions.
    # We do this by following Shapovalov and Dunbrack.
    # First, we find the nearest 25 points. If the distance of the farthest
    # point is less than our required distance, we accept the current scale
    # parameter. Otherwise, we take the distance of the 25th point and
    # convert that into our scale factor.
    # The conversion between distances r and scale factors λ is given by
    # r = 1/√(k / λ). So λ = kr^2
    cutoff_distance = np.sqrt(scale_param / concentration)
    distances, _ = quadtree.query(np.array([phi, psi]), 25)
    if distances[-1] >= cutoff_distance:
        scale_param = concentration * distances[-1]**2
    return scale_param
def compute_hdbscan(data):
    global hdbscan_cache
    if 4 in hdbscan_cache:
        return hdbscan_cache

    # Recompute raw dataset scales, as the normalization may not be scale
    scales = data.values.div(data.values.max(axis=1), axis=0)
    test_data = dataset.from_values(data.features, scales, data.values)

    # Build map from number of classes to configs
    ranges = {}
    for c, s in itertools.product(CLUSTERS, SAMPLES):
        clusterer = hdbscan.HDBSCAN(metric='l2', min_cluster_size=c, min_samples=s)
        clusterer.fit(data.normalized)

        # Labels are 0-indexed
        n_classes = clusterer.labels_.max() + 1

        chosen_labels = [
            np.argmax(mstats.gmean(x, axis=0)) for x in clusterer.exemplars_
        ]
        kernel_map = [data.normalized.columns[i] for i in chosen_labels]
        err = utils.geom_mean(
            utils.get_perfect_errors_for(kernel_map, test_data))
        #print("hdbscan {} classes for {}, {}. err {}".format(n_classes, c, s, err))
        #print('\n'.join(kernel_map))
        if n_classes in ranges.keys():
            ranges[n_classes] += [(c, s, err)]
        else:
            ranges[n_classes] = [(c, s, err)]

    for i in sorted(ranges.keys()):
        print("hdbscan: {} -> {}".format(i, ranges[i]))

    # Scan through map to get best trained config for each number of classes
    configs = {0: (15, 15)}
    for i in range(1, 16):
        if i in ranges:
            m = 0
            for c, s, e in ranges[i]:
                if e > m:
                    configs[i] = (c, s)
                    m = e
        else:
            configs[i] = configs[i - 1]

    hdbscan_cache = configs
    return configs
def get_statistics(exp_nm):
    '''
    Complete global editing statistics for each target site:
    - Base editing frequency among all non-noisy reads
    - Indel frequency among all non-noisy reads
    Can be used to calculate base editing to indel ratio (low biological
    replicability, due to low replicability of indel frequency).
    '''
    stats_df = pd.read_csv(inp_dir + f'{exp_nm}.csv', index_col=0)
    mdf = stats_df

    # Filter readcount
    mdfs = mdf[mdf['Total count_indel'] >= 1000]

    # Filter target sites with no substrate nt in core
    has_c_in_core = lambda row: bool('C' in row['gRNA (20nt)'][3:7])
    mdfs['Has Core C'] = mdfs.apply(has_c_in_core, axis='columns')
    mdfs = mdfs[mdfs['Has Core C']]

    # Filter target sites with very low base editing fq
    mdfs['Base edit fq'] = (mdfs['Edited count']) / mdfs['Total count_indel']
    mdfs = mdfs[mdfs['Base edit fq'] > 0.025]
    # mdfs = mdfs[mdfs['Base edit fq'] > 0.01]

    # No pseudocounts
    mdfs = mdfs[mdfs['Indel count'] > 0]

    # # Pseudocount
    # mdfs['Edited count'] += 1
    # mdfs['Indel count'] += 1

    mdfs['Base edit to indel ratio'] = mdfs['Edited count'] / mdfs['Indel count']
    mdfs['Log10 base edit to indel ratio'] = np.log10(mdfs['Base edit to indel ratio'])

    mdfs.to_csv(out_dir + '%s.csv' % (exp_nm))

    from scipy.stats.mstats import gmean
    data = mdfs['Base edit to indel ratio']
    stats = {
        'Num. target sites': len(mdfs),
        'Geometric mean': gmean(data),
        'Median': np.median(data),
        '25th percentile': np.percentile(data, 25),
        '75th percentile': np.percentile(data, 75),
        'Geometric std': np.exp(np.std(np.log(data))),
    }
    return stats
def multioutput_fscore(y_true, y_pred, beta=1):
    score_list = []
    if isinstance(y_pred, pd.DataFrame) == True:
        y_pred = y_pred.values
    if isinstance(y_true, pd.DataFrame) == True:
        y_true = y_true.values
    for column in range(0, y_true.shape[1]):
        score = fbeta_score(y_true[:, column], y_pred[:, column], beta, average='weighted')
        score_list.append(score)
    f1score_numpy = np.asarray(score_list)
    f1score_numpy = f1score_numpy[f1score_numpy < 1]
    f1score = gmean(f1score_numpy)
    return f1score
def _runBacktest(allocation_map, ticker_data, start_date_int, next_date_int):
    small_ticker_data = {
        ticker: data
        for ticker, data in ticker_data.items() if ticker in allocation_map
    }
    (small_ticker_tuple, small_data_matrix,
     small_expense_array) = data_cleaner.cleanAndConvertData(
         small_ticker_data, 30, next_date_int, first_date=start_date_int)
    allocation_map = defaultdict(int, allocation_map)
    small_allocation_array = np.array(
        [allocation_map[ticker] for ticker in small_ticker_tuple],
        dtype=np.float64)
    performance = gmean(np.matmul(small_data_matrix, small_allocation_array))
    expense = pow(1 - np.matmul(small_allocation_array, small_expense_array),
                  1 / config.TRADING_DAYS_PER_YEAR)
    return performance * expense
def mean_diversity(self, path, alpha=1.0, include_sink=False, method='arithmetic'):
    path = CheckPath(path)
    diversities = self.individual_diversities(path, alpha=alpha, include_sink=include_sink)

    # Computing the mean
    if method == 'arithmetic':
        return diversities.mean()
    elif method == 'geo':
        return gmean(diversities)
    elif method == 'wpm':
        raise ValueError('Weighted Power Mean Method not implemented yet.')
def search_combinations(grid, qos=1.2):
    benchmarks = [x for x in grid if x.endswith('1') or x.endswith('2')]
    sens = dict((bench, gmean(grid[bench].values())) for bench in benchmarks)
    sens = map(lambda x: x[0], sorted(sens.items(), key=lambda x: x[1]))
    gridT = transpose_grid(grid)
    (cos, _, _) = classes(grid, [qos])

    best = ('', 0, [])
    for b1 in sens[:len(sens) - 1]:
        for b2 in sens[sens.index(b1) + 1:]:
            cont = contentiousness(b1, b2, grid)
            accuracy = validate(cont, cos)
            if accuracy > best[1]:
                best = (b1 + '_' + b2, accuracy, cont)
    return best[2]
def extract_preprocessed(self, X, y):
    stds = X.std(axis=0)
    stds = stds[stds > 0]
    std_ratio = gmean(stds)
    corr_mean = X.corr(method='pearson').abs().values.mean()
    skew_mean = X.skew(axis=0).mean()
    kurt_mean = X.kurtosis(axis=0).mean()
    self.meta_data.update({
        'STDRatio': std_ratio,
        'CorrelationMean': corr_mean,
        'SkewnessMean': skew_mean,
        'KurtosisMean': kurt_mean
    })
    self._extract_landmarks(X, y)
def get_weight_attr(cov_u, cov_v, reads_weight, db_weight):
    cov_diff = 1.0 / (abs(cov_u - cov_v) + sys.float_info.epsilon)
    weight_attr = {
        'cov_diff': cov_diff,
        'reads_and_db': reads_weight + db_weight,
        'geometric_mean': gmean([cov_diff, reads_weight, db_weight]),
        'harmonic_mean': hmean([
            cov_diff,
            reads_weight + sys.float_info.epsilon,
            db_weight + sys.float_info.epsilon
        ])
    }
    return weight_attr
def main(argv):
    parsed = parse_args(argv)

    # get input networks
    fns = parsed.FILE
    networks = []
    for i in range(len(fns)-1):
        x = numpy.abs(numpy.loadtxt(fns[i]))
        x += numpy.min(x[numpy.nonzero(x)])
        networks.append(x)
    combined = gmean(networks)

    # write combined network
    fn_output = parsed.FILE[len(parsed.FILE)-1]
    # numpy.savetxt(fn_output, combined, fmt="%.10f", delimiter="\t", newline="\n")
    write_adjmtr(fn_output, combined)
def calculate_sfm(frame):
    """Calculates the Spectral Flatness Measure of a signal.

    The SFM is defined as the ratio of the geometric mean to the arithmetic mean.

    :param frame: frame of a discrete signal
    :return: the SFM of the frame
    """
    a = np.mean(frame)
    g = gmean(frame)
    if a == 0 or g/a <= 0:
        sfm = 0
    else:
        sfm = 10*np.log10(g/a)
    return sfm
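# A quick illustrative check of the SFM behaviour described in the docstring above
# (my own example, not part of the original module): a flat magnitude spectrum has a
# geometric mean equal to its arithmetic mean (SFM of 0 dB), while a peaky spectrum
# drives the geometric mean down and the SFM strongly negative.
import numpy as np
from scipy.stats.mstats import gmean

flat = np.ones(64)          # white-noise-like frame
peaky = np.full(64, 1e-3)
peaky[5] = 1.0              # tone-like frame
for frame in (flat, peaky):
    print(10 * np.log10(gmean(frame) / np.mean(frame)))  # 0 dB vs roughly -12 dB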
def attributes(location, kind):
    global fail
    global fallas
    img = cv2.imread(location, 0)

    # Preprocessing
    # If image is monochromatic
    hist = cv2.calcHist([img], [0], None, [256], [0, 256])
    # Else
    # Gray scale
    trace = hist.reshape(256)
    #trace[trace!=-10000]+=1
    #gTrace=trace[trace>0]

    # Getting attributes
    attributes = np.zeros(10, dtype='<U256')  #.astype(object)
    # Kurtosis
    attributes[0] = str(sts.kurtosis(trace))
    # Skewness
    attributes[1] = str(sts.skew(trace))
    # Std
    attributes[2] = str(np.std(trace))
    # Range
    attributes[3] = str(np.ptp(trace))
    # Median
    attributes[4] = str(np.median(trace))
    # Geometric_Mean
    attributes[5] = str(gmean(trace))
    # Hjorth
    a, mor, comp = hjorth_params(trace)
    # Mobility
    attributes[6] = str(mor)
    # Complexity
    attributes[7] = str(comp)
    attributes[8] = str(kind)
    attributes[9] = str(location)
    #print(attributes)

    if (str(comp) == 'nan' or str(mor) == 'nan' or str(attributes[5]) == "nan"):
        a = np.array((location, str(attributes[5]), mor, comp))
        fallas = np.vstack((fallas, a))
        fail += 1

    return attributes
def calc_detectable_difference(data):
    '''Second part of performance indicator calculations adapted from
    Ruijter et al. (Methods, 2013)'''
    conc = np.repeat([15 * np.geomspace(1, 10000, num=5)], 3)
    conc_log = np.log10(conc)
    data_log = np.log10(data)

    steyx_arr = []
    for i in range(len(data_log[0, :])):
        steyx_i = calc_steyx(conc_log, data_log[:, i])
        steyx_arr.append(steyx_i)
    steyx = np.asarray(steyx_arr)

    intv = generate_intv()
    mean_arr = []
    for i in intv:
        mean_i = np.mean(data_log[i[0]:i[1]], axis=0)
        mean_arr.append(mean_i)
    mean_intv = np.asarray(mean_arr)

    mean_x = np.mean(conc_log)
    ss_x = np.var(conc_log, ddof=1) * 13
    conc_log_mean = conc_log[[0, 3, 6, 9, 12]]
    sq_part = np.sqrt(1. / 3. + (((conc_log_mean - mean_x)**2) / ss_x))
    se_yfit = np.outer(sq_part, steyx)
    t_intv = t.ppf(1 - 0.0125, 2)
    ci_y_upper = mean_intv + t_intv * se_yfit
    ci_y_lower = mean_intv - t_intv * se_yfit
    ci_y_upper_no_log = 10**ci_y_upper
    ci_y_lower_no_log = 10**ci_y_lower
    mean_intv_no_log = 10**mean_intv
    fold_up = ci_y_upper_no_log / mean_intv_no_log
    fold_down = mean_intv_no_log / ci_y_lower_no_log
    detectable_difference = gmean(fold_up, axis=0)
    return detectable_difference
def __init__(self, dataset, n_classes):
    # HDBScan is a better clustering algorithm that may give a better set
    # of representatives.
    c, s = compute_hdbscan(dataset)[n_classes]
    clusterer = hdbscan.HDBSCAN(metric='l2', min_cluster_size=c, min_samples=s)
    clusterer.fit(dataset.normalized)

    # For each cluster, choose a representative that gives the best overall
    # performance for the class exemplars.
    chosen_labels = [
        np.argmax(mstats.gmean(x, axis=0)) for x in clusterer.exemplars_
    ]
    kernel_map = [dataset.normalized.columns[i] for i in chosen_labels]

    self.classes = kernel_map
    self.name = "{}{}".format(self.cls_name, n_classes)
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    slr = make_pipeline(MinMaxScaler(), LogisticRegression())
    plr = make_pipeline(PCA(), LogisticRegression())
    nb_bag = BaggingClassifier(base_estimator=GaussianNB())
    clfs = (
        GaussianNB(),
        #GridSearchCV(slr, dict(logisticregression__C=[1.0, 0.8])),
        make_pipeline(PCA(), GaussianNB()),
        GridSearchCV(plr, dict(pca__n_components=[None, 3, 8],
                               logisticregression__C=[1.0, 0.7]), scoring='roc_auc'),
        GridSearchCV(nb_bag, dict(max_samples=[0.2, 0.4, 0.6],
                                  max_features=[0.3, 0.7]), scoring='roc_auc'),
        xgb.XGBClassifier(n_estimators=20, max_depth=3, colsample_bytree=0.7,
                          subsample=0.6, learning_rate=0.1),
        #make_pipeline(KMeans(), GaussianNB()),
        #GridSearchCV(
        #    BaggingClassifier(),
        #    dict(base_estimator=[None, GaussianNB(), LogisticRegression()],
        #         n_estimators=[7, 10, 14],
        #         max_samples=[0.3, 0.6])),
        #GridSearchCV(xgb.XGBClassifier(), dict(n_estimators=[2, 3, 4], learning_rate=[0.01, 0.1], subsample=[0.5, 0.9])),
        #BaggingClassifier(base_estimator=SVC(), max_features=0.8, max_samples=2500, n_estimators=5),
    )
    preds = []
    for clf in clfs:
        print clf
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)
        print roc_auc_score(y_val, val_pred)
        clf.fit(X_val, y_val)
        train_pred = clf.predict(X_train)
        preds.append(np.concatenate((train_pred, val_pred)))
        print roc_auc_score(y_train, train_pred)
        print

    y_all = np.concatenate((y_train, y_val))
    preds = np.column_stack(preds)
    gm = gmean(preds, axis=1)
    hm = hmean(preds+1, axis=1)
    preds = np.column_stack((preds, gm, hm))
    print 'GM', roc_auc_score(y_all, gm)
    print 'HM', roc_auc_score(y_all, hm)

    meta = GaussianNB()
    meta = GridSearchCV(xgb.XGBClassifier(),
                        dict(max_depth=[2, 3, 4],
                             learning_rate=[0.01, 0.05, 0.1],
                             n_estimators=[20, 40, 60]),
                        scoring='roc_auc')
    meta.fit(preds, y_all)
    scores = cross_val_score(meta, preds, y_all, scoring='roc_auc', cv=5)
    print scores
    print scores.mean()
def dominate(portfolio, mu, cov, cost, prices, risk_tolerance, single_period=False):
    """By default, always multi-period"""
    # start date for the base portfolio to be determined ... always assign to past 6 months (ie rebalance the period)
    start_date = (datetime.now() - relativedelta(months=6)).strftime("%Y-%m-%d")

    # get the number of days in the backtest period ... to determine target returns and variances later
    days = business_days(start_date, datetime.now().strftime("%Y-%m-%d"))

    # call backtest to get the value of the portfolio
    portfolio_value = back_test(portfolio, start_date, end_date=None, dollars=None)[0].sum(axis=1)
    print(">>> portfolio_value: ", portfolio_value)

    # calculate portfolio returns
    portfolio_returns = (portfolio_value / portfolio_value.shift(1) - 1).dropna()
    print(">>> portfolio_returns: ", portfolio_returns)

    # assign the target return and variance
    target_returns = (gmean(portfolio_returns + 1, axis=0) - 1) * days
    target_variance = portfolio_returns.var() * days

    mu_p2 = mu[0] if single_period else mu[1]
    cov_p2 = cov[0] if single_period else cov[1]

    soln, agg_soln = optimize(mu=(mu[0], mu_p2),
                              sigma=(cov[0], cov_p2),
                              alpha=(0.05, 0.10),
                              return_target=(target_returns, target_returns),
                              costs=cost,
                              prices=prices,
                              gamma=risk_tolerance[2])
    return soln, agg_soln
def _all_shared_index(self):
    try:
        # Getting data of weighted volume average of less than 5 mins
        wvl = []
        query = ("SELECT DISTINCT stock FROM RECORDS")
        self.dbRecord.cur.execute(query)
        result = self.dbRecord.cur.fetchall()
        print(result)
        for stock_record in range(len(result)):
            print(result[stock_record][0])
            w = self._get_volume_weighted_average(result[stock_record][0])
            wvl.append(w)
        res = gmean(wvl)
        return res
    except Exception as e:
        print("Database connectivity error" + str(e))
        return None
def get_data_online():
    """Get equal-length dividend yield and interest rate time series for the longest period possible.
    Dividend yield comes from the S&P 500; the interest rate from Kenneth French's database."""
    dy_monthly_data = quandl.get("MULTPL/SP500_DIV_YIELD_MONTH")["Value"]
    # Resampling yearly dividend yield and cutting off 2021
    dy_data = dy_monthly_data.resample("Y").apply(lambda x: gmean(x))[:-1]
    datareader = pandas_datareader.famafrench.FamaFrenchReader(
        "F-F_Research_Data_Factors", freq="Y", start=1926)
    int_data = datareader.read()[1]["RF"]  # 0 for monthly data; 1 for yearly data
    min_len = min(len(dy_data), len(int_data))
    return (int_data[-min_len:].values / 100), dy_data[-min_len:].values / 100
def set_count(self):
    counts = np.array([gene.count for gene in self.genes.values()])
    stoichs = np.array(
        [gene.stoichiometry for gene in self.genes.values()])
    reactions = np.array(
        [gene.nz_reactions for gene in self.genes.values()])

    # if there is any non-zero protein that is used only in this reaction, filter other zeros
    exclusive_non_zero = False
    if len(counts) > 1 and 0. in counts:
        for (cnt, reac) in zip(counts, reactions):
            if reac == 1 and cnt != 0:
                exclusive_non_zero = True

    counts = counts / stoichs
    #counts = counts / reactions

    if exclusive_non_zero:
        counts = filter(lambda x: x != 0, counts)

    self.count = gmean(counts)
def my_rescale_sin(value_at_each_time, L=0.02, R=0.98, h=2.5, l=0.2 / 2, silent=True):
    if any(value_at_each_time != 0):
        # I compute the geometric mean from our estimator.
        G = gmean(value_at_each_time)
    else:
        # G == 0, it happens if no norm computed.
        # Then it has to return 0.01 such that it widens all the kernels.
        return np.full(len(value_at_each_time), 0.01)

    L_quant = np.quantile(value_at_each_time, L)
    R_quant = np.quantile(value_at_each_time, R)

    if not L_quant < G < R_quant:
        raise Error_not_allowed_input(
            "L < G < R for the well definiteness of the function.")

    if not silent:
        print("Left boundary : ", L_quant)
    if not silent:
        print("Right boundary : ", R_quant)

    xx = value_at_each_time - G
    ans = 0
    scaling1 = math.pi / (G - L_quant)
    scaling2 = math.pi / (R_quant - G)

    # I fix the part outside of my interest to be the final value, h.
    # This part corresponds to math.pi.
    # I also need the scaling by +h/2 given by math.pi.
    # xx2 and xx3 are the cosines, but they are different cosines.
    # So I fix them where I don't want them to move at 0 and then I can add the two functions.
    my_xx2 = np.where((xx * scaling1 > -math.pi) & (xx * scaling1 < 0),
                      xx * scaling1, math.pi)  # left
    my_xx3 = np.where((xx * scaling2 > 0) & (xx * scaling2 < math.pi),
                      xx * scaling2, math.pi)  # right

    ans += -(h - l) / 2 * np.cos(my_xx2)
    ans += -(h - l) / 2 * np.cos(my_xx3)
    ans += l
    # avoid infinite width kernel, with a minimal value.
    return ans
def evaluate(self):
    prob = np.array([])
    for inputs, targets in self.data_loader:
        predicted = self.model.predict(inputs)
        if type(predicted) is list:
            # if predicted == [[label, label, ...], [...], ...]
            # targets = [[label, label, ...], [...], ...]
            predicted = np.concatenate(predicted)
            targets = np.concatenate(targets)
        predicted = predicted[[range(len(targets)), targets]]
        if type(predicted) is not np.ndarray:
            predicted = predicted.cpu().numpy()
        prob = np.concatenate((prob, predicted))
    perplexity = gmean(1 / prob)
    return perplexity
def __init__(self, n_model, norm=False):
    # retrieve sed and wavelength data from model
    self.n_model = n_model
    self.filepath = model_path + 'model' + str(n_model) + '/data_th/sed_rt.fits'
    hdulist = fits.open(self.filepath)
    sedlist = hdulist[0].data[0][0]
    self.wavelength = hdulist[1].data

    # normalize according to highest inclination
    if norm == True:
        mean_range = np.where((2 < self.wavelength) & (self.wavelength < 10))
        gmean = stats.gmean(sedlist[-1][mean_range])
        self.seds = [sed / gmean for sed in sedlist]
    # for non-normalized case
    else:
        self.seds = sedlist

    # store the model image
    impath = self.filepath[:-19] + 'data_0.6/RT.fits'
    self.images = fits.open(impath)[0].data[0][0]

    # store convolved+rebinned images
    # convolve images with rebinned PSF (20mas/pixel)
    convolved_prebinned_images = [
        convolve_fft(i, PSF_20mas) for i in self.images
    ]
    # rebin to 40mas/pixel via averaging
    binpix = int((self.images[0].shape[0] - 1) / 2)
    self.convolved_images = [
        rebin(i[:-1, :-1], (binpix, binpix)) for i in convolved_prebinned_images
    ]

    # generate model inclinations - is it in the files?
    i_0, i_f = np.radians(45), np.radians(90)
    cosi = np.linspace(np.cos(i_0), np.cos(i_f), 15)
    self.inclinations = np.degrees(np.arccos(cosi))

    # store parameters
    self.parameters = get_model_parameters(n_model)
def get_colors(image: np.ndarray, n_colors: int, write_pca: bool = False):
    pca = PCA(n_components=3)
    print(image.shape)
    X = np.array(image).reshape(-1, 3)
    pca.fit(X)
    samples = np.random.randint(-1000, 2, size=X.shape[0])
    index = np.where(samples > 0, np.ones(shape=X.shape[0]),
                     np.zeros(shape=X.shape[0])).astype(np.int).nonzero()
    Y = pca.transform(X[index]).astype(np.int)
    X_pca_0 = Y[:, 0]
    X_pca_1 = Y[:, 2]
    # good_colors = np.where(X_pca_0 > 150)[0]
    more_good_colors = np.apply_along_axis(f2, 1, X[index]).nonzero()

    # plot samples in eigenspace
    fig = plt.figure(figsize=(12.8, 9.6))
    ax = fig.add_subplot(111)
    ax.scatter(X_pca_0, X_pca_1, c=X[index] / 255)
    if write_pca:
        ax.figure.savefig('./kmeans-pca_output.png', format='png')
    # plt.show()

    cluster = KMeans(n_colors)
    cluster.fit(X[index][more_good_colors])
    clustered_colors = cluster.predict(X[index][more_good_colors])
    print(clustered_colors.shape)

    color_map = dict()
    for label, color in zip(clustered_colors, X[index][more_good_colors]):
        try:
            color_map[label].append(color)
        except KeyError:
            color_map[label] = [color]

    print('cluster geometric means')
    for label, members in color_map.items():
        value = gmean(np.array(members))
        print(label, len(members), value)
        color_map[label] = value

    return color_map.values()
def analyze(predictions, test):
    frame = test["dataframe"]
    oracle = np.array(frame["oracle_enc"], dtype=np.bool)
    incorrect = np.logical_xor(predictions, oracle)
    correct = np.logical_not(incorrect)
    zero_r = Counter(oracle).most_common(1)[0][0]
    zero_r_key = enc2key(zero_r)
    speedups = np.array([
        min(d["runtime_cpu"], d["runtime_gpu"]) / d[enc2key(p)]
        for p, d in zip(predictions, frame.T.to_dict().values())
    ])
    speedup_avg = speedups.mean()
    speedup_geo = gmean(speedups)
    accuracy = sum(correct) / len(test["dataframe"])

    confusion_matrix = np.zeros((2, 2), dtype="int32")
    confusion_matrix[0][0] = sum(
        np.logical_and(np.logical_not(predictions), np.logical_not(oracle)))
    confusion_matrix[0][1] = sum(
        np.logical_and(predictions, np.logical_not(oracle)))
    confusion_matrix[1][0] = sum(
        np.logical_and(np.logical_not(predictions), oracle))
    confusion_matrix[1][1] = sum(np.logical_and(predictions, oracle))

    assert (confusion_matrix.sum() == len(test["dataframe"]))
    assert (confusion_matrix[0][1] + confusion_matrix[1][1] == sum(predictions))
    assert (confusion_matrix[0][1] + confusion_matrix[1][0] == sum(incorrect))
    assert (confusion_matrix[0][0] + confusion_matrix[1][1] == sum(correct))

    print(confusion_matrix)

    return {
        "accuracy": accuracy,
        "correct": correct,
        "confusion_matrix": confusion_matrix,
        "speedups": speedups,
        "speedup_min": min(speedups),
        "speedup_max": max(speedups),
        "speedup_avg": speedup_avg,
        "speedup_geo": speedup_geo,
    }
def main():
    sol = dict()
    for method in ['dopri5', 'adams']:
        for tol in [1e-3, 1e-6, 1e-9]:
            print('======= {} | tol={:e} ======='.format(method, tol))
            nfes = []
            times = []
            errs = []
            for c in ['A', 'B', 'C', 'D', 'E']:
                for i in ['1', '2', '3', '4', '5']:
                    diffeq, init, _ = getattr(detest, c + i)()
                    t0, y0 = init()
                    diffeq = NFEDiffEq(diffeq)

                    if not c + i in sol:
                        sol[c + i] = odeint(
                            diffeq, y0, torch.stack([t0, torch.tensor(20.)]),
                            atol=1e-12, rtol=1e-12, method='dopri5')[1]
                        diffeq.nfe = 0

                    start_time = time.time()
                    est = odeint(
                        diffeq, y0, torch.stack([t0, torch.tensor(20.)]),
                        atol=tol, rtol=tol, method=method)
                    time_spent = time.time() - start_time

                    error = torch.sqrt(torch.mean((sol[c + i] - est[1])**2))

                    errs.append(error.item())
                    nfes.append(diffeq.nfe)
                    times.append(time_spent)

                    print('{}: NFE {} | Time {} | Err {:e}'.format(
                        c + i, diffeq.nfe, time_spent, error.item()))

            print('Total NFE {} | Total Time {} | GeomAvg Error {:e}'.format(
                np.sum(nfes), np.sum(times), gmean(errs)))
def carga_pi(data):
    '''
    Takes a dataset for a specific product category of a region;
    cleans the data and determines the different varieties within the
    product category; loads the average prices of these products,
    computes their variation relative to the previous month, and
    calculates the geometric mean of the product variations.
    Returns a table with the "pit" values.
    '''
    x = data.split('_')
    x = x[1].split('/')
    producto = x[-1]
    df = limpia_data(data)
    variedades, n = encuentra_variedades(df)
    tabla_precios = precios_promedio(df, variedades)
    tabla_var = variaciones_precios_promedio(tabla_precios, n)
    tabla_var[producto] = gmean(tabla_var, axis=1)
    tabla_var = tabla_var[1:]
    return tabla_var
def saatiMethod():
    relationshipMatrix = []
    firstRow = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    relationshipMatrix.append(firstRow)
    for i in range(2, 10):
        nextRow = []
        for j in range(9):
            value = firstRow[j] / i
            nextRow.append(value)
        relationshipMatrix.append(nextRow)

    gmeanList = []
    for row in relationshipMatrix:
        gmeanList.append(gmean(row))

    priorityVectors = []
    for g in gmeanList:
        pVector = g / sum(gmeanList)
        priorityVectors.append(pVector)
    return priorityVectors
def normalize2(data, mask):
    (nC, nB) = mask.shape
    handled = full(shape=(nB), fill_value=False, dtype=np.bool)
    print("Starting normalization: \n\n")
    for c in range(0, nC):
        if all(handled):
            break
        factor = stats.gmean(data[c, handled]) if any(handled) else 1
        print(" Handled: ", handled)
        handle = mask[c, :] & ~handled
        print("Handling: ", handle)
        if any(handle):
            data[:, handle] *= kron(factor / data[c, handle], ones((nC, 1)))
            handled = handled | handle
    return data, mask
    print(data)  # unreachable after the return above
def discardseasons(df, seasons, gdthres=2.0, smin=5):
    """
    Calculate peak variability in order to keep only seasons with relatively
    low variability gdthres. Always maintain at least smin seasons.

    :param df: data frame with seasons by columns
    :param seasons: list of column names corresponding to each season
    :param gdthres: maximum geometric deviation from median
    :param smin: minimum number of seasons maintained
    :return drop_seasons: list with seasons to be dropped
    """
    drop_seasons = []
    seasons = seasons.copy()

    # Drop null seasons
    series = df[seasons].max()
    drop_seasons = list(series[series == 0].index)
    series.drop(drop_seasons, axis=0, inplace=True)

    # If resulting data contains less than smin seasons, return
    nseasons = len(series)
    nmax = nseasons - smin
    if nmax <= 0:
        return drop_seasons

    ####### Test removing one by one ######
    # Take log of geometric deviation threshold for simplicity
    gdthres = np.log(gdthres)
    for n in range(nmax):
        # Current maxima
        tmp_series = df[list(set(seasons).difference(drop_seasons))].max()
        # Grab current geometric mean
        series_gmean = np.log(gmean(tmp_series))
        # Calculate maximum geometric deviation from geometric mean
        mgd = abs(np.log(tmp_series) - series_gmean).max()
        if mgd > gdthres:
            idx = abs(np.log(tmp_series) - series_gmean).idxmax()
            drop_seasons.append(idx)

    return drop_seasons
def getGeometricMAP(self, depth=1000, trec_eval=True):
    """
    The Geometric Mean Average Precision measures the same per-topic values as MAP
    (mean average precision), but the geometric mean is used over the results of each
    topic. Note that, as done in the original trec_eval, the Geometric MAP is only
    reported in the summary over all topics, not for individual topics.

    Params
    -------
    depth: the evaluation depth. Default = 1000
    trec_eval: set to True if the result should be the same as trec_eval, e.g., sort
        documents by score first. Default = True.

    Returns
    --------
    The Geometric Mean Average Precision over all topics. Topics with MAP = 0 are
    replaced by MAP = GMEAN_MIN (default = .00001)
    """
    from scipy.stats.mstats import gmean
    maps = self.getMAP(depth=depth, trec_eval=trec_eval, per_query=True)
    maps = maps.replace(0.0, self.GMEAN_MIN)
    return gmean(maps)[0]
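# A rough, self-contained illustration of why GMAP uses the geometric mean (my own
# numbers, not from the original evaluation code): it rewards consistency across
# topics, so a single near-failure topic pulls GMAP down much more than it pulls
# down the arithmetic MAP.
import numpy as np
from scipy.stats.mstats import gmean

per_topic_ap = np.array([0.40, 0.38, 0.42, 0.02])  # one near-failure topic
print(np.mean(per_topic_ap))   # arithmetic MAP ~ 0.305
print(gmean(per_topic_ap))     # GMAP ~ 0.19, penalizing the weak topic more heavily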