def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    # report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() + np_values.min()) / 2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Midhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
    ]
    return output
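# The snippet above relies on helpers (present, tupleToString, trimean, midhinge) and on
# scipy.stats imports (bayes_mvs, sem, normaltest) defined elsewhere. A minimal sketch of
# what the two formatting helpers might look like, purely as an assumption about their intent:
def present(name, value):
    # pair a metric label with its value for tabular reporting (hypothetical)
    return (name, value)


def tupleToString(interval):
    # render a (lower, upper) confidence-interval tuple as a string (hypothetical)
    return "(%s, %s)" % (interval[0], interval[1])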
def _compute_params(self, features):
    m = features.shape[1]
    percentiles = np.ones((2, m)) * np.nan
    for j in xrange(m):
        percentiles[0, j] = scoreatpercentile(features[:, j], self.lower_q)
        percentiles[1, j] = scoreatpercentile(features[:, j], self.upper_q)
    return percentiles
def _compute_percentiles(features):
    m = features.shape[1]
    percentiles = np.ones((2, m)) * np.nan
    for j in xrange(m):
        percentiles[0, j] = scoreatpercentile(features[:, j], 1)
        percentiles[1, j] = scoreatpercentile(features[:, j], 99)
    return percentiles
def showStatistics(key=None, domain=True, dist=False, cumdist=False,
                   clip=None, vmin=None, vmax=None, percentile=False):
    """Show the values corresponding with key in the specified mode.

    key is one of the keys of SelectableStatsValues.
    The display mode is 'On Domain' (domain=True), 'Histogram' (dist=True)
    or 'Cumulative Histogram' (cumdist=True).
    """
    S = selection.check(single=True)
    if S:
        func, onEdges = SelectableStatsValues[key]
        kargs = {}
        if key == "Curvature":
            kargs["neighbours"] = _stat_dia.results["neighbours"]
        val = func(S, **kargs)
        if key == "Curvature":
            ind = CurvatureValues.index(_stat_dia.results["curval"])
            val = val[ind]
            val = val[S.elems]

        # !! THIS SHOULD BE IMPLEMENTED AS A GENERAL VALUE CLIPPER
        # !! e.g. popping up when clicking the legend
        # !! and the values should be changeable
        if clip:
            clip = clip.lower()
            if percentile:
                try:
                    from scipy.stats.stats import scoreatpercentile
                except ImportError:
                    warning(
                        "The percentile clipping option is not available. "
                        "Most likely because 'python-scipy' is not installed "
                        "on your system.")
                    return

                Q1 = scoreatpercentile(val, vmin)
                Q3 = scoreatpercentile(val, vmax)
                factor = 3
                if vmin:
                    vmin = Q1 - factor * (Q3 - Q1)
                if vmax:
                    vmax = Q3 + factor * (Q3 - Q1)

            if clip == "top":
                val = val.clip(max=vmax)
            elif clip == "bottom":
                val = val.clip(min=vmin)
            else:
                val = val.clip(vmin, vmax)

        if domain:
            clear()
            lights(False)
            showSurfaceValue(S, key, val, onEdges)
        if dist:
            showHistogram(key, val, cumulative=False)
        if cumdist:
            showHistogram(key, val, cumulative=True)
def hit_detection_proliferation(res, who, who_mitotic_hits, ctrl_points, whis=1.5):
    q3 = scoreatpercentile(ctrl_points, per=75)
    q1 = scoreatpercentile(ctrl_points, per=25)
    val = q1 - whis * (q3 - q1)
    hits = np.where(res <= val)[0]
    return filter(lambda x: x not in who_mitotic_hits, np.array(who)[hits])
def normalize(data):
    wtdata = array(data)
    wtdata[wtdata < 0] = 0
    q1 = stats.scoreatpercentile(wtdata, 25)
    q3 = stats.scoreatpercentile(wtdata, 75)
    interquart = q3 - q1
    tenperc = stats.scoreatpercentile(wtdata[wtdata <= q1 + 1.5 * interquart], 90)
    maxav = wtdata[wtdata >= tenperc].mean()
    wtdata = wtdata / maxav
    return wtdata
def hit_detection_pheno_score(res, who, ctrl_points, whis=1.5):
    '''
    Permissive hit detection: keep experiments whose Interphase phenotypic score falls
    below the bottom whisker of the controls, i.e. below Q1 - whis * IQR (whis defaults to 1.5).
    '''
    q3 = scoreatpercentile(ctrl_points[:, CLASSES.index('Interphase')], per=75)
    q1 = scoreatpercentile(ctrl_points[:, CLASSES.index('Interphase')], per=25)
    val = q1 - whis * (q3 - q1)
    hits = np.where(res[:, CLASSES.index('Interphase')] <= val)[0]
    return res[hits], np.array(who)[hits]
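# Worked example of the bottom-whisker threshold used above (toy control values, assuming
# scoreatpercentile is imported from scipy.stats): with Q1 = 2.0 and Q3 = 4.0, the cutoff
# is Q1 - 1.5 * (Q3 - Q1) = -1.0, so experiments scoring at or below -1.0 count as hits.
import numpy as np
from scipy.stats import scoreatpercentile

ctrl = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
q3 = scoreatpercentile(ctrl, per=75)   # 4.0
q1 = scoreatpercentile(ctrl, per=25)   # 2.0
cutoff = q1 - 1.5 * (q3 - q1)          # -1.0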
def normalize(bonuses):
    l = len(bonuses)
    wtdata = array(bonuses)
    if wtdata.min() < 0:
        wtdata -= wtdata.min()
    interquart = stats.scoreatpercentile(wtdata, 75) - stats.scoreatpercentile(wtdata, 25)
    tenperc = stats.scoreatpercentile(wtdata, 90)
    maxcount = 0
    maxav = 0.
    for i in range(l):
        if wtdata[i] >= tenperc:
            maxav += wtdata[i]
            maxcount += 1
    maxav /= maxcount
    wtdata = wtdata / maxav
    return wtdata
def graphWorking(trajdist, mds_transformed, genes=None, percentile=10,
                 only_high_degree=True, clustering_labels=None):
    couleurs = ['green', 'red', 'blue', 'orange', 'yellow', 'white', 'purple', 'pink', 'grey']

    newtrajdist = np.array(trajdist)
    newtrajdist[np.where(newtrajdist > scoreatpercentile(newtrajdist.flatten(), percentile))] = 0

    G = nx.from_numpy_matrix(newtrajdist)
    labels = {}
    # The edges now carry the desired weights, but plotting requires node positions;
    # use the MDS results for that.
    for el in G.nodes():
        G.node[el]['pos'] = mds_transformed[el][:2]
        if genes is not None:
            labels[el] = genes[el]
    pos = nx.get_node_attributes(G, 'pos')

    p.figure(figsize=(8, 8))
    if not only_high_degree:
        nx.draw_networkx_edges(G, pos, alpha=0.4)
        node_color = [float(G.degree(v)) for v in G] if clustering_labels is None \
            else [couleurs[k] for k in clustering_labels]
        nx.draw_networkx_nodes(G, pos, node_size=60, node_color=node_color, cmap=p.cm.Reds_r)
    else:
        degrees = np.array([float(G.degree(v)) for v in G])
        wh_ = np.where(degrees >= scoreatpercentile(degrees, 90))[0]
        pos = {x: pos[x] for x in pos if x in wh_}
        labels = {x: labels[x] for x in pos}
        for k in range(len(genes)):
            if k not in pos:
                G.remove_node(k)
        node_color = [float(G.degree(v)) for v in G]
        nx.draw_networkx_edges(G, pos, alpha=0.4)
        nx.draw_networkx_nodes(G, pos, node_size=60, node_color=node_color, cmap=p.cm.Reds_r)

    nx.draw_networkx_labels(G, pos, labels, font_size=10)
    p.axis('off')
    p.show()
    return G
def compute_fft_stats(self):
    if config.args.conflicts == config.ADD_FILES and \
            (os.path.exists(self.fivepseq_out.get_file_path(FivePSeqOut.FFT_SIGNALS_TERM))
             & os.path.exists(self.fivepseq_out.get_file_path(FivePSeqOut.FFT_SIGNALS_START))
             & os.path.exists(self.fivepseq_out.get_file_path(FivePSeqOut.FFT_STATS_DF_FILE))):
        self.logger.info("Skipping FFT statistics calculation: files already exist")
    else:
        self.logger.info("Computing FFT statistics")
        count_vector_list = self.fivepseq_counts.get_count_vector_list(FivePSeqCounts.FULL_LENGTH)
        span_size = self.fivepseq_counts.annotation.span_size

        lengths = [0] * len(count_vector_list)
        for i in range(len(count_vector_list)):
            count_vector = count_vector_list[i][span_size:len(count_vector_list[i]) - span_size]
            lengths[i] = len(count_vector)

        # align start
        size = int(stats.scoreatpercentile(lengths, per=25))
        num = 3 * len(count_vector_list) // 4
        start_array = np.zeros((num, size))
        term_array = np.zeros((num, size))
        ind = 0
        for i in range(len(count_vector_list)):
            count_vector = count_vector_list[i][span_size:len(count_vector_list[i]) - span_size]
            if (len(count_vector)) > size:
                start_array[ind, :] = count_vector[0:size]
                term_array[ind, :] = count_vector[len(count_vector) - size:len(count_vector)]
                ind += 1

        start_mean = start_array.mean(axis=0)
        term_mean = term_array.mean(axis=0)
        self.fft_stats_start = self.fft_stats_on_vector(start_mean, 5)
        self.fft_stats_term = self.fft_stats_on_vector(term_mean, 5)

        self.fft_stats_df = pd.DataFrame(
            data=[self.fft_stats_start[1], self.fft_stats_start[2], self.fft_stats_start[3],
                  self.fft_stats_term[1], self.fft_stats_term[2], self.fft_stats_term[3]],
            index=["START_periods", "START_signals", "START_scales",
                   "TERM_periods", "TERM_signals", "TERM_scales"])

        self.logger.info("Writing FFT stats")
        self.fivepseq_out.write_df_to_file(self.fft_stats_df, FivePSeqOut.FFT_STATS_DF_FILE)
        self.fivepseq_out.write_series_to_file(self.fft_stats_start[0], FivePSeqOut.FFT_SIGNALS_START)
        self.fivepseq_out.write_series_to_file(self.fft_stats_term[0], FivePSeqOut.FFT_SIGNALS_TERM)
def createFeldmanRanking(protectedCandidates, nonProtectedCandidates, k):
    """
    creates a ranking that promotes the protected candidates by adjusting the distribution of
    the qualifications of the protected and non-protected group

    IMPORTANT: THIS METHOD MODIFIES THE ORIGINAL LIST OF PROTECTED CANDIDATES!
    I.e. it modifies the qualification of the protected candidates. If the original list has
    to be preserved, it has to be deep-copied into a new data structure before being handed
    over to this method.

    steps:
        1. take a protected candidate x
        2. determine the percentile of that candidate within their group percentile(x)
        3. find a non-protected candidate y that has the same percentile(y) == percentile(x)
        4. assign the score of y to x
        5. goto 1

    The percentile-to-score remapping at the core of steps 2-4 is sketched right after
    this function.

    Parameters:
    ----------
    :param protectedCandidates: array of protected candidates
    :param nonProtectedCandidates: array of non-protected candidates
    :param k: length of the ranking to return

    Return:
    ------
    a ranking of protected and non-protected candidates, which tries to have a better share
    of protected and non-protected candidates
    """

    # ensure candidates are sorted by descending qualifications
    protectedCandidates.sort(key=lambda candidate: candidate.qualification, reverse=True)
    nonProtectedCandidates.sort(key=lambda candidate: candidate.qualification, reverse=True)

    protectedQualifications = [protectedCandidates[i].qualification
                               for i in range(len(protectedCandidates))]
    nonProtectedQualifications = [nonProtectedCandidates[i].qualification
                                  for i in range(len(nonProtectedCandidates))]

    # create same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to the required length;
            # the rest will not be considered anyway
            break
        # find percentile of protected candidate
        p = percentileofscore(protectedQualifications, candidate.qualification)
        # find score of a non-protected candidate in the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score

    # create a colorblind ranking
    return createFairRanking(k, protectedCandidates, nonProtectedCandidates, ESSENTIALLY_ZERO, 0.1)
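# The percentile-to-score remapping at the core of steps 2-4 above, as a self-contained
# sketch (toy score lists, not the real candidate objects):
from scipy.stats import percentileofscore, scoreatpercentile

protected_scores = [0.2, 0.4, 0.6, 0.8]
non_protected_scores = [0.5, 0.6, 0.7, 0.9]

x = 0.6                                                 # qualification of a protected candidate
p = percentileofscore(protected_scores, x)              # percentile of x within its own group
remapped = scoreatpercentile(non_protected_scores, p)   # score at the same percentile in the other group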
def get_goat_indicies(self):
    """Return the indices of goat users.

    Separation is done at the 30th (or 70th) percentile.
    Source: Exploiting the "Doddington Zoo" Effect in Biometric Fusion
    """
    sorted_genuine = self._genuine_scores.copy()
    sorted_genuine.sort()

    if self._type == 'distance':
        score = scoreatpercentile(sorted_genuine, 70)
        indicies = np.where(self._genuine_scores > score)
    elif self._type == 'score':
        score = scoreatpercentile(sorted_genuine, 30)
        indicies = np.where(self._genuine_scores < score)

    return indicies[0]
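# Illustrative only (toy numbers, not from the original source): for similarity scores
# ('score' type), goats are the users whose genuine scores fall below the 30th percentile
# of all genuine scores.
import numpy as np
from scipy.stats import scoreatpercentile

genuine = np.array([0.95, 0.90, 0.40, 0.85, 0.30])
cutoff = scoreatpercentile(np.sort(genuine), 30)   # 0.49 here
goat_indices = np.where(genuine < cutoff)[0]       # array([2, 4])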
def _create_cache_percentiles(self, predicate, resume=False):
    controls = self._get_controls(predicate)
    for i, (plate, imKeys) in enumerate(make_progress_bar('Percentiles')(controls.items())):
        filename = self._percentiles_filename(plate)
        if i == 0:
            _check_directory(os.path.dirname(filename), resume)
        if resume and os.path.exists(filename):
            continue
        features = self.cache.load(imKeys)[0]
        if len(features) == 0:
            logger.warning('No DMSO features for plate %s' % str(plate))
            percentiles = np.zeros((0, len(self.cache.colnames)))
        else:
            m = features.shape[1]
            percentiles = np.ones((2, m)) * np.nan
            for j in xrange(m):
                percentiles[0, j] = scoreatpercentile(features[:, j], 1)
                percentiles[1, j] = scoreatpercentile(features[:, j], 99)
        np.save(filename, percentiles)
def get_lamb_indicies(self):
    """Return the indices of lamb users.

    BUG: seems to not work
    """
    lambs_scores = []
    for userid in self._users_id:
        presentations = self.get_impostor_presentations_of_user(userid)

        best_matchings = []
        # loop over the other users to get the best matching
        gallery_ids = presentations.get_raw_gallery_userid(unique=True)
        for gallery_id in gallery_ids:
            assert gallery_id != userid

            # Select only one gallery user
            indicies = presentations._data[:, self.GALLERY] == gallery_id
            impscores = presentations.get_raw_scores()[indicies]

            if self._type == 'score':
                selected_score = np.min(impscores)
            elif self._type == 'distance':
                selected_score = np.max(impscores)
            else:
                raise ValueError("Impossible")

            best_matchings.append(selected_score)
        lambs_scores.append(np.mean(best_matchings))

    lamb_score_sorted = np.sort(lambs_scores)
    if self._type == 'distance':
        score = scoreatpercentile(lamb_score_sorted, 10)
        indicies = np.where(self._genuine_scores < score)
    elif self._type == 'score':
        score = scoreatpercentile(lamb_score_sorted, 90)
        indicies = np.where(self._genuine_scores > score)
    else:
        raise ValueError("Impossible")

    return indicies[0]
def extract(self, point_cloud, neighborhood, target_point_cloud, target_index, volume_description):
    """
    Extract the feature value(s) of the point cloud at the location of the target.

    :param point_cloud: environment (search space) point cloud
    :param neighborhood: array of indices of points within the point_cloud argument
    :param target_point_cloud: point cloud that contains the target point
    :param target_index: index of the target point within the target point cloud
    :param volume_description: cell, sphere, cylinder or voxel size description
    :return: feature value
    """
    # 'point' is assumed to be the point-data key imported in the surrounding module.
    source_data = point_cloud[point][self.data_key]['data'][neighborhood]
    return stats.scoreatpercentile(source_data, self.percentile)
def graphVisualClustering(genesTarget, mds_dist, genes, G):
    result = [[] for k in range(len(genesTarget))]
    arr = np.array([gene in genesTarget for gene in genes])

    degrees = np.array([float(G.degree(v)) for v in G])
    high_degrees = np.where(degrees >= scoreatpercentile(degrees, 90))[0]

    for el in high_degrees:
        gene = genes[el]
        distances = cdist(mds_dist[np.newaxis, el], mds_dist[np.where(arr)])
        print el, gene, np.argmax(distances)
        result[np.argmax(distances)].append(gene)

    return result
def greyscale(infile, outfile):
    f = lasfile.File(infile, None, 'rb')
    h = f.header

    # Projections for converting between the two
    #utmproj = Proj(init="epsg:26910")
    utmproj = Proj(r'+proj=utm +zone=10 +datum=NAD83 +units=us-ft +no_defs')
    latproj = Proj(proj='latlong', datum='WGS84')

    # Conversion from feet to meters
    conv = 1.0 / 0.3048

    tile_width = int(h.max[0] - h.min[0]) + 1
    tile_height = int(h.max[1] - h.min[1]) + 1
    height_offset = h.min[2]

    img = Image.new("RGB", (tile_width, tile_height))
    print "Image size: %d x %d" % (tile_width, tile_height)

    zvals = []
    for point in f:
        zvals.append(point.z)

    # Get a reasonable max height
    max_height = stats.scoreatpercentile(zvals, 99.5)
    depth_scale = 255 / (max_height - h.min[2])
    print "Computed max height (99.5 percentile): %.2f" % (max_height)

    i = 0
    for point in f:
        if point.z > max_height:
            z = max_height
        else:
            z = point.z
        x = int(point.x - h.min[0])
        y = int(point.y - h.min[1])
        intensity = 255 - int(((z - height_offset) * depth_scale))
        img.putpixel((x, y), (intensity, intensity, intensity))
        i += 1

    img.save(outfile)
def plotInternalConsistency(M, tick_labels, cmap=mpl.cm.bwr, second_labels=None):
    norm = mpl.colors.Normalize(0, scoreatpercentile(M.flatten(), 95))
    f = p.figure()
    ax = f.add_subplot(111)
    ax.matshow(M, cmap=cmap, norm=norm)
    p.yticks(range(0, M.shape[0], 15), tick_labels[::15])
    ax.tick_params(labelsize=6)
    # if second_labels is not None:
    #     ax_ = ax.twinx()
    #     ax_.set_yticks(range(M.shape[0]))
    #     ax_.set_yticklabels(second_labels)
    #     ax_.tick_params(labelsize=4)
    #     ax_.set_yscale(ax.get_yscale())
    p.show()
    return
def feldmanRanking(protectedCandidates, nonProtectedCandidates, k, dataSetName):
    # ensure candidates are sorted by descending qualifications
    nonProtectedCandidates.sort(key=lambda candidate: candidate.learnedScores, reverse=True)
    nonProtectedQualifications = [nonProtectedCandidates[i].learnedScores
                                  for i in range(len(nonProtectedCandidates))]

    protectedCandidates.sort(key=lambda candidate: candidate.learnedScores, reverse=True)
    protectedQualifications = [protectedCandidates[i].learnedScores
                               for i in range(len(protectedCandidates))]

    ranking = []
    # create same distribution for protected and non-protected candidates
    for i, candidate in enumerate(protectedCandidates):
        if i >= k:
            # only need to adapt the scores for protected candidates up to the required length;
            # the rest will not be considered anyway
            break
        # find percentile of protected candidate
        p = percentileofscore(protectedQualifications, candidate.learnedScores)
        # find score of a non-protected candidate in the same percentile
        score = scoreatpercentile(nonProtectedQualifications, p)
        candidate.qualification = score
        ranking.append(candidate)

    ranking += nonProtectedCandidates

    # create a colorblind ranking
    ranking.sort(key=lambda candidate: candidate.qualification, reverse=True)
    rankingResultsPath = "FeldmanEtAl/" + dataSetName + "ranking.csv"
    return ranking, rankingResultsPath
# Fragment of a larger script: 'p', 'i', 'l', 'f', 'o', 'probes', 'affy2entrez' and 'genecols'
# are defined by surrounding code that is not shown here.
gene = affy2entrez[p].replace('"', '')
if gene in genecols.keys():
    genecols[gene].append(i)
else:
    genecols[gene] = [i]

o.write(l[0] + ',' + l[1] + ',')
for i, g in enumerate(genecols):
    o.write(g)
    if i < len(genecols) - 1:
        o.write(',')
    else:
        o.write('\n')

print 'Number of probes'
print len(probes)

i = 0
while True:
    i += 1
    l = f.readline()
    if not l:
        break
    fields = l.strip(' \n').split()
    a = array([float(x) for x in fields[2:]])
    o.write(fields[0] + ',' + fields[1] + ',')
    for j, g in enumerate(genecols):
        o.write(str(stats.scoreatpercentile(a[genecols[g]], 50)))
        if j < len(genecols) - 1:
            o.write(',')
        else:
            o.write('\n')
def trimean(values):
    return (stats.scoreatpercentile(values, 25) +
            2.0 * stats.scoreatpercentile(values, 50) +
            stats.scoreatpercentile(values, 75)) / 4.0
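# Quick sanity check (illustrative only; assumes the trimean above and its scipy.stats
# import are available in this namespace): for the values 1..7, Q1 = 2.5, Q2 = 4.0 and
# Q3 = 5.5, so the trimean is (2.5 + 2 * 4.0 + 5.5) / 4 = 4.0.
import numpy as np
assert trimean(np.arange(1, 8)) == 4.0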
# Fragment of a larger loop over RDAT files; 'args', 'filename', 'db', 'dbidx' and
# 'normalize' are defined elsewhere. The snippet is truncated mid-condition.
rdat = RDATFile()
rdat.load(open(args.rdatdir + '/' + filename))
for cname in rdat.constructs:
    construct = rdat.constructs[cname]
    struct = SecondaryStructure(construct.structure)
    frags = struct.explode()
    for data in construct.data:
        if (('mutation' not in data.annotations) or
                ('mutation' in data.annotations and
                 'WT' in data.annotations['mutation'])):
            if 'modifier' in data.annotations:
                if args.normalize:
                    normvals = normalize(data.values)
                else:
                    normvals = data.values
                iqr = scoreatpercentile(normvals, 75) - scoreatpercentile(normvals, 25)
                for fragtype in frags:
                    db['all'].extend(normvals)
                    if data.errors:
                        db['all'].extend(data.errors)
                    dbidx['all'] = dict([((construct.name, construct.seqpos[i]), v)
                                         for i, v in enumerate(normvals)])
                    fraglist = frags[fragtype]
                    for frag in fraglist:
                        vals = []
                        valerrors = []
                        pos = []
                        for idx in frag:
                            try:
                                iddx = construct.seqpos.index(idx + construct.offset + 1)
                                if ('DMS' in data.annotations['modifier'] and
                                        construct.sequence[idx].upper() not in ['A', 'C']) or \
                                   ('CMCT' in data.annotations['modifier'] and
                                        construct.sequence[idx].upper() not in ['G', 'U']) or \
def plot_outlier_detection(df_floats: pd.DataFrame, y_labels: np.ndarray, clf,
                           clf_name: str = None, scaler=None):
    """
    Return a contourf plot of the anomaly detection model.

    Plots a contourf plot of decision scores, marking the area where observations
    would be considered inliers.

    Parameters
    ----------
    df_floats: pd.DataFrame with elements as floats.
    y_labels: numpy array of the same length as df_floats that assigns 0/1
        (inlier/outlier) to each observation according to the fitted model.
    clf: fitted model.
    clf_name: name of the fitted model.
    scaler: estimator that was used to change the range of the df_floats DataFrame.

    Returns
    -------
    Contourf plot.
    """
    if df_floats.shape[1] > 2:
        print('Plotting first two variables...')
    elif df_floats.shape[1] < 2:
        print('Sorry, cannot plot fewer than two variables.')
        return

    # predict raw anomaly score
    y_scores = clf.decision_function(df_floats.iloc[:, [0, 1]]) * -1

    # threshold value to consider a datapoint inlier or outlier
    threshold = stats.scoreatpercentile(
        y_scores, 100 * len(y_labels[y_labels == 1]) / len(y_labels))

    # Specifies interval over which the np.linspace will be created
    x_lim_min, x_lim_max = df_floats.iloc[:, 0].min(), df_floats.iloc[:, 0].max()
    x_delta = 0.05 * (x_lim_max - x_lim_min)
    y_lim_min, y_lim_max = df_floats.iloc[:, 1].min(), df_floats.iloc[:, 1].max()
    y_delta = 0.05 * (y_lim_max - y_lim_min)

    # coordinate array for vectorized evaluation of raw anomaly scores
    # TODO: Coarser grid sometimes returns error when plotting (threshold out of [zz.min();zz.max()]).
    xx, yy = np.meshgrid(
        np.linspace(x_lim_min - x_delta, x_lim_max + x_delta, 100),
        np.linspace(y_lim_min - y_delta, y_lim_max + y_delta, 100))

    # decision function calculates the raw anomaly score for every point
    zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    zz = zz.reshape(xx.shape)

    # undo the scaling so the plot is in the same scale as the input data
    if scaler:
        df_floats.iloc[:, [0, 1]] = scaler.inverse_transform(df_floats.iloc[:, [0, 1]])
        x_lim_min, x_lim_max = df_floats.iloc[:, 0].min(), df_floats.iloc[:, 0].max()
        x_delta = 0.05 * (x_lim_max - x_lim_min)
        y_lim_min, y_lim_max = df_floats.iloc[:, 1].min(), df_floats.iloc[:, 1].max()
        y_delta = 0.05 * (y_lim_max - y_lim_min)
        xx, yy = np.meshgrid(
            np.linspace(x_lim_min - x_delta, x_lim_max + x_delta, 100),
            np.linspace(y_lim_min - y_delta, y_lim_max + y_delta, 100))

    # inliers_1 - inlier feature 1, inliers_2 - inlier feature 2
    inliers_1 = (df_floats.iloc[:, 0][y_labels == 0]).values.reshape(-1, 1)
    inliers_2 = (df_floats.iloc[:, 1][y_labels == 0]).values.reshape(-1, 1)

    # outliers_1 - outlier feature 1, outliers_2 - outlier feature 2
    outliers_1 = df_floats.iloc[:, 0][y_labels == 1].values.reshape(-1, 1)
    outliers_2 = df_floats.iloc[:, 1][y_labels == 1].values.reshape(-1, 1)

    plt.figure(figsize=(10, 10))

    # fill blue colormap from minimum anomaly score to threshold value
    plt.contourf(xx, yy, zz,
                 levels=np.linspace(zz.min(), threshold, 50),
                 cmap=plt.cm.Blues_r)
    plt.colorbar(plt.contourf(xx, yy, zz,
                              levels=np.linspace(zz.min(), threshold, 50),
                              cmap=plt.cm.Blues_r))

    # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
    plt.contourf(xx, yy, zz, levels=[threshold, zz.max()], colors='orange')

    # draw red contour line where anomaly score is equal to threshold
    a = plt.contour(xx, yy, zz, levels=[threshold], linewidths=2, colors='red')

    # draw inliers as white dots
    b = plt.scatter(inliers_1, inliers_2, c='white', s=20, edgecolor='k')

    # draw outliers as black dots
    c = plt.scatter(outliers_1, outliers_2, c='black', s=20, edgecolor='k')

    plt.axis('tight')

    # loc=2 is used for the top left corner
    plt.legend([a.collections[0], b, c],
               ['learned decision function', 'inliers', 'outliers'],
               prop=matplotlib.font_manager.FontProperties(size=20),
               loc=2)

    plt.xlim((x_lim_min - x_delta, x_lim_max + x_delta))
    plt.ylim((y_lim_min - y_delta, y_lim_max + y_delta))
    plt.xlabel(df_floats.columns[0])
    plt.ylabel(df_floats.columns[1])
    if clf_name:
        plt.title(clf_name)
    plt.show()
    return
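# A hedged usage sketch for the function above. It assumes a pyod-style detector (the
# decision_function / binary-labels pattern this plot follows); pyod's KNN model and the
# random DataFrame below are illustrative choices, not part of the original code.
import numpy as np
import pandas as pd
from pyod.models.knn import KNN

rng = np.random.RandomState(42)
demo_df = pd.DataFrame(rng.normal(size=(300, 2)), columns=["feat_a", "feat_b"])
detector = KNN(contamination=0.05)
detector.fit(demo_df.values)
# labels_ is 0 for inliers and 1 for outliers, matching the y_labels convention above
plot_outlier_detection(demo_df, detector.labels_, detector, clf_name="KNN")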
def variableSelector(predictorTrain, targetTrain, method, verbose=False):
    varSelected = []

    # SelectKBest
    if (method == 'kb'):
        selector = SelectKBest(mutual_info_classif, k='all').fit(predictorTrain, targetTrain)
        #selector = SelectKBest(mutual_info_classif).fit(predictNormWithPCA, target)
        #selector = SelectPercentile(mutual_info_classif, percentile=100).fit(predictorTrain, targetTrain)
        scores = selector.scores_
        if (verbose):
            print "Selection scores"
            print scores

        threshold = stats.scoreatpercentile(scores, 33)
        for i in range(len(scores)):
            if (scores[i] > threshold):
                varSelected.append(i)
        if (verbose):
            print scores[0]

        if (verbose):
            plt.hist(scores, bins=30)  #, bins=range(1, len(scores)))
            plt.title("Variable selection scores with SelectKBest")
            plt.ylabel('Number of variables')
            plt.xlabel('Score')
            plt.show()

    # Random Forest
    elif (method == 'rf'):
        clf = RandomForestClassifier(n_estimators=20)  # Random Forest
        clf.fit(predictorTrain, targetTrain)
        indexes = range(len(predictorTrain[0]))
        if (verbose):
            print "Features sorted by their score:"
            print sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), indexes),
                         reverse=True)
            plt.hist(clf.feature_importances_, bins=30)  #, bins=range(1, len(scores)))
            plt.title("Variable selection scores with Random Forest")
            plt.ylabel('Number of variables')
            plt.xlabel('Score')
            plt.show()

        scores = defaultdict(list)
        scoresTab = []
        for train_idx, test_idx in ShuffleSplit(n_splits=10, random_state=0,
                                                test_size=.3).split(predictorTrain):
            X_train, X_test = predictorTrain[train_idx], predictorTrain[test_idx]
            Y_train, Y_test = targetTrain[train_idx], targetTrain[test_idx]
            clf.fit(X_train, Y_train)
            acc = r2_score(Y_test, clf.predict(X_test))
            for i in range(predictorTrain.shape[1]):
                X_t = X_test.copy()
                np.random.shuffle(X_t[:, i])
                shuff_acc = r2_score(Y_test, clf.predict(X_t))
                scores[indexes[i]].append((acc - shuff_acc) / acc)
                scoresTab.append((acc - shuff_acc) / acc)

        threshold = stats.scoreatpercentile(scoresTab, 25)
        for feat, score in scores.items():
            scorebis = np.mean(score)
            if scorebis > threshold:
                varSelected.append(feat)

    varSelected = np.unique(varSelected)
    return varSelected
def summary_string(data):
    """ Returns the five-number summary string representation of `data` """
    summary = (np.nanmin(data),
               stats.scoreatpercentile(data, 25),
               np.median(data),
               stats.scoreatpercentile(data, 75),
               np.nanmax(data))
    return "[%s, %s, %s, %s, %s]" % summary
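# Illustration (hypothetical input): the returned string is a five-number summary
# [min, Q1, median, Q3, max] of the vector.
import numpy as np
print(summary_string(np.array([1, 2, 3, 4, 100])))   # -> [1, 2.0, 3.0, 4.0, 100]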
def midhinge(values):
    return (stats.scoreatpercentile(values, 25) +
            stats.scoreatpercentile(values, 75)) / 2.0
def _extract_one(self, point_cloud, neighborhood):
    source_data = point_cloud[point][self.data_key]['data'][neighborhood]
    return stats.scoreatpercentile(source_data, self.percentile)