def calculateEntropy(self, Y, mship):
    """
    Calculates the split entropy using Y and mship (logical array) telling
    which child each example is being split into.

    Input:
    ---------
    Y: a label array
    mship: logical array telling which child each example is assigned to,
        i.e. whether it goes to the left split or the right one.

    Returns:
    ---------
    sentropy: split entropy of the split
    """
    lexam = Y[mship]
    rexam = Y[np.logical_not(mship)]

    pleft = len(lexam) / float(len(Y))
    pright = 1 - pleft

    # class probabilities in each child; np.spacing(1) avoids log2(0)
    pl = stats.itemfreq(lexam)[:, 1] / float(len(lexam)) + np.spacing(1)
    pr = stats.itemfreq(rexam)[:, 1] / float(len(rexam)) + np.spacing(1)

    hl = -np.sum(pl * np.log2(pl))
    hr = -np.sum(pr * np.log2(pr))

    sentropy = pleft * hl + pright * hr
    return sentropy
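# Note: scipy.stats.itemfreq only supplies the per-class counts above (column 1),
# and it has been deprecated and removed in recent SciPy releases. Below is a
# minimal, self-contained sketch of the same split-entropy computation using
# numpy.unique(..., return_counts=True) instead; the names mirror the function
# above and are otherwise assumptions.
import numpy as np

def split_entropy(Y, mship):
    """Split entropy of a boolean partition of the label array Y."""
    def class_entropy(labels):
        if len(labels) == 0:
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        p = counts / float(len(labels)) + np.spacing(1)  # guard against log2(0)
        return -np.sum(p * np.log2(p))

    lexam, rexam = Y[mship], Y[~mship]
    pleft = len(lexam) / float(len(Y))
    return pleft * class_entropy(lexam) + (1 - pleft) * class_entropy(rexam)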
def dll_type_2(tree, name_only=False):
    dll_list = {"name": [], "addr": []}
    for el in tree.iter():
        # obtain DLL target in element
        if el.tag == "load_dll":
            dll_name = el.get('filename')
            # split filename and file address
            key_bag = dll_name.split("\\")
            dll_name = key_bag[len(key_bag) - 1]
            dll_addr = "//".join(key_bag[:(len(key_bag) - 1)])
            # TODO: convert to lower case
            dll_list["name"].append(dll_name)
            dll_list["addr"].append(dll_addr)

    dll_list["name"] = stats.itemfreq(dll_list["name"])
    dll_list["addr"] = stats.itemfreq(dll_list["addr"])
    dll_list_join = concatenate([dll_list["name"], dll_list["addr"]])

    dll_name_counter = Counter()
    for item in dll_list_join:
        dll_name_counter[item[0]] = int(item[1])
    return dll_name_counter
def land_sic_overlap_timeseries(instrument, title="Land-Sea Ice Border Variations"):
    """
    Time series that shows the percentage variation of the land mask border
    given the expansion of sea ice in VIRS.
    """
    files = data.file_names(instrument_id=data.INSTRUMENT_MAP.get(instrument))
    out = []

    for idx, mat in enumerate(data.mat_generator(files)):
        sic = SIC(files[idx])
        lm = LM(files[idx])

        sic_surface = sic.surface(boolean=False)
        lm_surface = lm.silhoutte()

        silhoutte_freq = itemfreq(lm_surface)
        border = silhoutte_freq[1][1]

        merge = np.add(sic_surface, lm_surface)
        merge_freq = itemfreq(merge)
        intercept = merge_freq[2][1]

        land_ice_overlap = (float(intercept) / border) * 100

        temp = {'timestamp': lm.title, 'intercept': land_ice_overlap}
        out.append(temp)

    index = [elem['timestamp'] for elem in out]
    df = DataFrame(out, index=index)
    sdf = df.sort_values(by='timestamp')
    sdf.plot(title=title)
    plt.show()
def getDomColour(image):
    # use k-means clustering to create a palette with the n_colours most
    # representative colours of the image
    arr = np.float32(image)
    pixels = arr.reshape((-1, 3))

    n_colours = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 25, 0.5)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colours, None, criteria, 10, flags)

    palette = np.uint16(centroids)
    # quantized = palette[labels.flatten()]
    # quantized = quantized.reshape(image.shape)

    # the dominant colour is the palette colour which occurs most frequently
    # in the quantised image:
    index_domcol = np.argmax(itemfreq(labels)[:, -1])
    domcol = palette[index_domcol]
    freq_domcol = itemfreq(labels)[:, 1][index_domcol]
    interestingness_domcol = howInteresting(domcol)
    lightness_domcol = getLightness(domcol)
    dominance_domcol = interestingness_domcol * freq_domcol * lightness_domcol

    for index, colour in enumerate(palette):
        interestingness = howInteresting(colour)
        freq = itemfreq(labels)[:, 1][index]
        lightness = getLightness(colour)
        dominance = interestingness * freq * lightness
        # prefer a more dominant colour, but skip dark ones
        if dominance > dominance_domcol and not isDark(colour):
            domcol = palette[index]
            dominance_domcol = dominance
    return domcol
def plotStratification(gCNdata, tCNdata, tRnaData, newGeneName, nameGOI, theAxes, strat):
    # color, shape, and alpha schemes for the stratification
    if strat == 1:
        myColorScheme = ['c', 'b', 'g', 'r', 'm'] * 5
        myShapeScheme = ['<'] * 5 + ['v'] * 5 + ['o'] * 5 + ['^'] * 5 + ['>'] * 5
        myAlphaScheme = [0.1] * 25
    elif strat == 2:
        myColorScheme = ['c'] * 5 + ['b'] * 5 + ['g'] * 5 + ['r'] * 5 + ['m'] * 5
        myShapeScheme = ['<', 'v', 'o', '^', '>'] * 5
        myAlphaScheme = [0.1] * 25

    sumCNdata = 10 * tCNdata + 2 * gCNdata
    theColorCn = sumCNdata + 24
    colorDist = ss.itemfreq(theColorCn)
    colorDist = colorDist[:, 0]
    for level in colorDist:
        thisIndex = level / 2
        theAxes.scatter(sumCNdata[theColorCn == level], tRnaData[theColorCn == level],
                        s=100, alpha=0.3,
                        color=myColorScheme[int(thisIndex)],
                        marker=myShapeScheme[int(thisIndex)])

    sumCnDist = ss.itemfreq(sumCNdata)
    sumCnLevels = sumCnDist[:, 0]
    sumCnCounts = sumCnDist[:, 1]

    meanCN = np.mean(gCNdata)
    stdCN = np.std(gCNdata)
    tMeanCN = np.mean(tCNdata)
    tStdCN = np.std(tCNdata)

    sumMeanExpT = [np.mean(tRnaData[sumCNdata == i]) for i in sumCnLevels]
    sumStdExpT = [np.std(tRnaData[sumCNdata == sumCnLevels[i]]) / np.sqrt(sumCnCounts[i])
                  for i in range(len(sumCnLevels))]

    theAxes.errorbar(sumCnLevels, sumMeanExpT, sumStdExpT, marker='_', markersize=15,
                     markeredgewidth=2, color='k', elinewidth=3, capsize=4)
    theAxes.errorbar(10 * tMeanCN, min(tRnaData), xerr=10 * tStdCN,
                     marker='^', markersize=10, elinewidth=3, color='k')
    theAxes.errorbar(10 * meanCN, min(tRnaData) - 0.1, xerr=10 * stdCN,
                     marker='^', markersize=10, elinewidth=3, color='k')
    theAxes.set_xticks(np.arange(-25, 25, 5))
    theAxes.grid()
    theAxes.set_ylabel('RNA Expression of %s' % newGeneName)
def learn_with_test(dataset, testset):
    matrix_ds = np.asarray(dataset)
    matrix_ds_test = np.asarray(testset)
    # clf = linear_model.LogisticRegression()

    training_target = matrix_ds[:, TYPE_INDEX]
    training_dataset = matrix_ds[:, 1:TYPE_INDEX].astype(np.float)

    testing_target = matrix_ds_test[:, TYPE_INDEX]
    testing_dataset = matrix_ds_test[:, 1:TYPE_INDEX].astype(np.float)

    # Parameter selection: set the parameters by cross-validation
    # cv = StratifiedShuffleSplit(training_target, n_iter=5, test_size=0.2, random_state=42)
    # C_range = 10.0 ** np.arange(-3, 3)
    # gamma_range = 10.0 ** np.arange(-3, 3)
    # param_grid = dict(gamma=gamma_range, C=C_range)
    # clf = GridSearchCV(SVC(), param_grid, cv=cv)
    # clf.fit(training_dataset, training_target)
    # print("The best parameters are %s with a score of %0.2f"
    #       % (clf.best_params_, clf.best_score_))

    clf = SVC(kernel="rbf", C=10, gamma=0.1)
    clf.fit(training_dataset, training_target)
    # print clf.score(testing_dataset, testing_target)
    predictions = clf.predict(testing_dataset)
    print itemfreq(predictions)
def test_scale(self):
    dataset = loader.load_kanade(shared=False, n=2, pre={'scale2unit': True})
    self.assertTrue(len(dataset[0]) == 2 and len(dataset[1]) == 2)
    print dataset[0]
    print itemfreq(dataset[0])
def get_feature_distribution(self):
    '''Get the feature distribution on the given dataset.'''
    return {
        'y_train': itemfreq(self.y_train),
        'y_test': itemfreq(self.y_test),
    }
def run_example(data_path):
    """
    Method to demonstrate the usage of GRBM.

    :param data_path: path of dataset
    :type data_path: String
    """
    print('... loading data')

    # Load the dataset
    f = gzip.open(data_path, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    X_train, Y_train = train_set
    print('train X: ', X_train.shape)
    print('train Y: ', Y_train.shape)
    # valid_set_x, valid_set_y = datasets[1]
    X_test, Y_test = test_set
    print('test X: ', X_test.shape)
    print('test Y: ', Y_test.shape)

    print('label count for training data:')
    print(itemfreq(Y_train))
    print('label count for test data:')
    print(itemfreq(Y_test))

    parameters_GRBM = [[200, 2, 10, 0.01, 0.9, 1, 'None', 0.1, 0.1, 0.0],
                       [200, 100, 10, 0.01, 0.9, 1, 'L1', 0.2, 0.2, 0.0],
                       [200, 10, 10, 0.01, 0.9, 1, 'L2', 0.3, 0.3, 0.5]]

    for param_grbm in parameters_GRBM:
        grbm = GRBM(random_state=0)
        grbm.n_hidden = param_grbm[0]
        grbm.grbm_n_iter = param_grbm[1]
        grbm.grbm_batch_size = param_grbm[2]
        grbm.grbm_learning_rate = param_grbm[3]  # fitting time
        grbm.grbm_momentum = param_grbm[4]
        grbm.grbm_n_gibbs_steps = param_grbm[5]
        grbm.penalty = param_grbm[6]
        grbm.C1 = param_grbm[7]
        grbm.C2 = param_grbm[8]
        grbm.pdrop = param_grbm[9]

        grbm.fit(X_train, Y_train)
        Y_pred = grbm.predict(X_test)
        score = metrics.accuracy_score(Y_test, Y_pred)
        print('Acc score for test set:', score)
        print("GRBM report:\n%s\n" % (
            metrics.classification_report(Y_test, Y_pred)))
def num_cluster(data_bipart):
    random.seed(17)
    num = len(data_bipart) // 300 + 1  # integer division so KMeans gets an int
    kmeans_bipart = KMeans(n_clusters=num, random_state=0).fit(data_bipart)
    labels_bipart = kmeans_bipart.labels_
    max_group = max(itemfreq(labels_bipart)[:, 1])
    while max_group > 350:
        num += 2
        kmeans_bipart = KMeans(n_clusters=num, random_state=0).fit(data_bipart)
        labels_bipart = kmeans_bipart.labels_
        max_group = max(itemfreq(labels_bipart)[:, 1])
    return num
def segmenter(self, data, labels):
    best_impurity = float('inf')
    best_left, best_right, best_rule = None, None, None
    data_size, num_features = data.shape

    # Random forest
    if self.rf:
        for i in range(self.trees):
            # random subset of features for this tree
            m = random.sample(range(num_features), 10)
            for j in m:
                feature = data[:, j]
                # random subset of candidate split values
                n = random.sample(range(data_size), int(data_size / 50))
                for val in n:
                    left_indices = np.nonzero(feature < val)[0]
                    right_indices = np.nonzero(feature >= val)[0]
                    split_rule = (j, val)
                    left_labels = labels[left_indices]
                    right_labels = labels[right_indices]
                    if left_labels.size == 0 or right_labels.size == 0:
                        continue
                    impurity = self.impurity(itemfreq(left_labels), itemfreq(right_labels))
                    if impurity < best_impurity:
                        best_impurity = impurity
                        best_rule = split_rule
                        best_left = left_indices
                        best_right = right_indices
    # Normal DT
    else:
        for i in range(num_features):
            feature = data[:, i]
            for val in feature:
                # mean = np.mean(feature)
                # left_indices = np.nonzero(feature < mean)[0]
                # right_indices = np.nonzero(feature >= mean)[0]
                # split_rule = (i, mean)
                left_indices = np.nonzero(feature < val)[0]
                right_indices = np.nonzero(feature >= val)[0]
                split_rule = (i, val)
                left_labels = labels[left_indices]
                right_labels = labels[right_indices]
                if left_labels.size == 0 or right_labels.size == 0:
                    continue
                impurity = self.impurity(itemfreq(left_labels), itemfreq(right_labels))
                if impurity < best_impurity:
                    best_impurity = impurity
                    best_rule = split_rule
                    best_left = left_indices
                    best_right = right_indices
    return best_rule, best_left, best_right
def main():
    u_data = csv.reader(open('ml-100k/u.data', 'rb'), delimiter='\t')
    columns = list(zip(*u_data))

    # column 1: user id
    col1 = np.array(columns[0]).astype(np.int)
    # column 2: item id
    col2 = np.array(columns[1]).astype(np.int)

    # review_list[u] = a list of movies that were reviewed by user u + 1
    review_list = user_review_list(col1, col2)[1:]

    mat = np.zeros(U * (U - 1))
    cnt = 0
    for i in range(U):
        for j in range(U):
            if i != j:
                mat[cnt] = len(np.intersect1d(review_list[i], review_list[j]))
                cnt = cnt + 1

    # on average, how many movies are commonly reviewed by a user pair?
    mean = np.mean(mat)
    # median number of movies commonly reviewed by a user pair
    median = np.median(mat)
    # how many user pairs have rated that many movies?
    freq_table1 = stats.itemfreq(mat)
    maximum = freq_table1[-1, 0]
    minimum = freq_table1[0, 0]

    # display results
    print 'mean:', mean
    print 'median:', median
    # print freq_table1[:, 0]
    interval = 10
    plot_hist(freq_table1[:, 0], freq_table1[:, 1])

    # measure how many reviews each movie has
    freq_table2 = stats.itemfreq(col2)
    # which movies have the most/fewest reviews?
    most = reviews(freq_table2, np.amax)
    fewest = reviews(freq_table2, np.amin)

    # display results
    print 'movies that have the most reviews:', most[0]
    print 'number of reviews:', most[1]
    print 'movies that have the fewest reviews:', fewest[0]
    print 'number of reviews:', fewest[1]

    # sort the movies based on their number of reviews
    sorted_movies = sort_movies(freq_table2)
    # display results
    plot_line(np.arange(len(sorted_movies[:, 0])), sorted_movies[:, 1])
def check_subset(data, subset):
    """Compute the frequency of each element, then compare them."""
    if all(elem in data for elem in subset):
        data_freq = itemfreq(data)
        subset_freq = itemfreq(subset)
        for elem in subset_freq:
            if elem[0] in data_freq[:, 0]:
                itemindex = np.where(data_freq[:, 0] == elem[0])
                if (len(elem[0]) != len(data_freq[itemindex][0][0])) or \
                        (int(data_freq[itemindex][0][1]) < int(elem[1])):
                    return False
            else:
                return False
        return True
    return False
def api_id():
    # Check if an ID was provided as part of the URL.
    # If ID is provided, assign it to a variable.
    # If no ID is provided, display an error in the browser.
    if 'p1' in request.args and 'p2' in request.args:  # and 'cat' in request.args:
        p1 = request.args.get('p1')
        p2 = request.args.get('p2')
        # cat = request.args.get('cat')
    else:
        return "Input error."

    # get images from URL
    ssl._create_default_https_context = ssl._create_unverified_context
    req1 = urllib.request.urlopen(p1)
    arr1 = np.asarray(bytearray(req1.read()), dtype=np.uint8)
    img1 = cv2.imdecode(arr1, cv2.IMREAD_COLOR)
    req2 = urllib.request.urlopen(p2)
    arr2 = np.asarray(bytearray(req2.read()), dtype=np.uint8)
    img2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)

    # calc color correlation
    img1blk = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2blk = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)
    hist1 = cv2.calcHist([img1blk], [0], None, [256], [0, 256])
    hist2 = cv2.calcHist([img2blk], [0], None, [256], [0, 256])
    # the higher the Bhattacharyya distance, the less similar the images are
    colorDiff = cv2.compareHist(hist1, hist2, cv2.HISTCMP_BHATTACHARYYA)
    colorDiff = 1 - colorDiff

    # texture comparison
    lbp1 = local_binary_pattern(img1blk, 24, 3, method='uniform')
    freq1 = itemfreq(lbp1.ravel())
    text_hist1 = freq1[:, 1] / sum(freq1[:, 1])  # normalize
    lbp2 = local_binary_pattern(img2blk, 24, 3, method='uniform')
    freq2 = itemfreq(lbp2.ravel())
    text_hist2 = freq2[:, 1] / sum(freq2[:, 1])
    textDiff = cv2.compareHist(np.array(text_hist1, dtype=np.float32),
                               np.array(text_hist2, dtype=np.float32),
                               cv2.HISTCMP_BHATTACHARYYA)
    textDiff = 1 - textDiff

    # feature matching
    # sift = cv2.xfeatures2d.SIFT_create()
    # kp_1, desc_1 = sift.detectAndCompute(img1, None)
    # kp_2, desc_2 = sift.detectAndCompute(img2, None)

    return str(round(colorDiff * 0.5 + textDiff * 0.5, 2) * 100)
def compute_histogram(data, labels):
    histogram = itemfreq(sorted(data))
    for label in labels:
        if label not in histogram[:, 0]:
            histogram = np.vstack((histogram, np.array([[label, 0]], dtype=object)))
    histogram = histogram[histogram[:, 0].argsort()]
    return histogram
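# Hypothetical usage of compute_histogram above, to illustrate the output shape
# (assumes a SciPy version that still provides scipy.stats.itemfreq). The inputs
# are made up for illustration: labels that never occur in `data` still get a
# zero-count row, and the rows come back sorted by label.
data = [3, 1, 3, 2]
labels = [1, 2, 3, 4]
hist = compute_histogram(data, labels)
# hist is a 2-column array of (label, count) rows:
# [[1 1]
#  [2 1]
#  [3 2]
#  [4 0]]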
def call_freq(tree, name_only=False):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a Counter mapping 'sys_call-x' to the number of times system call x was
      made by the executable (or, if name_only is True, the set of call names).
    """
    callz = []
    in_all_section = False
    first = True       # is this the first system call
    last_call = None   # keep track of last call we've seen

    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            callz.append(el.tag)

    # finally, count the frequencies
    freqList = stats.itemfreq(callz)
    if name_only == True:
        c = set(callz)
    else:
        c = Counter()
        for item in freqList:
            c["sys_call-" + item[0]] = int(item[1])
    return c
def dominant_color(cls, img, k):
    """Return an RGB tuple of the dominant color in an image.

    Performs k-means clustering on the image's pixels, then selects the
    centroid of the largest cluster to be the dominant color of the image.
    Uses kmeans++ for cluster initialization.

    :param img: The image to analyze, read in via cv2.imread()
    :param k: The number of clusters to use
    :return: The RGB tuple of the dominant color in the image
    """
    img_as_float32 = np.float32(img)
    pixels = img_as_float32.reshape((-1, 3))

    # Stop after MAX_ITER iterations OR when accuracy EPS (epsilon) is reached
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, 0.1)
    # Use kmeans++ center initialization
    flags = cv2.KMEANS_PP_CENTERS
    # Number of times the algorithm is attempted
    attempts = 10

    _, labels, centroids = cv2.kmeans(pixels, k, criteria, attempts, flags)
    candidate_dominant_colors = np.uint8(centroids)

    # The dominant color is the cluster with the largest number of members/pixels
    dominant_color_idx = np.argmax(itemfreq(labels)[:, -1])
    dominant_color_tuple = candidate_dominant_colors[dominant_color_idx]

    d_blue = dominant_color_tuple[0]
    d_green = dominant_color_tuple[1]
    d_red = dominant_color_tuple[2]
    return (d_red, d_green, d_blue)
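# Hypothetical usage of dominant_color above. The class name ColorAnalyzer and the
# image path are placeholders; note that the cv2.kmeans call in the method follows
# the OpenCV 2.x signature, so on OpenCV 3+ it would also need a bestLabels argument.
import cv2

img = cv2.imread("photo.jpg")                     # BGR image as a numpy array
r, g, b = ColorAnalyzer.dominant_color(img, k=3)  # returns an (R, G, B) tuple
print("dominant colour: #%02x%02x%02x" % (r, g, b))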
def calc_accuracy(self, x, y, method='repmet'):
    """
    Calculate the accuracy of reps based on the current clusters
    :param x:
    :param y:
    :param method:
    :return:
    """
    if method == 'unsupervised':
        # Tries finding a cluster for each class, and then assigns cluster labels
        # to each cluster based on the max samples of a particular class in that cluster
        k = np.unique(y).size
        kmeans = KMeans(n_clusters=k, max_iter=35, n_init=15, n_jobs=-1).fit(x)
        emb_labels = kmeans.labels_
        G = np.zeros((k, k))
        for i in range(k):
            lbl = y[emb_labels == i]
            uc = itemfreq(lbl)
            for uu, cc in uc:
                G[i, uu] = -cc
        A = linear_assignment_.linear_assignment(G)
        acc = 0.0
        for (cluster, best) in A:
            acc -= G[cluster, best]
        return acc / float(len(y))
    else:
        predictions = self.predict(x, method=method)
        correct = predictions == y
        return correct.astype(float).mean()
def _fit_model(self, fcol, dis):
    """Determine the best fit for one feature column given distribution name

    Parameters
    ----------
    fcol: feature column, array
    dis: distribution name, String

    Returns
    ----------
    function: fit model with feature as argument
    """
    if dis == 'ratio':
        itfreq = itemfreq(fcol)
        uniqueVars = itfreq[:, 0]
        freq = itfreq[:, 1]
        rat = freq / sum(freq)
        rat = dict(zip(uniqueVars, rat.T))
        func = lambda x: self.funcs[dis](x, rat)
    if dis == 'poisson':
        lamb = np.nanmean(fcol, axis=0)
        func = lambda x: self.funcs[dis](x, lamb)
    if dis == 'norm':
        sigma = np.nanvar(fcol, axis=0)
        theta = np.nanmean(fcol, axis=0)
        func = lambda x: self.funcs[dis](x, sigma, theta)
    return np.vectorize(func)
def ClusterSizes(self):
    """Returns an array containing the number of points in each cluster."""
    if not any(self.__clsizes):
        self.__clsizes = np.zeros(self.NClusters())
        tmp = itemfreq(self.__ClusterID)
        self.__clsizes[tmp[:, 0]] = tmp[:, 1]
    return self.__clsizes
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # sd_scaler = StandardScaler()
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    for g in genes:
        # scaled = sd_scaler.fit(data.loc[g, :])
        fit = db.fit(np.reshape(data.loc[g, :], (196, 1)))
        candidates = itemfreq(fit.labels_)

        try:
            class_zero = candidates[0][1]
            class_one = candidates[1][1]

            support = min(class_one, class_zero)

            if min_samples < support <= max_samples:
                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename, "type": info.genetype,
                                  "samples": str(support), "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except:
            pass
def cluster_outliers(data, genes, max_samples, min_dist=0.8, mining_id=1, as_json=True):
    estimator = cluster.KMeans(2)  # init kmeans
    samples_from_perc = round(max_samples * len(data.columns) / 100)
    print(samples_from_perc)
    ens = False
    info = None
    if str(genes[0]).startswith("ENSG"):
        res = dr.get_dataset_ensembl_info()
        ens = True
    outliers_id = []
    # debug_count = 0
    if as_json:
        yield (u"{\"outliers\":[")

    for g in genes:
        # if debug_count > 10:
        #     break
        try:
            gene_row = data.loc[g, :].dropna()
            gene_row = gene_row.to_frame()
            estimator.fit(gene_row)  # conversion to dframe for model fit
            candidates = itemfreq(estimator.labels_)
            class_zero = candidates[0][1]
            class_one = candidates[1][1]

            support = min(class_one, class_zero)
            majority_class = class_one > class_zero

            dist = abs(max(gene_row[estimator.labels_ == majority_class]) -
                       max(gene_row[estimator.labels_ == 1 - majority_class]))
            ran = gene_row.max() - gene_row.min()
            ndist = dist / float(ran)
            print(ndist)

            if 0 < support <= samples_from_perc and min_dist < ndist < 1:
                # debug_count += 1
                if ens:
                    info = [gene for gene in res if gene.ensemblgeneid == g][0]
                    formatted_info = {"identifier": g, "name": info.genename,
                                      "type": info.genetype, "samples": str(support),
                                      "distance": str(ndist), "range": str(ran)}
                else:
                    formatted_info = {"identifier": g, "name": "Not available",
                                      "type": "Not available", "samples": str(support),
                                      "distance": str(ndist), "range": str(ran)}

                outliers_id.append(formatted_info)
                print("outlier found :" + g)
                if as_json:
                    jinfo = json.dumps(formatted_info)
                    jinfo += u","
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except:
            # If there is an issue with one gene (no variation, clustering impossible),
            # the majority class selection will obviously explode. We capture that here
            # and just continue with the next gene; no harm done, since there are no
            # outliers when all the values are the same.
            pass

    if len(outliers_id) > 0:
        pr.save_outliers(mining_id, outliers_id)

    yield (str(u"]}"))
def pixelize_at_target_nside(self, nside):
    # this also effectively applies the mask to the data
    self.nside = nside

    # so averages are computed in downgrade
    self.mask[self.mask == hp.UNSEEN] = 0  # step 1
    mask_targetnside = hp.pixelfunc.ud_grade(self.mask, pess=False, nside_out=self.nside)
    gal_index_targetnside = radec_to_index(self.data['DEC'], self.data['RA'], self.nside)
    mask_targetnside[mask_targetnside == 0] = hp.UNSEEN
    self.mask[self.mask == 0] = hp.UNSEEN  # undo step 1

    # prune data that's in a bad part of the mask
    self.data = self.data[mask_targetnside[gal_index_targetnside] != hp.UNSEEN]

    counts = itemfreq(gal_index_targetnside)
    full_map_counts = np.zeros(hp.nside2npix(self.nside))
    full_map_counts[counts[:, 0]] = counts[:, 1]
    good_counts = full_map_counts[np.where(mask_targetnside != hp.UNSEEN)]
    good_fracs = mask_targetnside[np.where(mask_targetnside != hp.UNSEEN)]
    self.nbar = np.average(good_counts, weights=good_fracs)
    print 'nbar is', self.nbar, 'galaxies per pixel'

    pixels_to_count = np.where(mask_targetnside != hp.UNSEEN)
    dec, ra = index_to_radec(pixels_to_count, self.nside)
    final_counts = full_map_counts[pixels_to_count]

    self.pixelized = (ra[0], dec[0], final_counts)
def land_sic_overlap(lm_image, sic_image):
    """
    Show Sea Ice Concentration and Land Mask together.

    This figure shows the overlap between mw_sic and lm.
    """
    lm = lm_image
    sic = sic_image

    sic_surface = sic.surface(boolean=False)
    lm_surface = lm.image()

    condlist = [lm_surface == 1]
    choicelist = [3]
    merge = np.add(sic_surface, np.select(condlist, choicelist))
    freqs = itemfreq(merge)

    # Pie chart config params
    labels = "Sea Water", "Sea Ice", "Land", "Land - Sea Ice Overlap"
    colors = ["blue", "lightblue", "yellow", "red"]
    values = [freqs[0][1], freqs[1][1], freqs[2][1], freqs[3][1]]

    # Make and configure the figure to be displayed
    fig, axes = plt.subplots(1, 2)
    fig.subplots_adjust(hspace=0.3, wspace=0.05)

    # populate each axis of the figure
    axes[0].imshow(merge)
    axes[0].set_title("Sea Ice and Land Mask")
    axes[1].pie(values, explode=[0.1, 0.1, 0.1, 0.4], labels=labels,
                colors=colors, shadow=True, autopct='%1.2f%%')
    plt.show()
def CombinedMeanShift(self, h, alpha, PrincComp=None, njobs=-2, mbf=1):
    """Performs the scikit-learn Mean Shift clustering.

    Arguments:
    h -- the bandwidth
    alpha -- the weight of the principal components as compared
             to the spatial data.
    PrincComp -- used to pass already-computed principal components
    njobs -- the number of processes to be used (default: n. of CPU - 1)
    mbf -- the minimum number of items in a seed"""

    MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                   min_bin_freq=mbf, n_jobs=njobs)
    if PrincComp is None:
        PrincComp = self.ShapePCA(2)
    print("Starting sklearn Mean Shift... ")
    stdout.flush()
    fourvector = np.vstack((self.__data, alpha * PrincComp))
    MS.fit_predict(fourvector.T)
    self.__ClusterID = MS.labels_
    self.__c = MS.cluster_centers_.T
    self.__clsizes = itemfreq(self.__ClusterID)[:, 1]
    print("done.")
    stdout.flush()
def lbf(infile):
    result = []
    label = []
    ix = 0
    for here, i in enumerate(open(infile).readlines()):
        ix += 1
        imgpath, l = i.split(',')
        if os.path.exists(imgpath):
            im = cv2.imread(imgpath)
            im = cv2.resize(im, (200, 200))
            im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            radius = 3
            no_points = 8 * radius
            lbp = local_binary_pattern(im_gray, no_points, radius, method='uniform')
            x = itemfreq(lbp.ravel())
            hist = x[:, 1] / sum(x[:, 1])
            result.append(hist)
            if "FE" in l:
                label.append(1)
            else:
                label.append(-1)
    print len(result)
    print len(label)
def calculateLBP(self):
    paramList = list()
    with open(self.paramTxt) as f:
        for line in f:
            paramList.append(int(line.strip()))
    print(paramList)

    for image in self.trainDict.iterkeys():
        print(image)
        img = cv2.imread(image)
        imgGray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # radius = 3
        # noPoints = 8 * radius
        radius = paramList[0]
        noPoints = paramList[1] * radius
        print(radius)
        print(noPoints)
        lbpImage = local_binary_pattern(imgGray, noPoints, radius, method='uniform')

        # Calculate the histogram
        x = itemfreq(lbpImage.ravel())
        # normalize the histogram
        hist = x[:, 1] / sum(x[:, 1])
        # hist = cv2.calcHist(lbp, [0], None, [256], [0, 256])
        # cv2.normalize(hist, hist)
        # hist = hist.flatten()

        self.addrImg.append(image)
        self.lbpHistogram.append(hist)
        self.tagNo.append(self.trainDict.get(image))

    joblib.dump((self.addrImg, self.lbpHistogram, self.tagNo), "lbp.pkl", compress=3)
def predict(self):
    print "Predicting"
    predicted_labels = np.zeros(len(self.test_data))
    for i in range(0, len(self.test_data)):
        # calculate the distance between this target point and all train data
        dist = np.linalg.norm(self.train_data - self.test_data[i], axis=1, ord=2)

        # find the k smallest distances from train; this outputs a list of (index, distance)
        smallest_k_distances_index_pair = hq.nsmallest(self.k, enumerate(dist),
                                                       key=lambda d: d[1])

        # extract the labels
        nearest_labels = [self.train_label[pair[0]]
                          for pair in smallest_k_distances_index_pair]
        majority_label = max(set(nearest_labels), key=nearest_labels.count)
        predicted_labels[i] = majority_label
        print majority_label

        # populate frequency table
        freq = itemfreq(nearest_labels)
        self.label_frequency_table[i, :][freq[:, 0].astype(dtype=int)] = freq[:, 1]
    return predicted_labels
def get_model_prediction(self, clf, where=None):
    with stopwatch("getting model predictions"):
        with pd.HDFStore(self.store_path, mode='a') as store:
            n_rows = len(store.select_as_coordinates(self.tables[0], where=where))
            chunksize = n_rows // 1
            new_table_name = self.tables[0] + "_pred"
            if new_table_name in store.keys():
                store.remove(new_table_name)
            for chunk in store.select(self.tables[0], chunksize=chunksize):
                indexer = chunk.ANH
                y_pred = clf.predict(chunk.loc[indexer, self.feat_cols])
                chunk.loc[indexer, "predicted_label"] = \
                    self.label_encoder.inverse_transform(y_pred)
                freq = stats.itemfreq(self.label_encoder.inverse_transform(y_pred))
                for f in freq:
                    print(f[0], f[1])
                store.append(new_table_name, chunk, format='table', append=False)
    return y_pred
def main_breeds(labels_raw, Nber_breeds, all_breeds='TRUE'):
    # get the frequency of each label, shape (120, 2) for 120 labels
    labels_freq_pd = itemfreq(labels_raw["breed"])

    # argsort() finds the indices that would sort the array from low to high.
    # Example: df = [9, 3, 5, 3, 1] => df.argsort() = [4, 1, 3, 2, 0];
    # verify with df[df.argsort()]
    labels_freq_pd = labels_freq_pd[labels_freq_pd[:, 1].argsort()[::-1]]  # order freq from high to low

    if all_breeds == 'FALSE':
        # keep only the first Nber_breeds breed types
        main_labels = labels_freq_pd[:, 0][0:Nber_breeds]
    else:
        main_labels = labels_freq_pd[:, 0][:]

    # convert the series of breed types to an array, shape (10222,)
    labels_raw_np = labels_raw["breed"].as_matrix()
    # convert to a 2D array of shape (10222, 1)
    labels_raw_np = labels_raw_np.reshape(labels_raw_np.shape[0], 1)

    # the result below contains two arrays of equal length (922): one with indices
    # into labels_raw_np and one with indices into main_labels, at the positions
    # where the values match
    labels_filtered_index = np.where(labels_raw_np == main_labels)

    return labels_filtered_index
def _predict(self, candidate_mask):
    """ Generate prediction vectors for the unlabelled candidates. """
    n_samples = len(self.pool[candidate_mask])
    n_classes = len(self.committee.classes_)
    avg_probs = np.zeros((n_samples, n_classes))
    prob_list = []
    class_freq = itemfreq(self.labels[~self.labels.mask])

    for member in self.committee.estimators_:
        member_prob = member.predict_proba(self.pool[candidate_mask])
        member_n_classes = member_prob.shape[1]

        if n_classes == member_n_classes:
            avg_probs += member_prob
            prob_list.append(member_prob)
        else:
            member_classes = class_freq[:, 1].argsort()[::-1]
            member_classes = member_classes[:member_n_classes]
            full_member_prob = np.zeros((n_samples, n_classes))
            full_member_prob[:, member_classes] += member_prob[:, range(member_n_classes)]
            avg_probs += full_member_prob
            prob_list.append(full_member_prob)

    # average out the probabilities
    avg_probs /= len(self.committee.estimators_)
    return (avg_probs, prob_list)
def replace_lower_by_higher_prob(s, p0=0.3):
    # input: s: 1D numpy array; threshold p0
    # output: s in which elements having p < p0 are replaced by elements with
    #         p > p0, sampled according to their probabilities
    f = itemfreq(s)  # elements and their number of occurrences
    a, p = f[:, 0], f[:, 1].astype(float)

    # probabilities
    p /= float(p.sum())

    # find elements having p > p0:
    iapmax = np.argwhere(p > p0).reshape((-1,))  # positions
    apmax = a[iapmax].reshape((-1,))             # names of amino acids
    pmax = p[iapmax].reshape((-1,))              # probabilities

    # find elements having p < p0
    apmin = a[np.argwhere(p < p0)].reshape((-1,))

    if apmin.shape[0] > 0:
        for a in apmin:
            ia = np.argwhere(s == a).reshape((-1,))
            for iia in ia:
                s[iia] = value_with_prob(apmax, pmax)
    return s
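# value_with_prob is not defined in the snippet above; a minimal sketch of what
# such a helper might look like, assuming it draws one element of apmax with the
# (renormalized) probabilities pmax.
import numpy as np

def value_with_prob(values, probs):
    """Draw a single value, weighted by probs (renormalized to sum to 1)."""
    probs = np.asarray(probs, dtype=float)
    return np.random.choice(values, p=probs / probs.sum())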
def export_corpus(corpus, outfolder, context_type='document'):
    """
    Converts a vsm.corpus.Corpus object into an lda-c compatible data file.

    Creates two files:
    1. "vocab.txt" - contains the integer-word mappings
    2. "corpus.dat" - contains the corpus object in the format described in the
       [lda-c documentation](http://www.cs.princeton.edu/~blei/lda-c/readme.txt):

       Under LDA, the words of each document are assumed exchangeable. Thus,
       each document is succinctly represented as a sparse vector of word
       counts. The data is a file where each line is of the form:

           [M] [term_1]:[count] [term_2]:[count] ... [term_N]:[count]

       where [M] is the number of unique terms in the document, and the
       [count] associated with each term is how many times that term appeared
       in the document. Note that [term_1] is an integer which indexes the
       term; it is not a string.

    :param corpus: VSM Corpus object to convert to lda-c file
    :type corpus: vsm.corpus.Corpus

    :param outfolder: Directory to output "vocab.txt" and "corpus.dat"
    :type outfolder: string (path)
    """
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    vocabfilename = os.path.join(outfolder, 'vocab.txt')
    with codecs.open(vocabfilename, 'w', 'utf8') as vocabfile:
        for word in corpus.words:
            vocabfile.write(word + '\n')

    corpusfilename = os.path.join(outfolder, 'corpus.dat')
    corpusitemnames = os.path.join(outfolder, 'names.dat')

    # print "METADATA", len(corpus.view_metadata(context_type))
    # print len(corpus.view_contexts(context_type))
    # vw_ctx = corpus.view_contexts(context_type)
    # vw_mtd = corpus.view_metadata(context_type)
    # for i, item in enumerate(vw_mtd):
    #     if i < 1:
    #         print vw_mtd[i][1], vw_mtd[i][0], len(vw_ctx[i])
    #     else:
    #         print vw_mtd[i][1], vw_mtd[i][0] - vw_mtd[i-1][0], len(vw_ctx[i])

    # vw_mtd = corpus.view_metadata(context_type)
    # names_file = open(corpusitemnames, 'w')
    with open(corpusfilename, 'w') as corpusfile:
        for i, ctx in enumerate(corpus.view_contexts(context_type)):
            M = len(np.unique(ctx))
            corpusfile.write("{0}".format(M))
            # names_file.write("{0} {1}\n".format(vw_mtd[i][1], vw_mtd[i][0]))
            for token in itemfreq(ctx):
                corpusfile.write(" {term}:{count}".format(term=token[0], count=token[1]))
            corpusfile.write("\n")
def p_list(connpnts, p):
    # collect the indices whose frequency equals p (avoid shadowing the built-in "list")
    plist = []
    freq = itemfreq(connpnts)
    for i in range(freq.shape[0]):
        if freq[i][1] == p:
            plist.append(i)
    return plist
def generateHist(self):
    temp = []
    for x in self.unfilledp_percent:
        temp.append(hist(x))
    up = np.array(temp)
    print "unfilled:"
    print itemfreq(up)

    ftemp = []
    for x in self.filledp_percent:
        ftemp.append(hist(x))
    fup = np.array(ftemp)
    print "filled:"
    print itemfreq(fup)
def _unscheduled_penalty(self):
    """
    Each course has a predetermined number of lectures that must be given.
    As many of these lectures as possible must be scheduled. Each course
    that has a lecture which is not scheduled gives a penalty of 10 points.

    The whole individual is checked in order to calculate the penalty.
    """
    individual = self.schedule
    penalty = 10
    value = 0

    occurrences_scheduled = dict(
        (entry[0], entry[1]) for entry in itemfreq(individual.flatten()))
    occurrences_desired = dict(
        (int(course[1:]), info["number_of_lectures"])
        for (course, info) in self.data["courses"].iteritems())

    for key in occurrences_desired:
        value += occurrences_desired[key]
        if key in occurrences_scheduled:
            value -= occurrences_scheduled[key]

    return value * penalty
def compare_position_booking(booked, clicked):
    arr_position = np.load("data_numpy/train_position.npy")
    for position in itemfreq(arr_position)[:, 0]:
        booked_subset = booked[arr_position == position]
        num_of_booked = np.sum(booked_subset)
        num_of_instances = len(booked_subset)
        print position, num_of_booked, num_of_instances, \
            num_of_booked / float(num_of_instances) * 100.0
def lbp(im_gray):
    """Returns the LBP histogram of an image"""
    global SIZE, WINDOW, UNIFORM_PATTERNS, BASE
    im_gray = cv2.resize(im_gray, (SIZE, SIZE))
    lbp_hist = np.array([])
    for i1 in range(0, SIZE, WINDOW):
        for j1 in range(0, SIZE, WINDOW):
            box = im_gray[j1:j1 + WINDOW, i1:i1 + WINDOW]
            # figure()
            # imshow(box)
            # gray()
            # show()
            lbp = my_lbp(box)
            # print lbp.shape
            lbp = lbp.ravel()
            map_array = np.zeros((lbp.shape[0] + 1))
            i = 0
            for x in np.nditer(lbp):
                try:
                    map_array[i] = np.where(UNIFORM_PATTERNS == x)[0][0]
                except:
                    map_array[i] = 58
                i += 1
            map_array = np.concatenate((map_array, BASE))
            x = itemfreq(map_array)
            hist = np.array(x[:, 1] - 1).astype('int')
            # print x
            # print type(x)
            # print sum(hist)
            # print hist
            lbp_hist = np.concatenate((lbp_hist, hist))
    return lbp_hist
def out_put_pixel():
    im = Image.open('33/beer2.png')
    im_data = np.array(list(im.getdata()))
    # print(list(im.getdata()))
    # print(len(list(im.getdata())))
    # print(im_data)         # [ 1 43  7 ... 19  1  7]
    # print(im_data.shape)   # (19044,)
    # print(im.getpixel((0, 0)))
    # print(im.getpixel((0, 1)))

    im_data_stat = itemfreq(im_data)
    # pprint(im_data_stat)
    # pprint(im_data_stat[:, 1])
    # print(im_data_stat.shape)
    # pprint([i for i in np.cumsum(im_data_stat[:, 1])])
    pprint([np.sqrt(i) for i in np.cumsum(im_data_stat[:, 1])])

    for i in range(im_data_stat.shape[0] - 1, 0, -2):
        newIm_data = im_data[np.where(im_data <= im_data_stat[i, 0])]
        idx_0 = np.where(newIm_data == newIm_data.max())
        idx_1 = np.where(newIm_data != newIm_data.max())
        newIm_data[idx_0] = 0
        newIm_data[idx_1] = 1
        size = int(np.sqrt(len(newIm_data)))
        newIm = Image.new('1', (size, size))
        newIm.putdata(newIm_data)
        newIm.save('33/%i.png' % i)
def evaluate(self, X):
    if not isinstance(X, np.ndarray):
        X = np.array(X)

    # flatten all transactions into one array of items
    flatX = []
    for x in X:
        for xi in x:
            flatX.append(xi)
    flatX = np.array(flatX)

    counts = flatX.size
    items = np.unique(flatX)
    itemfreqs = itemfreq(flatX)
    freqitems = itemfreqs[itemfreqs[:, 1] >= counts * self.min_support][:, 0]
    # keep the frequent itemsets in a plain list so new candidates can be appended
    freqs = list(np.array(freqitems, dtype=np.object))

    itemnum = 1
    while itemnum <= len(freqitems):
        candidates = self._get_candidates(freqs, freqitems, itemnum)
        itemnum += 1
        if len(candidates) == 0:
            break
        for candidate in candidates:
            count = 0
            for x in X:
                idx = 0
                for xi in x:
                    if xi == candidate[idx]:
                        idx += 1
                        if idx == itemnum:
                            count += 1
                            break
            if count >= counts * self.min_support:
                freqs.append(candidate)
    print freqs
def get_x_train(x_raw_train_input, y_val):
    x_train = np.empty([0, BINS + 26 + BINS], int)
    y_train = np.empty([0], str)

    for x_raw_train in x_raw_train_input[0:SAMPLES]:
        # Build grayscale hist feature data
        x_train_gray_scale = x_raw_train[0]
        # hist2 = plt.hist(x, bins=BINS)
        hist_gs = np.histogram(x_train_gray_scale, bins=BINS)
        x_reduced = hist_gs[0].reshape(1, -1)

        # Build LBP hist feature data
        lbp = x_raw_train[1]
        x = itemfreq(lbp.ravel())
        hist_lbp = x[:, 1] / sum(x[:, 1])
        x_reduced = np.append(x_reduced, hist_lbp.reshape(1, hist_lbp.shape[0]))[np.newaxis]

        # Build HSV hist feature data
        x_train_hsv = x_raw_train[2]
        hist_hsv = np.histogram(x_train_hsv[:, 2].ravel(), bins=BINS)
        x_reduced = np.append(x_reduced, hist_hsv[0].reshape(1, -1))[np.newaxis]

        x_train = np.append(x_train, x_reduced, axis=0)
        y_train = np.append(y_train, y_val)

    return x_train, y_train
def plotGaussian(X, y, obj, featureNames):
    """Plot the Gaussian fit on top of X."""
    save_path = '../MSPrediction-Python/plots/' + obj + '/' + 'BayesGaussian2'
    clf = classifiers["BayesGaussian2"]
    clf, _, _ = fitAlgo(clf, X, y, opt=True, param_dict=param_dist_dict["BayesGaussian2"])
    unique_y = np.unique(y)
    theta = clf.theta_
    sigma = clf.sigma_
    class_prior = clf.class_prior_
    norm_func = lambda x, sigma, theta: 1 if np.isnan(x) else \
        -0.5 * np.log(2 * np.pi * sigma) - 0.5 * ((x - theta) ** 2 / sigma)
    norm_func = np.vectorize(norm_func)
    n_samples = X.shape[0]

    for j in range(X.shape[1]):
        fcol = X[:, j]
        jfeature = featureNames[j]
        jpath = save_path + '_' + jfeature + '.pdf'
        fig = pl.figure(figsize=(8, 6), dpi=150)
        for i, y_i in enumerate(unique_y):
            fcoli = fcol[y == y_i]
            itfreq = itemfreq(fcoli)
            uniqueVars = itfreq[:, 0]
            freq = itfreq[:, 1]
            freq = freq / sum(freq)
            the = theta[i, j]
            sig = sigma[i, j]
            pred = np.exp(norm_func(uniqueVars, sig, the))
            pl.plot(uniqueVars, pred, label=str(y_i) + '_model')
            pl.plot(uniqueVars, freq, label=str(y_i) + '_true')
        pl.xlabel(jfeature)
        pl.ylabel("density")
        pl.legend(loc='best')
        pl.tight_layout()
        # pl.show()
        fig.savefig(jpath)
def mating_selection(population, Range, n):
    """
    Mating selection in RSEA
    :param population: current population
    :param n: number of selected individuals
    :param Range: the range of the objective vectors
    :return: next generation population
    """
    pop_obj = population[1]
    N = np.shape(pop_obj)[0]
    pop_obj = (pop_obj - np.tile(Range[0], (N, 1))) / \
        np.tile(Range[1] - Range[0], (N, 1))
    con = np.sqrt(np.sum(pop_obj ** 2, axis=1))
    site, _ = radar_grid(pop_obj, np.ceil(np.sqrt(N)))
    crowd_g = itemfreq(site)[:, 1]
    mating_pool = np.zeros(np.ceil(N / 2).astype(int) * 2)
    grids = tournament(2, len(mating_pool), crowd_g.reshape((crowd_g.size, 1)))
    for i in range(len(mating_pool)):
        current = np.nonzero(site == grids[i])[0]
        if current.size == 0:  # no individual in this grid cell
            mating_pool[i] = np.random.randint(0, N, 1)
        else:
            parents = current[np.random.randint(0, len(current), 4)]
            best = np.argmin(con[parents])
            mating_pool[i] = parents[best]
    return mating_pool.astype(int)
def dP(dml, m0, m0_min, m0_max):
    ProbL = []
    m0L = []
    j = 0
    for m0 in np.arange(m0_min, m0_max + 0.10, 0.10):
        m0 = round(m0, 2)
        dml = np.array(dml)
        data1f = np.array(dml[dml < m0])
        xf, list1f = ecdf(data1f, norm=False)

        # USING the scipy.stats itemfreq function:
        freq = itemfreq(xf)
        # take the number of occurrences (itemfreq has 2 output columns:
        # column 1 contains sorted, unique values from the data,
        # column 2 contains their respective counts):
        counts1 = freq[:, 1]

        prob1 = np.sum(counts1 / (len(dml)))
        probt = [prob1 for x in [j]]
        dplist = [(probt)]
        ProbL.extend(dplist)

        mt = [m0 for x in [j]]
        m0list = [(mt)]
        m0L.extend(m0list)
        j += 1

    if (INIT == 1):
        global kk
        kk = 1
        cdfs(data1f)

    return m0L, ProbL
def plot_histogram(times, time_range, t_step=1.):
    """ For visual comparison to a Poisson distribution """
    counts = count_events(times, time_range, t_step=t_step)[0]
    observed = np.array(stats.itemfreq(counts))
    bins = np.arange(np.ceil(1.5 * observed[-1, 0]))
    ideal = len(times) / counts.mean() * stats.poisson.pmf(bins, counts.mean())

    color_cycle = plt.gca()._get_lines.color_cycle
    plt.vlines(bins, 0, ideal, label='Poisson',
               color=next(color_cycle), lw=12, alpha=0.3)
    plt.vlines(observed[:, 0], 0, observed[:, 1], label='observed',
               color=next(color_cycle), lw=4)
    plt.xlim(bins[0], bins[-1])
def silencePerClass(y_pred, y_GT, classesDict, silenceClassNum):
    """ Calculate and print the percentage of silence for each class """

    silenceCountPerClass = np.zeros(len(classesDict) // 2)

    for i in range(y_pred.shape[0]):
        if y_pred[i] == silenceClassNum:
            # increment each class that was the ground truth for that point:
            for j in range(y_GT.shape[1]):
                if int(y_GT[i, j]) != -1:
                    # ignore invalid (-1) class entries
                    silenceCountPerClass[int(y_GT[i, j])] += 1

    gtFreq = itemfreq(y_GT.flat).astype(int)

    for i in range(silenceCountPerClass.shape[0]):
        if i in gtFreq[:, 0]:
            silencePercentage = round(silenceCountPerClass[i] / float(
                gtFreq[np.where(gtFreq[:, 0] == i)[0][0], 1]) * 100, 2)
            print("Class " + classesDict[i] + " contains " +
                  str(silencePercentage) + "% silence")
    print("-----")
def GetFreqsAttns(self, freqTuningHisto):  # Frequency Tuning Curve method
    """ Helper method for ShowSTH() to organize the frequencies in ascending
        order separated for each intensity.

    :param freqTuningHisto: dict of pandas.DataFrames with spike data
    :type freqTuningHisto: str
    :returns: ordered list of frequencies (DataFrame keys())
              numpy array of frequencies
              numpy array of intensities
    """
    freqs = np.array([])
    attns = np.array([])
    for histoKey in list(freqTuningHisto):
        if histoKey != 'None_None':
            freq = histoKey.split('_')[0]
            freqs = np.hstack([freqs, float(freq) / 1000])
            attn = histoKey.split('_')[1]
            attns = np.hstack([attns, float(attn)])

    attnCount = stats.itemfreq(attns)
    freqs = np.unique(freqs)
    attns = np.unique(attns)
    if np.max(attnCount[:, 1]) != np.min(attnCount[:, 1]):
        abortedAttnIdx = np.where(attnCount[:, 1] != np.max(attnCount[:, 1]))
        attns = np.delete(attns, abortedAttnIdx)

    orderedKeys = []
    for attn in attns:
        freqList = []
        for freq in freqs:
            key = str(int(freq * 1000)) + '_' + str(int(attn))
            freqList.append(key)
        orderedKeys.append(freqList)
    return orderedKeys, freqs, attns
def removeInvalids(y_pred, y_GT, silenceClassNum):
    """
    Remove all points from the prediction and the ground truth array where no
    ground truth was provided or where the point was silent.

    @return: y_pred, y_GT: the updated arrays
    """
    # Positions where no silence was predicted:
    maskValid = (y_pred != silenceClassNum)
    y_GT = y_GT[maskValid]
    y_pred = y_pred[maskValid]

    # We also ignore those samples where no ground truth was provided,
    # i.e. we delete those entries from the GT and the prediction array:
    invalidRow = np.array([-1, -1, -1, -1, -1])  # has to match the max allowed GT labels per point
    maskValid = ~np.all(y_GT == invalidRow, axis=1)
    y_GT = y_GT[maskValid]
    y_pred = y_pred[maskValid]

    # Print for how many points ground truth was provided:
    noGtFreq = itemfreq(maskValid).astype(int)
    validCount = noGtFreq[np.where(noGtFreq[:, 0] == 1)[0][0], 1]
    totalCount = sum(noGtFreq[:, 1])
    percentValid = round(validCount / float(totalCount) * 100, 2)

    print("GT was provided for " + str(percentValid) + "% of all (non-silent) samples")
    print("-----")

    return y_pred, y_GT
def search_dimer(oneframe, crit):
    # pick a random residue
    nmon, natoms, topology = oneframe.n_residues, oneframe.n_atoms, oneframe.topology
    m2 = -1
    # if the first guess fails we need to seek another, so loop until a partner is found
    while True:
        m1 = int(random.random() * nmon)
        atomids = topology.select('resid ' + str(m1))
        search = [x for x in range(natoms) if x not in atomids]
        neigh = md.compute_neighbors(oneframe, crit[0], atomids,
                                     haystack_indices=search, periodic=False)
        resilist = []
        for x in neigh[0]:
            resilist.append(topology.atom(x).residue.index)
        freq = itemfreq(resilist)
        for f in freq:
            if f[1] >= crit[1]:
                m2 = f[0]
                break
        if m2 != -1:
            break
    # print(m1, m2)

    dimer = oneframe.atom_slice(
        topology.select('resid ' + str(m1) + ' or resid ' + str(m2)))
    atomids = topology.select('resid ' + str(m1))
    # dimer = dimer.image_molecules(inplace=False, anchor_molecules=atomids)
    return dimer
def getROIColors(self):
    def camelToUnderscore(word):
        new = ""
        for l in word:
            if l.isupper():
                new += " %s" % l.lower()
            else:
                new += l
        return new

    arr = np.float32(self.ROI)
    pixels = arr.reshape((-1, 3))

    n_colors = 3
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)
    quantized = palette[labels.flatten()]
    quantized = quantized.reshape(self.ROI.shape)

    dominant_color = palette[np.argmax(stats.itemfreq(labels)[:, -1])]
    r, g, b = dominant_color[2], dominant_color[1], dominant_color[0]

    dom_col = camelToUnderscore(
        colors.ColorNames.findNearestColorName(r, g, b, colors.ColorNames.WebColorMap))
    cr, cg, cb = colors.ColorNames.complement(r, g, b)
    com_col = camelToUnderscore(
        colors.ColorNames.findNearestColorName(cr, cg, cb, colors.ColorNames.WebColorMap))

    self.colors.add(dom_col)
    self.colors.add(com_col)
def solid_coord_fulldomain(id_field_file):
    # 'id_field_file' is the file name of the full ID field in *.nc format.
    # NOTE: This input *.nc file is not the raw CT file (which has 1 -> solid; 0 -> fluid).
    # Assume the fulldomain is raw, i.e. no postprocessed reservoir layers or porous plate.
    # Assume the node type convention for LBPM-WIA simulations, which says
    #     id = 0 -> solid phase
    #     id = 1 -> non-wetting phase
    #     id = 2 -> wetting phase
    # -------------------------------------
    print "**Info: Load the image file: " + id_field_file
    domain = read_NetCDF_file_py(id_field_file, 'segmented')
    print "**Info: Start analysing the solid coordination number ......"
    domain = np.logical_not(domain)  # Now 1 -> solid nodes; 0 -> fluid nodes
    domain = domain.astype(np.int8)

    # Define the D3Q19 lattice
    cx = np.array([0, 1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0])
    cy = np.array([0, 0, 0, 1, -1, 0, 0, 1, -1, -1, 1, 0, 0, 0, 0, 1, -1, 1, -1])
    cz = np.array([0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, -1, -1, 1, 1, -1, -1, 1])
    z_axis = 2
    y_axis = 1
    x_axis = 0

    domain_temp = np.zeros_like(domain)
    for idx in np.arange(1, 19):
        domain_temp += np.roll(np.roll(np.roll(domain, cx[idx], axis=x_axis),
                                       cy[idx], axis=y_axis), cz[idx], axis=z_axis)
    # end for

    # only extract the coordination number for pore-space nodes
    # NOTE that we have 0 -> fluid nodes and 1 -> solid nodes in domain
    domain_temp = domain_temp[domain == 0]
    return stats.itemfreq(domain_temp)
def plotMixNB(X, y, obj, featureNames, whichMix):
    """Plot MixNB fit on top of X."""
    save_path = '../MSPrediction-Python/plots/' + obj + '/' + whichMix
    clf = classifiers[whichMix]
    clf, _, _ = fitAlgo(clf, X, y, opt=True, param_dict=param_dist_dict[whichMix])
    unique_y = np.unique(y)
    # norm_func = lambda x, sigma, theta: 1 if np.isnan(x) else \
    #     -0.5 * np.log(2 * np.pi * sigma) - 0.5 * ((x - theta)**2 / sigma)
    # norm_func = np.vectorize(norm_func)
    n_samples = X.shape[0]

    for j in range(X.shape[1]):
        fcol = X[:, j]
        optmodel = clf.optmodels[:, j]
        distname = clf.distnames[j]
        jfeature = featureNames[j]
        jpath = save_path + '_' + jfeature + '.pdf'
        fig = pl.figure(figsize=(8, 6), dpi=150)
        for i, y_i in enumerate(unique_y):
            fcoli = fcol[y == y_i]
            itfreq = itemfreq(fcoli)
            uniqueVars = itfreq[:, 0]
            freq = itfreq[:, 1]
            freq = freq / sum(freq)
            pred = np.exp(optmodel[i](uniqueVars))
            # print pred
            pl.plot(uniqueVars, pred, label=str(y_i) + '_model')
            pl.plot(uniqueVars, freq, label=str(y_i) + '_true')
        pl.xlabel(jfeature)
        pl.ylabel("density")
        pl.title(distname)
        pl.legend(loc='best')
        pl.tight_layout()
        # pl.show()
        fig.savefig(jpath)
def getC(self):
    """ Return C. """
    idx = itemfreq(self.idxC)
    C = self.X[:, idx[:, 0]]
    return C * np.sqrt(idx[:, 1])