def tables(docompute, dowrite, complib, verbose):
    # Filenames
    ifilename = os.path.join(OUT_DIR, "expression-inputs.h5")
    ofilename = os.path.join(OUT_DIR, "expression-outputs.h5")

    # Filters
    shuffle = True
    if complib == 'blosc':
        filters = tb.Filters(complevel=1, complib='blosc', shuffle=shuffle)
    elif complib == 'lzo':
        filters = tb.Filters(complevel=1, complib='lzo', shuffle=shuffle)
    elif complib == 'zlib':
        filters = tb.Filters(complevel=1, complib='zlib', shuffle=shuffle)
    else:
        filters = tb.Filters(complevel=0, shuffle=False)
    if verbose:
        print("Will use filters:", filters)

    if dowrite:
        f = tb.open_file(ifilename, 'w')

        # Build input arrays
        t0 = time()
        root = f.root
        a = f.create_carray(root, 'a', tb.Float32Atom(), shape,
                            filters=filters)
        b = f.create_carray(root, 'b', tb.Float32Atom(), shape,
                            filters=filters)
        if verbose:
            print("chunkshape:", a.chunkshape)
            print("chunksize:", np.prod(a.chunkshape) * a.dtype.itemsize)
        #row = np.linspace(0, 1, ncols)
        row = np.arange(0, ncols, dtype='float32')
        for i in range(nrows):
            a[i] = row * (i + 1)
            b[i] = row * (i + 1) * 2
        f.close()
        print("[tables.Expr] Time for creating inputs:",
              round(time() - t0, 3))

    if docompute:
        f = tb.open_file(ifilename, 'r')
        fr = tb.open_file(ofilename, 'w')
        a = f.root.a
        b = f.root.b
        # The output carray must be created in the (writable) output file fr,
        # not in the read-only input file f.
        r1 = fr.create_carray(fr.root, 'r1', tb.Float32Atom(), shape,
                              filters=filters)
        # The expression
        e = tb.Expr(expr)
        e.set_output(r1)
        t0 = time()
        e.eval()
        if verbose:
            print("First ten values:", r1[0, :10])
        f.close()
        fr.close()
        print("[tables.Expr] Time for computing & save:",
              round(time() - t0, 3))
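The benchmark above relies on module-level names (OUT_DIR, shape, nrows, ncols, expr) and imports that are not shown. A minimal sketch of that assumed setup, with illustrative values only:

import os
from time import time

import numpy as np
import tables as tb

OUT_DIR = "."                   # assumed output directory for the HDF5 files
nrows, ncols = 1000, 100000     # assumed array geometry
shape = (nrows, ncols)
expr = "a * b + 1"              # assumed expression combining the two inputs

# Example driver:
# tables(docompute=True, dowrite=True, complib='blosc', verbose=True)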
def compute_up(expr, data, **kwargs):
    if len(expr._children) != 1:
        raise ValueError("Only one child in Broadcast allowed")
    s = expr._scalars[0]
    cols = [s[field] for field in s.fields]
    expr_str = print_numexpr(cols, expr._scalar_expr)
    uservars = dict((c, getattr(data.cols, c)) for c in s.fields)
    e = tb.Expr(expr_str, uservars=uservars, truediv=True)
    return e.eval()
def _filter_inds(col_dict, query):
    match_string, uservars = _build_search_string(query)

    # link user vars to existing columns in memory
    for name in list(uservars.keys()):
        uservars[name] = col_dict[uservars[name]]

    # run filtering and subselect columns
    inds = tb.Expr(match_string, uservars=uservars).eval()
    return inds
def multiply_table_column_by():
    path = "/skynet3_rech1/huziy/hdf_store/quebec_0.1_crcm5-r_spinup.hdf"
    var_name = "AV"
    h = tb.open_file(path, mode="a")
    var_table = h.get_node("/", var_name)

    coef = 3 * 60 * 60  # output step
    expr = tb.Expr("c * m", uservars={"c": var_table.cols.field, "m": coef})
    column = var_table.cols.field
    expr.set_output(column)
    expr.eval()

    var_table.flush()
    h.close()
def find_a_model(z0, z1, h0, h1):
    """
    Find a SAM galaxy within the given ranges of redshift and H-band
    magnitude of the Hubble image.
    """
    fir, d, p, h = readhdf5()
    z_sam = fir.root.data.col('z')
    h_sam = d.root.data.col('wfc3f160w')
    bc = tables.Expr(
        '(z_sam>%.3f) & (z_sam<%.3f) & (h_sam>%.3f) & (h_sam<%.3f)'
        % (z0, z1, h0, h1)).eval()
    igal = np.nonzero(bc)
    try:
        rr = np.random.randint(0, len(igal[0]))
    except ValueError:
        # no galaxy falls within the requested ranges
        return -99, len(igal[0])
    return igal[0][rr], len(igal[0])
def compute_tables():
    """Compute the polynomial with tables.Expr."""
    f = tb.openFile(h5fname, "a")
    x = f.root.x  # get the x input

    # Create container for output
    atom = tb.Atom.from_dtype(dtype)
    filters = tb.Filters(complib=clib, complevel=clevel)
    r = f.createCArray(f.root, "r", atom=atom, shape=(N,), filters=filters)

    # Do the actual computation and store in output
    ex = tb.Expr(expr)   # parse the expression
    ex.setOutput(r)      # set where the result is stored;
                         # if commented out, the result stays in memory
    ex.eval()            # evaluate!

    f.close()
    print_filesize(h5fname)
    return N
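compute_tables() likewise depends on globals defined elsewhere (h5fname, N, dtype, clib, clevel, expr) and on an input array x already present in the file. A minimal sketch of that assumed setup; the polynomial and compression settings are illustrative only:

import os

import numpy as np
import tables as tb

h5fname = "poly.h5"                # assumed file; must already contain the x array
N = 10 * 1000 * 1000               # assumed number of points
dtype = np.dtype('float64')
clib, clevel = 'zlib', 0           # assumed compression library and level
expr = '0.25*x**3 + 0.75*x**2 - 1.5*x - 2'   # assumed polynomial in x

def print_filesize(fname):
    # assumed helper: report the on-disk size of the HDF5 file
    print("file size:", os.stat(fname).st_size, "bytes")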
def get_expression_data(self, expression, table_loc=None, filename=None):
    import tables
    if table_loc is None:
        table_loc = self.data_path
    target_table = self.chest.get_node(table_loc)
    uv = target_table.colinstances
    # apply any shortcuts/macros
    expression = self.remap_distance_expressions(expression)
    # evaluate the math expression
    data = tables.Expr(expression, uv).eval()
    if filename is None:
        filename = self.get_active_name()
    elif filename == "all":
        return data
    # pick out the indices for only the active image
    indices = target_table.get_where_list(
        #'(omit==False) & (filename == "%s")' % self.get_active_name())
        '(filename == "%s")' % filename)
    # access the array data for those indices
    data = data[indices]
    return data
def process_file(kind, prec, clevel, synth):
    if kind == "numpy":
        lib = "none"
    else:
        lib = kind
    if synth:
        prefix = 'synth/synth-'
    else:
        prefix = 'cellzome/cellzome-'
    iname = '%s/%s-%s%d-%s.h5' % (dirname, prefix, kind, clevel, prec)
    f = tb.open_file(iname, "r")
    a_ = f.root.maxarea
    b_ = f.root.mascotscore

    oname = '%s/%s-%s%d-%s-r.h5' % (dirname, prefix, kind, clevel, prec)
    f2 = tb.open_file(oname, "w")
    if lib == "none":
        filters = None
    else:
        filters = tb.Filters(complib=lib, complevel=clevel, shuffle=shuffle)
    if prec == "single":
        type_ = tb.Float32Atom()
    else:
        type_ = tb.Float64Atom()
    r = f2.create_carray('/', 'r', type_, a_.shape, filters=filters)

    if kind == "numpy":
        a2, b2 = a_[:], b_[:]
        t0 = time()
        r = eval(expression, {'a': a2, 'b': b2})
        print("%5.2f" % round(time() - t0, 3))
    else:
        expr = tb.Expr(expression, {'a': a_, 'b': b_})
        expr.set_output(r)
        expr.eval()

    f.close()
    f2.close()
    size = float(os.stat(iname)[6]) + float(os.stat(oname)[6])
    return size
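process_file() also reads a handful of names from its enclosing module (dirname, shuffle, expression, plus the usual imports). A minimal, assumed setup; the paths and expression are placeholders:

import os
from time import time

import tables as tb

dirname = "data"         # assumed directory holding the input/output HDF5 files
shuffle = True           # assumed shuffle-filter setting
expression = "a * b"     # assumed expression over the maxarea/mascotscore columns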
def MCLA(hdf5_file_name, cluster_runs, verbose=False, N_clusters_max=None):
    """Meta-CLustering Algorithm for a consensus function.

    Parameters
    ----------
    hdf5_file_name : file handle or string

    cluster_runs : array of shape (n_partitions, n_samples)

    verbose : bool, optional (default = False)

    N_clusters_max : int, optional (default = None)

    Returns
    -------
    A vector specifying the cluster label to which each sample has been
    assigned by the MCLA approximation algorithm for consensus clustering.

    Reference
    ---------
    A. Strehl and J. Ghosh, "Cluster Ensembles - A Knowledge Reuse Framework
    for Combining Multiple Partitions".
    In: Journal of Machine Learning Research, 3, pp. 583-617. 2002
    """

    print('\n*****')
    print('INFO: Cluster_Ensembles: MCLA: consensus clustering using MCLA.')

    if N_clusters_max == None:
        N_clusters_max = int(np.nanmax(cluster_runs)) + 1

    N_runs = cluster_runs.shape[0]
    N_samples = cluster_runs.shape[1]

    print("INFO: Cluster_Ensembles: MCLA: preparing graph for meta-clustering.")

    hypergraph_adjacency = load_hypergraph_adjacency(hdf5_file_name)
    w = hypergraph_adjacency.sum(axis=1)

    N_rows = hypergraph_adjacency.shape[0]

    print("INFO: Cluster_Ensembles: MCLA: done filling hypergraph adjacency matrix. "
          "Starting computation of Jaccard similarity matrix.")

    # Next, obtain a matrix of pairwise Jaccard similarity scores between
    # the rows of the hypergraph adjacency matrix.
    with tables.open_file(hdf5_file_name, 'r+') as fileh:
        FILTERS = get_compression_filter(4 * (N_rows ** 2))

        similarities_MCLA = fileh.create_carray(fileh.root.consensus_group,
                                                'similarities_MCLA',
                                                tables.Float32Atom(),
                                                (N_rows, N_rows),
                                                "Matrix of pairwise Jaccard "
                                                "similarity scores",
                                                filters=FILTERS)

        scale_factor = 100.0

        print("INFO: Cluster_Ensembles: MCLA: "
              "starting computation of Jaccard similarity matrix.")

        squared_MCLA = hypergraph_adjacency.dot(
            hypergraph_adjacency.transpose())

        squared_sums = hypergraph_adjacency.sum(axis=1)
        squared_sums = np.squeeze(np.asarray(squared_sums))

        chunks_size = get_chunk_size(N_rows, 7)

        for i in range(0, N_rows, chunks_size):
            n_dim = min(chunks_size, N_rows - i)

            temp = squared_MCLA[i:min(i + chunks_size, N_rows), :].todense()
            temp = np.squeeze(np.asarray(temp))

            x = squared_sums[i:min(i + chunks_size, N_rows)]
            x = x.reshape(-1, 1)
            x = np.dot(x, np.ones((1, squared_sums.size)))

            y = np.dot(np.ones((n_dim, 1)), squared_sums.reshape(1, -1))

            temp = np.divide(temp, x + y - temp)
            temp *= scale_factor

            Jaccard_matrix = np.rint(temp)
            similarities_MCLA[i:min(i + chunks_size, N_rows)] = Jaccard_matrix

            del Jaccard_matrix, temp, x, y
            gc.collect()

    # Done computing the matrix of pairwise Jaccard similarity scores.
    print("INFO: Cluster_Ensembles: MCLA: done computing the matrix of "
          "pairwise Jaccard similarity scores.")

    cluster_labels = cmetis(hdf5_file_name, N_clusters_max, w)
    cluster_labels = one_to_max(cluster_labels)
    # After 'cmetis' returns, we are done with clustering hyper-edges.
    # We are now ready to start the procedure meant to collapse meta-clusters.
    N_consensus = np.amax(cluster_labels) + 1

    fileh = tables.open_file(hdf5_file_name, 'r+')

    FILTERS = get_compression_filter(4 * N_consensus * N_samples)

    clb_cum = fileh.create_carray(
        fileh.root.consensus_group, 'clb_cum', tables.Float32Atom(),
        (N_consensus, N_samples),
        'Matrix of mean memberships, forming meta-clusters',
        filters=FILTERS)

    chunks_size = get_chunk_size(N_samples, 7)
    for i in range(0, N_consensus, chunks_size):
        x = min(chunks_size, N_consensus - i)
        matched_clusters = np.where(cluster_labels == np.reshape(
            np.arange(i, min(i + chunks_size, N_consensus)), newshape=(x, 1)))
        M = np.zeros((x, N_samples))
        for j in range(x):
            coord = np.where(matched_clusters[0] == j)[0]
            M[j] = np.asarray(
                hypergraph_adjacency[matched_clusters[1][coord], :].mean(
                    axis=0))
        clb_cum[i:min(i + chunks_size, N_consensus)] = M

    # Done with collapsing the hyper-edges into a single meta-hyper-edge,
    # for each of the (N_consensus - 1) meta-clusters.

    del hypergraph_adjacency
    gc.collect()

    # Each object will now be assigned to its most associated meta-cluster.
    chunks_size = get_chunk_size(N_consensus, 4)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        null_columns = np.where(clb_cum[:].sum(axis=0) == 0)[0]
    else:
        szumsz = np.zeros(0)
        for i in range(N_chunks):
            M = clb_cum[:, i * chunks_size:(i + 1) * chunks_size]
            szumsz = np.append(szumsz, M.sum(axis=0))
        if remainder != 0:
            M = clb_cum[:, N_chunks * chunks_size:N_samples]
            szumsz = np.append(szumsz, M.sum(axis=0))
        null_columns = np.where(szumsz == 0)[0]

    if null_columns.size != 0:
        print("INFO: Cluster_Ensembles: MCLA: {} objects with all zero associations "
              "in 'clb_cum' matrix of meta-clusters.".format(null_columns.size))
        clb_cum[:, null_columns] = np.random.rand(N_consensus,
                                                  null_columns.size)

    random_state = np.random.RandomState()

    tmp = fileh.create_carray(fileh.root.consensus_group, 'tmp',
                              tables.Float32Atom(), (N_consensus, N_samples),
                              "Temporary matrix to help with "
                              "collapsing to meta-hyper-edges",
                              filters=FILTERS)

    chunks_size = get_chunk_size(N_samples, 2)
    N_chunks, remainder = divmod(N_consensus, chunks_size)
    if N_chunks == 0:
        tmp[:] = random_state.rand(N_consensus, N_samples)
    else:
        for i in range(N_chunks):
            tmp[i * chunks_size:(i + 1) * chunks_size] = random_state.rand(
                chunks_size, N_samples)
        if remainder != 0:
            tmp[N_chunks * chunks_size:N_consensus] = random_state.rand(
                remainder, N_samples)

    expr = tables.Expr("clb_cum + (tmp / 10000)")
    expr.set_output(clb_cum)
    expr.eval()

    expr = tables.Expr("abs(tmp)")
    expr.set_output(tmp)
    expr.eval()

    chunks_size = get_chunk_size(N_consensus, 2)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        sum_diag = tmp[:].sum(axis=0)
    else:
        sum_diag = np.empty(0)
        for i in range(N_chunks):
            M = tmp[:, i * chunks_size:(i + 1) * chunks_size]
            sum_diag = np.append(sum_diag, M.sum(axis=0))
        if remainder != 0:
            M = tmp[:, N_chunks * chunks_size:N_samples]
            sum_diag = np.append(sum_diag, M.sum(axis=0))

    fileh.remove_node(fileh.root.consensus_group, "tmp")
    # The corresponding disk space will be freed after a call to 'fileh.close()'.
    inv_sum_diag = np.reciprocal(sum_diag.astype(float))

    if N_chunks == 0:
        clb_cum *= inv_sum_diag
        max_entries = np.amax(clb_cum, axis=0)
    else:
        max_entries = np.zeros(N_samples)
        for i in range(N_chunks):
            clb_cum[:, i * chunks_size:(i + 1) * chunks_size] *= \
                inv_sum_diag[i * chunks_size:(i + 1) * chunks_size]
            max_entries[i * chunks_size:(i + 1) * chunks_size] = np.amax(
                clb_cum[:, i * chunks_size:(i + 1) * chunks_size], axis=0)
        if remainder != 0:
            clb_cum[:, N_chunks * chunks_size:N_samples] *= \
                inv_sum_diag[N_chunks * chunks_size:N_samples]
            max_entries[N_chunks * chunks_size:N_samples] = np.amax(
                clb_cum[:, N_chunks * chunks_size:N_samples], axis=0)

    cluster_labels = np.zeros(N_samples, dtype=int)
    winner_probabilities = np.zeros(N_samples)

    chunks_size = get_chunk_size(N_samples, 2)
    for i in reversed(range(0, N_consensus, chunks_size)):
        ind = np.where(
            np.tile(max_entries, (min(chunks_size, N_consensus - i), 1)) ==
            clb_cum[i:min(i + chunks_size, N_consensus)])
        cluster_labels[ind[1]] = i + ind[0]
        winner_probabilities[ind[1]] = clb_cum[(ind[0] + i, ind[1])]

    # Done with competing for objects.

    cluster_labels = one_to_max(cluster_labels)

    print("INFO: Cluster_Ensembles: MCLA: delivering "
          "{} clusters.".format(np.unique(cluster_labels).size))
    print("INFO: Cluster_Ensembles: MCLA: average posterior "
          "probability is {}".format(np.mean(winner_probabilities)))
    if cluster_labels.size <= 7:
        print("INFO: Cluster_Ensembles: MCLA: "
              "the winning posterior probabilities are:")
        print(winner_probabilities)
        print("INFO: Cluster_Ensembles: MCLA: "
              "the full posterior probabilities are:")
        print(clb_cum)

    fileh.remove_node(fileh.root.consensus_group, "clb_cum")
    fileh.close()

    return cluster_labels
def CSPA(hdf5_file_name, cluster_runs, verbose=False, N_clusters_max=None):
    """Cluster-based Similarity Partitioning Algorithm for a consensus function.

    Parameters
    ----------
    hdf5_file_name : file handle or string

    cluster_runs : array of shape (n_partitions, n_samples)

    verbose : bool, optional (default = False)

    N_clusters_max : int, optional (default = None)

    Returns
    -------
    A vector specifying the cluster label to which each sample has been
    assigned by the CSPA heuristics for consensus clustering.

    Reference
    ---------
    A. Strehl and J. Ghosh, "Cluster Ensembles - A Knowledge Reuse Framework
    for Combining Multiple Partitions".
    In: Journal of Machine Learning Research, 3, pp. 583-617. 2002
    """

    print('*****')
    print("INFO: Cluster_Ensembles: CSPA: consensus clustering using CSPA.")

    if N_clusters_max == None:
        N_clusters_max = int(np.nanmax(cluster_runs)) + 1

    N_runs = cluster_runs.shape[0]
    N_samples = cluster_runs.shape[1]
    if N_samples > 20000:
        raise ValueError("\nERROR: Cluster_Ensembles: CSPA: cannot efficiently "
                         "deal with too large a number of cells.")

    hypergraph_adjacency = load_hypergraph_adjacency(hdf5_file_name)

    s = scipy.sparse.csr_matrix.dot(hypergraph_adjacency.transpose().tocsr(),
                                    hypergraph_adjacency)
    s = np.squeeze(np.asarray(s.todense()))

    del hypergraph_adjacency
    gc.collect()

    checks(np.divide(s, float(N_runs)), verbose)

    e_sum_before = s.sum()
    sum_after = 100000000.0
    scale_factor = sum_after / float(e_sum_before)

    with tables.open_file(hdf5_file_name, 'r+') as fileh:
        atom = tables.Float32Atom()
        FILTERS = get_compression_filter(4 * (N_samples ** 2))

        S = fileh.create_carray(fileh.root.consensus_group,
                                'similarities_CSPA', atom,
                                (N_samples, N_samples),
                                "Matrix of similarities arising "
                                "in Cluster-based Similarity Partitioning",
                                filters=FILTERS)

        expr = tables.Expr("s * scale_factor")
        expr.set_output(S)
        expr.eval()

        chunks_size = get_chunk_size(N_samples, 3)
        for i in range(0, N_samples, chunks_size):
            tmp = S[i:min(i + chunks_size, N_samples)]
            S[i:min(i + chunks_size, N_samples)] = np.rint(tmp)

    return metis(hdf5_file_name, N_clusters_max)
# Create 1000-column row-extendable array (EArray)
n = 1000
ear = h5.createEArray(h5.root, 'ear', atom=tb.Float64Atom(), shape=(0, n))

# populate chunkwise
time1 = time.time()
rand = np.random.standard_normal((n, n))
for i in range(750):
    ear.append(rand)
ear.flush()
print(time.time() - time1)   # 135.915999889 seconds

# Check size logically and physically (interactive inspection)
ear
ear.size_on_disk

# Create target output array
out = h5.createEArray(h5.root, 'out', atom=tb.Float64Atom(), shape=(0, n))

# Create expression, i.e. y = 3 * sin(x) + sqrt(abs(x))
expr = tb.Expr('3*sin(ear)+sqrt(abs(ear))')
expr.setOutput(out, append_mode=True)

time1 = time.time()
expr.eval()
print(time.time() - time1)   # 187.100000143 seconds

h5.close()
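The EArray example above assumes an already open, writable HDF5 file handle h5 and the usual imports; a minimal assumed preamble (filename is illustrative, and the old camelCase API is kept to match the snippet):

import time

import numpy as np
import tables as tb

h5 = tb.openFile('array.h5', 'w')   # assumed file handle used by the snippet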
def MCLA(cluster_runs, verbose=False, N_clusters_max=None):
    cluster_ensemble = []
    score = np.empty(0)

    if N_clusters_max == None:
        N_clusters_max = int(np.nanmax(cluster_runs)) + 1

    N_runs = cluster_runs.shape[0]
    N_samples = cluster_runs.shape[1]

    # Cluster_Ensembles: MCLA: preparing graph for meta-clustering.
    hypergraph_adjacency = build_hypergraph_adjacency(cluster_runs)
    w = hypergraph_adjacency.sum(axis=1)

    N_rows = hypergraph_adjacency.shape[0]

    # Next, obtain a matrix of pairwise Jaccard similarity scores between
    # the rows of the hypergraph adjacency matrix.
    scale_factor = 100.0

    # Starting computation of Jaccard similarity matrix.
    squared_MCLA = hypergraph_adjacency.dot(hypergraph_adjacency.transpose())

    squared_sums = hypergraph_adjacency.sum(axis=1)
    squared_sums = np.squeeze(np.asarray(squared_sums))

    chunks_size = get_chunk_size(N_rows, 7)
    for i in range(0, N_rows, chunks_size):
        n_dim = min(chunks_size, N_rows - i)

        temp = squared_MCLA[i:min(i + chunks_size, N_rows), :].todense()
        temp = np.squeeze(np.asarray(temp))

        x = squared_sums[i:min(i + chunks_size, N_rows)]
        x = x.reshape(-1, 1)
        x = np.dot(x, np.ones((1, squared_sums.size)))

        y = np.dot(np.ones((n_dim, 1)), squared_sums.reshape(1, -1))

        temp = np.divide(temp, x + y - temp)
        temp *= scale_factor

        Jaccard_matrix = np.rint(temp)
        # print(Jaccard_matrix)

        # del Jaccard_matrix, temp, x, y
        del temp, x, y
        gc.collect()
    # Done computing the matrix of pairwise Jaccard similarity scores.

    ####################################################################################################

    e_mat = Jaccard_matrix
    # print(e_mat[0])
    # print(N_rows)
    N_cols = e_mat.shape[1]

    w *= scale_factor
    w = np.rint(w)

    vwgt = []
    for sublist in w.tolist():
        for item in sublist:
            vwgt.append(int(item))
    # print(vwgt)

    diag_ind = np.diag_indices(N_rows)
    e_mat[diag_ind] = 0

    adjncy = []
    adjwgt = []
    xadj = []
    xadjind = 0
    xadj.append(0)  # first element always starts with 0

    chunks_size = get_chunk_size(N_cols, 7)
    for i in range(0, N_rows, chunks_size):
        M = e_mat[i:min(i + chunks_size, N_rows)]
        for j in range(M.shape[0]):
            edges = np.where(M[j] > 0)[0]
            weights = M[j, edges]
            xadjind += edges.size
            xadj.append(xadjind)
            adjncy.extend(edges)
            adjwgt.extend(weights)

    adjwgt = list(map(int, adjwgt))

    # max_w = np.max(vwgt)
    # min_w = np.min(vwgt)
    # vwgt_norm = (vwgt-min_w)/(max_w-min_w)
    # print("vwgt : ", vwgt)
    # print("vwgt_norm : ", vwgt_norm+1)
    # print("adjwgt : ", adjwgt)

    # N_rows = 12
    # N_clusters_max = 10
    # xadj = [0, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, ]
    # adjncy = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ]
    # vwgt = [90300, 200, 11600, 11400, 9600, 11600, 7200, 8600, 9700, 5800, 7100, 7900, ]
    # adjwgt = [13, 13, 11, 13, 8, 10, 11, 6, 8, 9, 2, 13, 13, 11, 13, 2, 8, 10, 11, 6, 8, 9, ]

    # print(cluster_runs)
    # print("xadj: ", xadj)
    # print("adjncy : ", adjncy)
    # print("vwgt : ", vwgt)
    # print("adjwgt : ", adjwgt)
    # print("\n")

    xadj = (idx_t * len(xadj))(*xadj)
    adjncy = (idx_t * len(adjncy))(*adjncy)
    adjwgt = (idx_t * len(adjwgt))(*adjwgt)
    vwgt = (idx_t * len(vwgt))(*vwgt)

    ncon = idx_t(1)

    G = METIS_Graph(idx_t(N_rows), ncon, xadj, adjncy, vwgt, None, adjwgt)
    # print(G)

    (edgecuts, parts) = metis.part_graph(G, N_clusters_max)
    cluster_labels = parts
    # print("parts")
    # print(parts)

    ##########################################################################################################

    cluster_labels = one_to_max(cluster_labels)
    # print(cluster_labels)
    # After 'metis' returns, we are done with clustering hyper-edges.
    # We are now ready to start the procedure meant to collapse meta-clusters.
    N_consensus = np.amax(cluster_labels) + 1
    clb_cum = np.zeros(shape=(N_consensus, N_samples))

    chunks_size = get_chunk_size(N_samples, 7)
    for i in range(0, N_consensus, chunks_size):
        x = min(chunks_size, N_consensus - i)
        matched_clusters = np.where(cluster_labels == np.reshape(
            np.arange(i, min(i + chunks_size, N_consensus)), newshape=(x, 1)))
        M = np.zeros((x, N_samples))
        for j in range(x):
            coord = np.where(matched_clusters[0] == j)[0]
            M[j] = np.asarray(
                hypergraph_adjacency[matched_clusters[1][coord], :].mean(
                    axis=0))
        clb_cum[i:min(i + chunks_size, N_consensus)] = M

    # Done with collapsing the hyper-edges into a single meta-hyper-edge,
    # for each of the (N_consensus - 1) meta-clusters.

    del hypergraph_adjacency
    gc.collect()
    # print(clb_cum[10])

    # Each object will now be assigned to its most associated meta-cluster.
    chunks_size = get_chunk_size(N_consensus, 4)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        null_columns = np.where(clb_cum[:].sum(axis=0) == 0)[0]
    else:
        szumsz = np.zeros(0)
        for i in range(N_chunks):
            M = clb_cum[:, i * chunks_size:(i + 1) * chunks_size]
            szumsz = np.append(szumsz, M.sum(axis=0))
        if remainder != 0:
            M = clb_cum[:, N_chunks * chunks_size:N_samples]
            szumsz = np.append(szumsz, M.sum(axis=0))
        null_columns = np.where(szumsz == 0)[0]

    if null_columns.size != 0:
        # print("INFO: Cluster_Ensembles: MCLA: {} objects with all zero associations "
        #       "in 'clb_cum' matrix of meta-clusters.".format(null_columns.size))
        clb_cum[:, null_columns] = np.random.rand(N_consensus,
                                                  null_columns.size)

    random_state = np.random.RandomState()

    tmp = np.zeros(shape=(N_consensus, N_samples))
    chunks_size = get_chunk_size(N_samples, 2)
    N_chunks, remainder = divmod(N_consensus, chunks_size)
    if N_chunks == 0:
        tmp[:] = random_state.rand(N_consensus, N_samples)
    else:
        for i in range(N_chunks):
            tmp[i * chunks_size:(i + 1) * chunks_size] = random_state.rand(
                chunks_size, N_samples)
        if remainder != 0:
            tmp[N_chunks * chunks_size:N_consensus] = random_state.rand(
                remainder, N_samples)

    expr = tables.Expr("clb_cum + (tmp / 10000)")
    expr.set_output(clb_cum)
    expr.eval()

    expr = tables.Expr("abs(tmp)")
    expr.set_output(tmp)
    expr.eval()

    chunks_size = get_chunk_size(N_consensus, 2)
    N_chunks, remainder = divmod(N_samples, chunks_size)
    if N_chunks == 0:
        sum_diag = tmp[:].sum(axis=0)
    else:
        sum_diag = np.empty(0)
        for i in range(N_chunks):
            M = tmp[:, i * chunks_size:(i + 1) * chunks_size]
            sum_diag = np.append(sum_diag, M.sum(axis=0))
        if remainder != 0:
            M = tmp[:, N_chunks * chunks_size:N_samples]
            sum_diag = np.append(sum_diag, M.sum(axis=0))

    inv_sum_diag = np.reciprocal(sum_diag.astype(float))

    if N_chunks == 0:
        clb_cum *= inv_sum_diag
        max_entries = np.amax(clb_cum, axis=0)
    else:
        max_entries = np.zeros(N_samples)
        for i in range(N_chunks):
            clb_cum[:, i * chunks_size:(i + 1) * chunks_size] *= \
                inv_sum_diag[i * chunks_size:(i + 1) * chunks_size]
            max_entries[i * chunks_size:(i + 1) * chunks_size] = np.amax(
                clb_cum[:, i * chunks_size:(i + 1) * chunks_size], axis=0)
        if remainder != 0:
            clb_cum[:, N_chunks * chunks_size:N_samples] *= \
                inv_sum_diag[N_chunks * chunks_size:N_samples]
            max_entries[N_chunks * chunks_size:N_samples] = np.amax(
                clb_cum[:, N_chunks * chunks_size:N_samples], axis=0)

    cluster_labels = np.zeros(N_samples, dtype=int)
    winner_probabilities = np.zeros(N_samples)

    chunks_size = get_chunk_size(N_samples, 2)
    for i in reversed(range(0, N_consensus, chunks_size)):
        ind = np.where(
            np.tile(max_entries, (min(chunks_size, N_consensus - i), 1)) ==
            clb_cum[i:min(i + chunks_size, N_consensus)])
        cluster_labels[ind[1]] = i + ind[0]
        winner_probabilities[ind[1]] = clb_cum[(ind[0] + i, ind[1])]

    # Done with competing for objects.

    cluster_labels = one_to_max(cluster_labels)

    return cluster_labels
def pandas_io():
    data = np.random.standard_normal((1000, 5)).round(5)  # sample data set
    filename = path + 'numbs'
    # query = 'CREATE TABLE numbers (No1 real, No2 real,\
    #          No3 real, No4 real, No5 real)'
    con = sq3.Connection(filename + '.db')
    # Don't want to do these every run
    # con.execute(query)
    # con.executemany('INSERT INTO numbers VALUES (?, ?, ?, ?, ?)', data)
    # con.commit()
    temp = con.execute('SELECT * FROM numbers').fetchall()
    print(temp[:2])

    query = 'SELECT * FROM numbers WHERE No1 > 0 AND No2 < 0'
    res = np.array(con.execute(query).fetchall()).round(3)
    plt.plot(res[:, 0], res[:, 1], 'ro')
    plt.grid(True)
    plt.xlim(-0.5, 4.5)
    plt.ylim(-4.5, 0.5)
    plt.savefig(PNG_PATH + 'query.png', dpi=300)
    plt.close()

    data = pds.read_sql('SELECT * FROM numbers', con)
    print(data.head())
    print(data[(data['No1'] > 0) & (data['No2'] < 0)].head())
    res = data[['No1', 'No2']][((data['No1'] > 0.5) | (data['No1'] < -0.5))
                               & ((data['No2'] < -1) | (data['No2'] > 1))]
    plt.plot(res.No1, res.No2, 'ro')
    plt.grid(True)
    plt.axis('tight')
    plt.savefig(PNG_PATH + 'x_scatter.png', dpi=300)
    plt.close()

    h5s = pd.HDFStore(filename + '.h5s', 'w')
    h5s['data'] = data
    print(h5s)
    h5s.close()
    h5s = pd.HDFStore(filename + '.h5s', 'r')
    temp = h5s['data']
    h5s.close()
    np.allclose(np.array(temp), np.array(data))

    data.to_csv(filename + '.csv')
    # Note: matplotlib works directly on pandas or NumPy objects;
    # just call plot() and then savefig()
    a = pd.read_csv(filename + '.csv')[['No1', 'No2', 'No3', 'No4']].hist(bins=20)
    plt.plot()
    plt.savefig(PNG_PATH + 'hist.png', dpi=300)
    plt.close()

    data[:1000].to_excel(filename + '.xlsx')
    pd.read_excel(filename + '.xlsx', 'Sheet1').cumsum().plot()
    plt.plot()
    plt.savefig(PNG_PATH + 'excel.png', dpi=300)
    plt.close()

    filename = path + 'tab.h5'
    h5 = tb.open_file(filename, 'w')
    rows = 1000
    row_des = {
        'Date': tb.StringCol(26, pos=1),
        'No1': tb.IntCol(pos=2),
        'No2': tb.IntCol(pos=3),
        'No3': tb.Float64Col(pos=4),
        'No4': tb.Float64Col(pos=5)
    }
    filters = tb.Filters(complevel=0)  # no compression
    tab = h5.create_table('/', 'ints_floats', row_des,
                          title='Integers and Floats',
                          expectedrows=rows, filters=filters)
    print(tab)
    pointer = tab.row
    ran_int = np.random.randint(0, 10000, size=(rows, 2))
    ran_flo = np.random.standard_normal((rows, 2)).round(5)
    for i in range(rows):
        pointer['Date'] = dt.datetime.now()
        pointer['No1'] = ran_int[i, 0]
        pointer['No2'] = ran_int[i, 1]
        pointer['No3'] = ran_flo[i, 0]
        pointer['No4'] = ran_flo[i, 1]
        pointer.append()
        # this appends the data and moves the pointer one row forward
    tab.flush()  # flush = commit in sqlite
    print(tab)

    dty = np.dtype([('Date', 'S26'), ('No1', '<i4'), ('No2', '<i4'),
                    ('No3', '<f8'), ('No4', '<f8')])
    sarray = np.zeros(len(ran_int), dtype=dty)
    sarray['Date'] = dt.datetime.now()
    sarray['No1'] = ran_int[:, 0]
    sarray['No2'] = ran_int[:, 1]
    sarray['No3'] = ran_flo[:, 0]
    sarray['No4'] = ran_flo[:, 1]
    h5.create_table('/', 'ints_floats_from_array', sarray,
                    title='Integers and Floats',
                    expectedrows=rows, filters=filters)
    print(h5)
    h5.remove_node('/', 'ints_floats_from_array')
    print(tab[:3])
    print(tab[:4]['No4'])
    print(np.sum(tab[:]['No3']))
    print(np.sum(np.sqrt(tab[:]['No1'])))
    plt.hist(tab[:]['No3'], bins=30)
    plt.grid(True)
    print(len(tab[:]['No3']))
    plt.plot()
    plt.savefig(PNG_PATH + 'h5.png', dpi=300)
    plt.close()

    res = np.array([(row['No3'], row['No4']) for row in
                    tab.where('((No3 < -0.05) | (No3 > 0.05)) '
                              '& ((No4 < -0.1) | (No4 > 0.1))')])[::100]
    plt.plot(res.T[0], res.T[1], 'ro')
    plt.grid(True)
    plt.savefig(PNG_PATH + 'h5_x.png', dpi=300)
    plt.close()

    values = tab.cols.No3[:]
    print("Max %18.3f" % values.max())
    print("Ave %18.3f" % values.mean())
    print("Min %18.3f" % values.min())
    print("Std %18.3f" % values.std())

    results = [(row['No1'], row['No2']) for row in
               tab.where('((No1 > 9800) | (No1 < 200)) '
                         '& ((No2 > 4500) & (No2 < 5500))')]
    for res in results[:4]:
        print(res)
    results = [(row['No1'], row['No2']) for row in
               tab.where('(No1 == 1234) & (No2 > 9776)')]
    for res in results:
        print(res)

    filename = path + 'tab.h5c'
    h5c = tb.open_file(filename, 'w')
    filters = tb.Filters(complevel=4, complib='blosc')
    tabc = h5c.create_table('/', 'ints_floats', sarray,
                            title='Integers and Floats',
                            expectedrows=rows, filters=filters)
    res = np.array([(row['No3'], row['No4']) for row in
                    tabc.where('((No3 < -0.5) | (No3 > 0.5)) '
                               '& ((No4 < -1) | (No4 > 1))')])[::100]
    arr_non = tab.read()
    arr_com = tabc.read()
    h5c.close()

    arr_int = h5.create_array('/', 'integers', ran_int)
    arr_flo = h5.create_array('/', 'floats', ran_flo)
    print(h5)
    h5.close()

    filename = path + 'array.h5'
    h5 = tb.open_file(filename, 'w')
    n = 100
    ear = h5.create_earray(h5.root, 'ear', atom=tb.Float64Atom(), shape=(0, n))
    rand = np.random.standard_normal((n, n))
    for i in range(750):
        ear.append(rand)
    ear.flush()
    print(ear)
    print(ear.size_on_disk)

    out = h5.create_earray(h5.root, 'out', atom=tb.Float64Atom(), shape=(0, n))
    expr = tb.Expr('3 * sin(ear) + sqrt(abs(ear))')
    expr.set_output(out, append_mode=True)
    print(expr.eval())

    imarray = ear.read()
    import numexpr as ne
    expr = '3 * sin(imarray) + sqrt(abs(imarray))'
    ne.set_num_threads(16)
    print(ne.evaluate(expr)[0, :10])
    h5.close()
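pandas_io() presumes the imports and path constants sketched below; a minimal assumed preamble (directory names are placeholders):

import datetime as dt
import sqlite3 as sq3

import numpy as np
import pandas as pd
import pandas as pds          # the snippet uses both the pd and pds aliases
import tables as tb
import matplotlib.pyplot as plt

path = './data/'              # assumed working directory for the generated files
PNG_PATH = './png/'           # assumed output directory for the saved figures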
def compute_up(c, t, **kwargs):
    uservars = dict((col, getattr(t.cols, col)) for col in c.active_columns())
    e = tb.Expr(str(c._expr), uservars=uservars, truediv=True)
    return e.eval()