def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
                 gamma, init, n_init, verbose, random_state, n_jobs):
    """k-prototypes algorithm"""
    random_state = check_random_state(random_state)
    if sparse.issparse(X):
        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical:
        raise NotImplementedError(
            "No categorical data selected, effectively doing k-means. "
            "Present a list of categorical columns, or use scikit-learn's "
            "KMeans instead.")
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."
    assert max(categorical) < X.shape[1], \
        "Categorical index larger than number of columns."

    ncatattrs = len(categorical)
    nnumattrs = X.shape[1] - ncatattrs
    n_points = X.shape[0]
    assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
                                   "than data points ({}).".format(n_clusters, n_points)

    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)

    # Convert the categorical values in Xcat to integers for speed.
    # Based on the unique values in Xcat, we can make a mapping to achieve this.
    Xcat, enc_map = encode_features(Xcat)

    # Are there more n_clusters than unique rows? Then set the unique
    # rows as initial values and skip iteration.
    unique = get_unique_rows(X)
    n_unique = unique.shape[0]
    if n_unique <= n_clusters:
        max_iter = 0
        n_init = 1
        n_clusters = n_unique
        init = list(_split_num_cat(unique, categorical))
        init[1], _ = encode_features(init[1], enc_map)

    # Estimate a good value for gamma, which determines the weighing of
    # categorical values in clusters (see Huang [1997]).
    if gamma is None:
        # .std() here is the standard deviation over the whole numeric matrix.
        gamma = 0.5 * Xnum.std()

    results = []
    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
    if n_jobs == 1:
        for init_no in range(n_init):
            results.append(k_prototypes_single(
                Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
                max_iter, num_dissim, cat_dissim, gamma, init, init_no,
                verbose, seeds[init_no]))
    else:
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(k_prototypes_single)(
                Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points,
                max_iter, num_dissim, cat_dissim, gamma, init, init_no,
                verbose, seed)
            for init_no, seed in enumerate(seeds))

    all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results)

    best = np.argmin(all_costs)
    if n_init > 1 and verbose:
        print("Best run was number {}".format(best + 1))

    # Note: return gamma in case it was automatically determined.
    return all_centroids[best], enc_map, all_labels[best], all_costs[best], \
        all_n_iters[best], all_epoch_costs[best], gamma
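# Added usage sketch (not part of the original module). It assumes the module
# provides the helpers referenced in k_prototypes (check_random_state,
# encode_features, k_prototypes_single, ...) as in the kmodes package this
# driver is modelled on. The two dissimilarity functions below are simple
# illustrative choices, not the library's own implementations.
import numpy as np


def euclidean_dissim(centroids, xnum, **_):
    # Squared Euclidean distance between each centroid and one numeric row.
    return np.sum((centroids - xnum) ** 2, axis=1)


def matching_dissim(centroids, xcat, **_):
    # Simple matching dissimilarity: number of mismatching categorical attributes.
    return np.sum(centroids != xcat, axis=1)


if __name__ == "__main__":
    # Toy data: two numeric columns followed by one categorical column (index 2).
    X_demo = np.array([
        [1.0, 2.0, "a"],
        [1.2, 1.8, "a"],
        [8.0, 9.0, "b"],
        [8.5, 8.7, "b"],
    ], dtype=object)
    centroids, enc_map, labels, cost, n_iter, epoch_costs, gamma = k_prototypes(
        X_demo, categorical=[2], n_clusters=2, max_iter=100,
        num_dissim=euclidean_dissim, cat_dissim=matching_dissim,
        gamma=None, init="Huang", n_init=2, verbose=0,
        random_state=42, n_jobs=1)
    print(labels, cost, gamma)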
    stream3c[1].stats['asdf'] = a2
    stream3c[2].stats['asdf'] = a3
    stream3c.trim2(-25, 75, 'onset')
    return stream3c


print("Lets start the show...")
data = read_rf('DATA/7X-event_waveforms_for_rf.h5', 'H5')
print("Data in...")

'''
# we can exclude bad stations
inc_set = list(set([tr.stats.inclination for tr in data]))
data_filtered = RFStream([tr for tr in data if tr.stats.inclination in inc_set
                          and tr.stats.station not in ['MIJ2', 'MIL2']])
'''

stream = RFStream()

rf_streams = Parallel(n_jobs=-1, verbose=1)(
    map(delayed(do_rf), IterMultipleComponents(data, 'onset', 3)))

for i, rf in enumerate(rf_streams):
    event_id = {'event_id': i}
    for tr in rf:
        tr.stats.update(event_id)
    stream.extend(rf)

stream.write('DATA/7X-rf_zrt', 'H5')
print("No worries, mate...")
def make_dataset_scada(self): X = dict() for project in self.projects: X[project['_id']] = pd.DataFrame() if self.isfortest: file_nwp = 'weather_data_test.csv' else: file_nwp = 'weather_data.csv' if not os.path.exists( os.path.join(self.projects[0]['static_data']['path_data'], file_nwp)): lats, longs = self.lats_longs() nwp = self.stack_daily_nwps(self.dates[-1], self.data, lats, longs, self.path_nwp, self.nwp_model, self.projects, self.variables, self.compress) nwp_daily = Parallel(n_jobs=self.njobs)( delayed(self.stack_daily_nwps)( t, self.data, lats, longs, self.path_nwp, self.nwp_model, self.projects, self.variables, self.compress) for t in self.dates) for nwp in nwp_daily: for project in self.projects: if nwp[0][project['_id']].shape[0] != 0: X[project['_id']] = pd.concat( [X[project['_id']], nwp[0][project['_id']]]) self.logger.info('All Inputs stacked for date %s', nwp[1]) for project in self.projects: X[project['_id']].to_csv( os.path.join(project['static_data']['path_data'], file_nwp)) else: for project in self.projects: X[project['_id']] = pd.read_csv(os.path.join( project['static_data']['path_data'], file_nwp), header=0, index_col=0, parse_dates=True, dayfirst=True) for project in self.projects: data_path = project['static_data']['path_data'] if self.isfortest: dataset_X, dataset_y, X_3d = self.create_dataset( X[project['_id']], data_path, start_index=9001, test=self.isfortest) if dataset_y.isna().any().values[0]: dataset_X = dataset_X.drop(dataset_y.index[np.where( dataset_y.isna())[0]]) if len(X_3d.shape) > 1: X_3d = np.delete(X_3d, np.where(dataset_y.isna())[0], axis=0) dataset_y = dataset_y.drop(dataset_y.index[np.where( dataset_y.isna())[0]]) dataset_X.to_csv( os.path.join(project['static_data']['path_data'], 'dataset_X_test.csv')) dataset_y.to_csv( os.path.join(project['static_data']['path_data'], 'dataset_y_test.csv')) joblib.dump( X_3d, os.path.join(project['static_data']['path_data'], 'dataset_lstm_test.pickle')) self.logger.info('Datasets saved for project %s', project['_id']) else: dataset_X, dataset_y, X_3d = self.create_dataset( X[project['_id']], data_path, start_index=9001, test=self.isfortest) if dataset_y.isna().any().values[0]: dataset_X = dataset_X.drop(dataset_y.index[np.where( dataset_y.isna())[0]]) if len(X_3d.shape) > 1: X_3d = np.delete(X_3d, np.where(dataset_y.isna())[0], axis=0) dataset_y = dataset_y.drop(dataset_y.index[np.where( dataset_y.isna())[0]]) dataset_X.to_csv( os.path.join(project['static_data']['path_data'], 'dataset_X.csv')) dataset_y.to_csv( os.path.join(project['static_data']['path_data'], 'dataset_y.csv')) joblib.dump( X_3d, os.path.join(project['static_data']['path_data'], 'dataset_lstm.pickle')) self.logger.info('Datasets saved for project %s', project['_id'])
        opfname = f.replace('.json', '.WL' + str(h))
    else:
        opfname = f.replace('.gexf', '.WL' + str(h))

    subgraph2vec_sentences = get_graph_as_bow(g, h)
    with open(opfname, 'w') as fh:
        for w in subgraph2vec_sentences:
            print(w, file=fh)

    logging.debug('dumped wlk file in {} sec'.format(round(time() - T0, 2)))


if __name__ == '__main__':
    # if sys.argv[1] in ['-h', '--help']:
    #     print('command line args: <gexf/json graph_dir> <height of WL kernel> <num of cpu cores for multi-processing>')
    #     exit(0)

    graph_dir = "/home/annamalai/OLMD/OLMD/MKLDroid/tmp/amd_dataset_graphs_wlfiles/adgs"  # folder containing the graphs' gexf/json files
    h = 2  # height of the WL kernel (i.e., degree of neighbourhood to consider)
    n_cpus = 36  # number of CPUs to be used for multiprocessing
    extn = '.gexf'

    files_to_process = get_files(dirname=graph_dir, extn=extn)
    print(files_to_process)
    input('have to process a total of {} files with {} parallel processes... hit any key to proceed...'
          .format(len(files_to_process), n_cpus))

    Parallel(n_jobs=n_cpus)(delayed(dump_subgraph2vec_sentences)(f, h)
                            for f in files_to_process)
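# Added sketch (the original get_graph_as_bow is not shown in this snippet):
# one plausible shape for it, a minimal Weisfeiler-Lehman relabelling that
# emits one "sentence" per node, listing the node's label at every WL
# iteration up to height h. It assumes a networkx graph whose nodes carry a
# 'Label' attribute; the real helper may differ.
import networkx as nx


def get_graph_as_bow_sketch(g, h):
    labels = {n: str(g.nodes[n].get('Label', n)) for n in g.nodes()}
    per_node = {n: [labels[n]] for n in g.nodes()}
    for _ in range(h):
        new_labels = {}
        for n in g.nodes():
            # Compress the sorted multiset of neighbour labels into the new label.
            neigh = sorted(labels[m] for m in g.neighbors(n))
            new_labels[n] = labels[n] + '_' + '_'.join(neigh)
        labels = new_labels
        for n in g.nodes():
            per_node[n].append(labels[n])
    return [' '.join(parts) for parts in per_node.values()]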
def parallelNelderMead( objFunc, guess, perturb=None, P=1, ftol=0.000001, xtol=0.00000001, maxiter=np.inf, maxeval=np.inf, r_param=1.0, e_param=1.0, c_param=0.5, s_param=0.5, maxthreads=None, name=None, resume=False, savefreq=None, verbose=1, ): """ A parallel implementation of the Nelder-Mead minimization algorithm, as described in Lee and Wiswall. For long optimization procedures, it can save progress between iterations and resume later. Parameters ---------- objFunc : function The objective function to be minimized. Takes a single 1D array as input. guess : np.array Initial starting point for the simplex, representing an input for objFunc. perturb : np.array Perturbation vector for the simplex, of the same length as an input to objFunc. If perturb[j] is non-zero, a simplex point will be created that perturbs the j-th element of guess by perturb[j]; if it is zero, then the j-th parameter of objFunc will not be optimized over. By default, perturb=None, indicating that all parameters should be optimized, with an initial perturbation of 0.1*guess. P : int Degree of parallelization: the number of vertices of the simplex to try to update on each iteration of the process. ftol : float Absolute tolerance of the objective function for convergence. If suc- cessive iterations return minimum function values that differ by less than ftol, the process terminates successfully. xtol : float Absolute tolerance of the input values for convergence. If the maximum distance between the current minimum point and the worst point in the simplex is less than xtol, then the process terminates successfully. maxiter : int Maximum number of Nelder-Mead iterations; reaching iters=maxiter is reported as an "unsuccessful" minimization. maxeval : int Maximum number of evaluations of objFunc (across all processes); reaching evals=maxeval is reported as an "unsuccessful" minimization. r_param: float Parameter indicating magnitude of the reflection point calculation. e_param: float Parameter indicating magnitude of the expansion point calculation. c_param: float Parameter indicating magnitude of the contraction point calculation. s_param: float Parameter indicating magnitude of the shrink calculation. maxthreads : int The maximum number of CPU cores that the optimization should use, regardless of the size of the problem. name : string A filename for (optionally) saving the progress of the Nelder-Mead search, and for resuming a previous search (when resume=True). Useful for long searches that could potentially be interrupted by computer down time. resume : boolean An indicator for whether the search should resume from earlier progress. When True, the process will load a progress file named in input name. savefreq : int When not None, search progress will be saved to name.txt every savefreq iterations, to be loaded later with resume=True). verbose : int Indicator for the verbosity of the optimization routine. Higher values generate more text output; verbose=0 produces no text output. Returns ------- min_point : np.array The input that minimizes objFunc, as found by the minimization. fmin : float The minimum of objFunc; fmin = objFunc(min_point). 
""" # If this is a resumed search, load the data if resume: simplex, fvals, iters, evals = loadNelderMeadData(name) dim_count = fvals.size - 1 N = dim_count + 1 # Number of points in simplex K = simplex.shape[1] # Total number of parameters # Otherwise, construct the initial simplex and array of function values else: if perturb is None: # Default: perturb each parameter by 10% perturb = 0.1 * guess guess[guess == 0] = 0.1 params_to_opt = np.where( perturb != 0)[0] # Indices of which parameters to optimize dim_count = params_to_opt.size # Number of parameters to search over N = dim_count + 1 # Number of points in simplex K = guess.size # Total number of parameters simplex = np.tile(guess, (N, 1)) for j in range( dim_count ): # Perturb each parameter to optimize by the specified distance simplex[j + 1, params_to_opt[j]] = (simplex[j + 1, params_to_opt[j]] + perturb[params_to_opt[j]]) # Initialize iteration and evaluation counts, plus a 1D array of function values fvals = np.zeros(dim_count + 1) + np.nan iters = 0 evals = 0 # Make sure degree of parallelization is not illegal if P > N - 1: print("Requested degree of simplex parallelization is " + str(P) + ", but dimension of optimization problem is only " + str(N - 1) + ".") print("Degree of parallelization has been reduced to " + str(N - 1) + ".") P = N - 1 # Create the pool of worker processes cpu_cores = multiprocessing.cpu_count( ) # Total number of available CPU cores cores_to_use = min(cpu_cores, dim_count) if maxthreads is not None: # Cap the number of cores if desired cores_to_use = min(cores_to_use, maxthreads) parallel = Parallel(n_jobs=cores_to_use) # Begin a new Nelder-Mead search if not resume: temp_simplex = list(simplex) # Evaluate the initial simplex fvals = np.array( parallel(delayed(objFunc)(params) for params in temp_simplex)) evals += N # Reorder the initial simplex order = np.argsort(fvals) fvals = fvals[order] simplex = simplex[order, :] fmin = fvals[0] f_dist = np.abs(fmin - fvals[-1]) x_dist = np.max( np.sqrt( np.sum((simplex - np.tile(simplex[0, :], (N, 1)))**2.0, axis=1))) if verbose > 0: print("Evaluated the initial simplex: fmin=" + str(fmin) + ", f_dist=" + str(f_dist) + ", x_dist=" + str(x_dist)) if savefreq is not None: saveNelderMeadData(name, simplex, fvals, iters, evals) if verbose > 0: print("Saved search progress in " + name + ".txt") else: # Resume an existing search that was cut short if verbose > 0: print("Resuming search after " + str(iters) + " iterations and " + str(evals) + " function evaluations.") # Initialize some inputs for the multithreader j_list = range(N - P, N) opt_params = [r_param, c_param, e_param] # Run the Nelder-Mead algorithm until a terminal condition is met go = True while go: t_start = time() iters += 1 if verbose > 0: print("Beginning iteration #" + str(iters) + " now.") # Update the P worst points of the simplex output = parallel( delayed(parallelNelderMeadWorker)(objFunc, simplex, fvals, j, P, opt_params) for j in j_list) new_subsimplex = np.zeros((P, K)) + np.nan new_vals = np.zeros(P) + np.nan new_evals = 0 for i in range(P): new_subsimplex[i, :] = output[i][0] new_vals[i] = output[i][1] new_evals += output[i][2] evals += new_evals # Check whether any updates actually happened old_subsimplex = simplex[(N - P):N, :] if np.max(np.abs(new_subsimplex - old_subsimplex)) == 0: if verbose > 0: print("Updated the simplex, but must perform a shrink step.") # If every attempted update was unsuccessful, must shrink the simplex simplex = (s_param * np.tile(simplex[0, :], (N, 1)) + (1.0 - 
s_param) * simplex) temp_simplex = list(simplex[1:N, :]) fvals = np.array([fvals[0]] + parallel( delayed(objFunc)(params) for params in temp_simplex)) new_evals += N - 1 evals += N - 1 else: if verbose > 0: print("Updated the simplex successfully.") # Otherwise, update the simplex with the new results simplex[(N - P):N, :] = new_subsimplex fvals[(N - P):N] = new_vals # Reorder the simplex from best to worst order = np.argsort(fvals) fvals = fvals[order] simplex = simplex[order, :] fmin = fvals[0] f_dist = np.abs(fmin - fvals[-1]) x_dist = np.max( np.sqrt( np.sum((simplex - np.tile(simplex[0, :], (N, 1)))**2.0, axis=1))) t_end = time() if verbose > 0: t_iter = t_end - t_start print("Finished iteration #" + str(iters) + " with " + str(new_evals) + " evaluations (" + str(evals) + " cumulative) in " + str(t_iter) + " seconds.") print("Simplex status: fmin=" + str(fmin) + ", f_dist=" + str(f_dist) + ", x_dist=" + str(x_dist)) # Check for terminal conditions if iters >= maxiter: go = False print("Maximum iterations reached, terminating unsuccessfully.") if evals >= maxeval: go = False print("Maximum evaluations reached, terminating unsuccessfully.") if f_dist < ftol: go = False print("Function tolerance reached, terminating successfully.") if x_dist < xtol: go = False print("Parameter tolerance reached, terminating successfully.") # Save the progress of the estimation if desired if savefreq is not None: if (iters % savefreq) == 0: saveNelderMeadData(name, simplex, fvals, iters, evals) if verbose > 0: print("Saved search progress in " + name + ".txt") # Return the results xopt = simplex[0, :] return xopt, fmin
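# Added usage sketch for parallelNelderMead (not part of the original module).
# It relies only on the signature documented above; the Rosenbrock test
# function and the starting point are illustrative choices.
import numpy as np


def rosenbrock(x):
    # Classic 2D test function with minimum at (1, 1).
    return (1.0 - x[0]) ** 2 + 100.0 * (x[1] - x[0] ** 2) ** 2


if __name__ == "__main__":
    guess = np.array([-1.2, 1.0])
    xopt, fmin = parallelNelderMead(
        rosenbrock,
        guess,
        perturb=np.array([0.1, 0.1]),  # optimize over both parameters
        P=2,                           # try to update two vertices per iteration
        maxiter=500,
        verbose=0,
    )
    print(xopt, fmin)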
clistfile_h.close() # print(cfile_lines[0]) cline_basename = cfile_lines[0] cline_path = os.path.join(preclinedir, cline_basename) if not os.path.isfile(cline_path): print("cannot find", cline_path) sys.exit(-1) clinedata, header = nrrd.read(cline_path) clines = get_center_lines(clinedata, point_cnt=500) slistfile_h = open(slistfile, "r") sfile_lines = slistfile_h.readlines() sfile_lines = [line.rstrip() for line in sfile_lines] slistfile_h.close() # if progress: # bar = Bar('Processing', max=len(sfile_lines)) if parallel: # Parallel(n_jobs=n_jobs, backend="multiprocessing", require='sharedmem')( # delayed(crop_along_cline)(sfilename) for sfilename in sfile_lines) Parallel(n_jobs=n_jobs, require='sharedmem')(delayed(crop_along_cline)(sfilename) for sfilename in sfile_lines) else: for sfilename in sfile_lines: crop_along_cline(sfilename) # if progress: # bar.finish()
def main(): data_path = '/mnt/nvme102/alex/sat/any/data' log_dir = '/mnt/nvme102/alex/sat/any/logs2022/only_sat' batch_sz = 250 epochs = 30 seed = 42 n_vars = 10 n_ops = 55 use_cuda = True data_debug = False print('loading train data') train_dataset = TreeFormulasDataset(os.path.join(data_path, 'train.txt'), n_ops, n_vars, data_debug, only_sat=True) print('loading test data') test_dataset = TreeFormulasDataset(os.path.join(data_path, 'test.txt'), n_ops, n_vars, data_debug) print('loading validation data') validation_dataset = TreeFormulasDataset( os.path.join(data_path, 'validation.txt'), n_ops, n_vars, data_debug) experiment_target = 'steps_dim_cl' dim_options = [ # 1, 2, 4, 8, 16, 32, 16 ] cl_options = [ # 1, 2, 3, 5, 6, 7 ] # 5=min, 6=prod,7=luk rnn_steps_options = [ # 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50 10 ] experiments = itertools.product(dim_options, cl_options, rnn_steps_options) opt = 'adam' lr = 0.005 if opt == 'sgd' else 0.0005 momentum = 0.75 nesterov = False wdecay = 1e-4 eps = 1e-8 grad_clip = 0.2 n_gpus = 1 gpus = ['cuda:{}'.format(i) for i in range(n_gpus)] works = [] skip_e = None for i, e in enumerate(experiments): if skip_e is not None: if e == skip_e: skip_e = None else: continue works.append((i, e)) works_by_gpus = [(gpus[i], works[i::len(gpus)]) for i in range(len(gpus))] wlist = [ delayed(list_worker)(gpu, wks, seed, opt, epochs, batch_sz, lr, wdecay, nesterov, train_dataset, test_dataset, validation_dataset, momentum, eps, log_dir, grad_clip) for gpu, wks in works_by_gpus ] Parallel(len(wlist), 'threading')(wlist)
f"{args.feature_dir}/forecasting_features_{args.mode}.pkl") if __name__ == "__main__": """Load sequences and save the computed features.""" args = parse_arguments() start = time.time() map_features_utils_instance = MapFeaturesUtils() social_features_utils_instance = SocialFeaturesUtils() sequences = os.listdir(args.data_dir) temp_save_dir = tempfile.mkdtemp() num_sequences = _FEATURES_SMALL_SIZE if args.small else len(sequences) Parallel(n_jobs=-2)(delayed(load_seq_save_features)( i, sequences, temp_save_dir, map_features_utils_instance, social_features_utils_instance, ) for i in range(0, num_sequences, args.batch_size)) merge_saved_features(temp_save_dir) shutil.rmtree(temp_save_dir) print( f"Feature computation for {args.mode} set completed in {(time.time()-start)/60.0} mins" )
                'shape_i': (data_i.shape[0], ys_i[i + 1] - ys_i[i],
                            xs_i[j + 1] - xs_i[j]),
                'shape_m': (data_m.shape[0], ys_m[i + 1] - ys_m[i],
                            xs_m[j + 1] - xs_m[j]),
                # 'shape_p': (data_p.shape[0], ys_p[i+1] - ys_p[i], xs_p[j+1] - xs_p[j])
            }

            save_pickle('cache/meta/%s_%d_%d.pickle' % (loc, i, j), meta)

    write_location_images(loc, data_i, xs_i, ys_i, 'I')
    write_location_images(loc, data_m, xs_m, ys_m, 'M')
    # write_location_images(loc, data_p, xs_p, ys_p, 'P')

    write_location_images(loc, normalize(data_m), xs_m, ys_m, 'MN')  # Write location-normalized M channels

    write_location_images(loc, compute_filters(data_i), xs_i, ys_i, 'IF')
    write_location_images(loc, compute_indices(data_m), xs_m, ys_m, 'MI')

    # data_a, xs_a, ys_a = read_location_images(loc, 'sixteen_band', 'A', resize_to='shape_m')
    # write_location_images(loc, data_a, xs_a, ys_a, 'A')


print("Preparing image data...")

# Prepare locations
Parallel(n_jobs=2)(delayed(prepare_location)(loc) for loc in locations)

print("Done.")
for arg in sys.argv[1:]: data = vtkio.getBlockByName(reader.GetOutput(), arg) merger.AddInputData(data) merger.Update() ds = dsa.WrapDataObject(merger.GetOutput()) times = [] for i in range(reader.GetTimeSets().GetNumberOfItems()): array = reader.GetTimeSets().GetItem(i) for j in range(array.GetNumberOfTuples()): times.append(array.GetComponent(j, 0)) eigvalues = [] timecoefficients = [] del merger, ds, reader snaps = Parallel(n_jobs=6, max_nbytes=1e9, verbose=30)(delayed(compute_snapshot)(files[0], time, sys.argv[1:]) for time in times[1:]) #pdb.set_trace() N = len(times)-1 np.savez('cache_snapshots.npz', snaps=snaps) fft_values = np.empty((snaps[0].shape[0], N//2)) xf = fftfreq(N, 0.001) snapshots = np.empty((snaps[0].shape[0], N)) for i, snap in enumerate(snaps): snapshots[:,i] = snap ffts = Parallel(n_jobs=6, max_nbytes=1e9, verbose=30, prefer='threads')(delayed(fft)(snapshots[i,:]) for i in range(snapshots.shape[0])) for i, snap in enumerate(ffts): fft_values[i, :] = 2.0 / N * np.abs(snap[:N//2]) np.savez(cacheFile, fft_values=fft_values, xf=xf) else: data = np.load(cacheFile)
def node2vec( G, dimensions=128, walk_length=80, num_walks=10, p=1.0, q=1.0, weight_key=None, workers=None, **skip_gram_params, ): """Graph embedding via Node2Vec. Parameters ---------- G : easygraph.Graph or easygraph.DiGraph dimensions : int Embedding dimensions, optional(default: 128) walk_length : int Number of nodes in each walk, optional(default: 80) num_walks : int Number of walks per node, optional(default: 10) p : float The return hyper parameter, optional(default: 1.0) q : float The input parameter, optional(default: 1.0) weight_key : string or None (default: None) On weighted graphs, this is the key for the weight attribute workers : int or None, optional(default : None) The number of workers generating random walks (default: None). None if not using only one worker. skip_gram_params : dict Parameters for gensim.models.Word2Vec - do not supply 'size', it is taken from the 'dimensions' parameter Returns ------- embedding_vector : dict The embedding vector of each node most_similar_nodes_of_node : dict The most similar nodes of each node and its similarity Examples -------- >>> node2vec(G, ... dimensions=128, # The graph embedding dimensions. ... walk_length=80, # Walk length of each random walks. ... num_walks=10, # Number of random walks. ... p=1.0, # The `p` possibility in random walk in [1]_ ... q=1.0, # The `q` possibility in random walk in [1]_ ... weight_key='weight', ... skip_gram_params=dict( # The skip_gram parameters in Python package gensim. ... window=10, ... min_count=1, ... batch_words=4 ... )) References ---------- .. [1] https://arxiv.org/abs/1607.00653 """ G_index, index_of_node, node_of_index = G.to_index_node_graph() if workers is None: walks = simulate_walks( G_index, walk_length=walk_length, num_walks=num_walks, p=p, q=q, weight_key=weight_key, ) else: from joblib import Parallel from joblib import delayed num_walks_lists = np.array_split(range(num_walks), workers) walks = Parallel(n_jobs=workers)(delayed(simulate_walks)( G_index, walk_length, len(num_walks), p, q, weight_key) for num_walks in num_walks_lists) # Change multidimensional array to one dimensional array walks = [walk for walk_group in walks for walk in walk_group] model = learn_embeddings(walks=walks, dimensions=dimensions, **skip_gram_params) ( embedding_vector, most_similar_nodes_of_node, ) = _get_embedding_result_from_gensim_skipgram_model( G=G, index_of_node=index_of_node, node_of_index=node_of_index, model=model) del G_index return embedding_vector, most_similar_nodes_of_node
def classify(self, sequences_fname: str, verbose=False) -> List[SingleResult]: """Perform a two-step classification. Parameters ---------- sequences_fname : a path to fasta file to classify Returns ------- predictions: a list of lists containing SingleResult objects. """ with open(sequences_fname, "r") as sequences_handle: seqs = list(SimpleFastaParser(sequences_handle)) # seqs = [(desc, "".join([l for l in seq.upper() if l in allowed_letters])) for desc, seq in seqs] seqs = [x for x in seqs if len(x[1]) >= self.min_len] do = delayed(fun) executor = Parallel(n_jobs=self.threads) tasks = ( do(x[1], 0, self.predictors[0].transformer, self.params[0]["fragment_len"]) for x in seqs ) cont_manager = ( time_context_manager("Calculating first stage sequence representations") if verbose else suppress() ) with cont_manager: seqs = list( zip([x[0] for x in seqs], [x[1] for x in seqs], executor(tasks)) ) # Two-step classification if verbose: print("Performing first stage of classification.") fst_stage_results = [] for seq in tqdm(seqs): fst_stage_results.append(self.predictors[0].make_prediction(seq)) else: # tasks = (do(seq) for seq in seqs) # fst_stage_results = executor(tasks) fst_stage_results = [ self.predictors[0].make_prediction(seq) for seq in seqs ] if verbose: print("Done") predictions = [] to_second_stage = [] for prediction in fst_stage_results: if prediction.cls[0] == "organelle": to_second_stage.append(prediction) else: predictions.append(prediction) if to_second_stage: tasks = ( do( record.seq, 1, self.predictors[1].transformer, self.params[1]["fragment_len"], ) for record in to_second_stage ) cont_manager = ( time_context_manager( "Calculating second stage sequence representations" ) if verbose else suppress() ) with cont_manager: seqs2 = list( zip( [record.desc for record in to_second_stage], [record.seq for record in to_second_stage], executor(tasks), ) ) if verbose: print("Performing second stage of classification.") snd_stage_results = [] for seq in tqdm(seqs2): snd_stage_results.append(self.predictors[1].make_prediction(seq)) else: # tasks = (do_second_stage(seq) for seq in seqs2) # snd_stage_results = executor(tasks) snd_stage_results = [ self.predictors[1].make_prediction(seq) for seq in seqs2 ] for fst, snd in zip(to_second_stage, snd_stage_results): assert fst.desc == snd.desc, "Descriptions not the same" assert fst.seq == snd.seq, "Sequences not the same" predictions.append( SingleResult( desc=fst.desc, seq=fst.seq, cls=[fst.cls[0], snd.cls[1]], probs=[fst.probs[0], snd.probs[1]], ) ) return predictions
import pandas as pd import numpy as np from utils import particle from joblib import delayed, Parallel from tqdm import tqdm part_dir = '../output/pipeline/particles/parts_filtered.csv' savedir = '../output/pipeline/particles/parts_allframesimputed.csv' parts = pd.read_csv(part_dir) # forward-fill particle coordinates for missing frames coords_complete = Parallel(n_jobs=12)( delayed(particle.impute_coords)(coords_df) for _, coords_df in tqdm(parts.groupby(['roi', 'mov_name']))) coords_complete = pd.concat(coords_complete, ignore_index=True) # add the rest of the columns back; new rows get NaNs in all of these coords_complete = pd.merge(coords_complete, parts, on=list(coords_complete.columns), how='outer') coords_complete.to_csv(savedir, index=False)
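# Added sketch (not the project's implementation): a plausible shape for
# particle.impute_coords, based on the forward-fill comment above. It handles
# a single group from the ('roi', 'mov_name') groupby; the 'frame', 'x' and
# 'y' column names are assumptions about the tracking table.
import numpy as np
import pandas as pd


def impute_coords_sketch(coords_df):
    """Forward-fill coordinates for frames missing from a track."""
    coords_df = coords_df.sort_values('frame')
    full_index = np.arange(coords_df['frame'].min(), coords_df['frame'].max() + 1)
    filled = (coords_df.set_index('frame')
              .reindex(full_index)   # insert the missing frames as NaN rows
              .ffill()               # carry the last known coordinates forward
              .rename_axis('frame')
              .reset_index())
    return filled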
def main(): train = read_train_data(path=os.path.join( input.__path__[0], 'petfinder-adoption-prediction/train/')) test = read_test_data(path=os.path.join( input.__path__[0], 'petfinder-adoption-prediction/test/')) if adoption_shuffle: train['AdoptionSpeed'] = random.sample( train['AdoptionSpeed'].values.tolist(), len(train)) if densenet_predict: dnet_model = densenet_model(weight_path=os.path.join( input.__path__[0], 'densenet-keras/DenseNet-BC-121-32-no-top.h5')) train_feats = predict_using_img( dnet_model, train, img_path=os.path.join( input.__path__[0], 'petfinder-adoption-prediction/train_images/')) test_feats = predict_using_img( dnet_model, test, img_path=os.path.join( input.__path__[0], 'petfinder-adoption-prediction/test_images/')) train_feats.to_pickle('densenet_train_predict.pkl') test_feats.to_pickle('densenet_test_predict.pkl') else: with open('./densenet_train_predict.pkl', 'rb') as f: train_feats = pickle.load(f) with open('./densenet_test_predict.pkl', 'rb') as f: test_feats = pickle.load(f) all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']] svd_col = adopt_svd(train_feats, test_feats) img_features = pd.concat([all_ids, svd_col], axis=1) labels_breed = pd.read_csv( os.path.join(input.__path__[0], 'petfinder-adoption-prediction/breed_labels.csv')) labels_color = pd.read_csv( os.path.join(input.__path__[0], 'petfinder-adoption-prediction/color_labels.csv')) labels_state = pd.read_csv( os.path.join(input.__path__[0], 'my_state_labels/my_state_labels.csv')) train_image_files = sorted( glob.glob( os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_images/*.jpg'))) train_metadata_files = sorted( glob.glob( os.path.join( input.__path__[0], 'petfinder-adoption-prediction/train_metadata/*.json'))) train_sentiment_files = sorted( glob.glob( os.path.join( input.__path__[0], 'petfinder-adoption-prediction/train_sentiment/*.json'))) test_image_files = sorted( glob.glob( os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_images/*.jpg'))) test_metadata_files = sorted( glob.glob( os.path.join( input.__path__[0], 'petfinder-adoption-prediction/test_metadata/*.json'))) test_sentiment_files = sorted( glob.glob( os.path.join( input.__path__[0], 'petfinder-adoption-prediction/test_sentiment/*.json'))) # Metadata: train_df_metadata = pd.DataFrame(train_metadata_files) train_df_metadata.columns = ['metadata_filename'] train_df_sentiment = pd.DataFrame(train_sentiment_files) train_df_sentiment.columns = ['sentiment_filename'] # Metadata: test_df_metadata = pd.DataFrame(test_metadata_files) test_df_metadata.columns = ['metadata_filename'] test_df_sentiment = pd.DataFrame(test_sentiment_files) test_df_sentiment.columns = ['sentiment_filename'] train_pet_ids = train.PetID.unique() test_pet_ids = test.PetID.unique() if exe_extract_additional_feature: dfs_train = Parallel(n_jobs=12, verbose=1)( delayed(extract_additional_features)(i, mode='train') for i in train_pet_ids) dfs_test = Parallel(n_jobs=12, verbose=1)( delayed(extract_additional_features)(i, mode='test') for i in test_pet_ids) train_dfs_sentiment = [ x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame) ] train_dfs_metadata = [ x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame) ] train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False) train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False) test_dfs_sentiment = [ x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame) ] test_dfs_metadata = [ x[1] for x in 
            dfs_test if isinstance(x[1], pd.DataFrame)
        ]

        test_dfs_sentiment = pd.concat(test_dfs_sentiment,
                                       ignore_index=True,
                                       sort=False)
        test_dfs_metadata = pd.concat(test_dfs_metadata,
                                      ignore_index=True,
                                      sort=False)

        train_dfs_metadata.to_pickle('train_dfs_metadata.pkl')
        train_dfs_sentiment.to_pickle('train_dfs_sentiment.pkl')
        test_dfs_metadata.to_pickle('test_dfs_metadata.pkl')
        test_dfs_sentiment.to_pickle('test_dfs_sentiment.pkl')
    else:
        with open('./train_dfs_metadata.pkl', 'rb') as f:
            train_dfs_metadata = pickle.load(f)
        with open('./train_dfs_sentiment.pkl', 'rb') as f:
            train_dfs_sentiment = pickle.load(f)
        with open('./test_dfs_metadata.pkl', 'rb') as f:
            test_dfs_metadata = pickle.load(f)
        with open('./test_dfs_sentiment.pkl', 'rb') as f:
            test_dfs_sentiment = pickle.load(f)

    # ### group extracted features by PetID:
    train_proc = agg_feature(train, train_dfs_metadata, train_dfs_sentiment)
    test_proc = agg_feature(test, test_dfs_metadata, test_dfs_sentiment)

    train_proc = merge_labels_breed(train_proc, labels_breed)
    test_proc = merge_labels_breed(test_proc, labels_breed)
    train_proc, test_proc = merge_labels_state(train_proc, test_proc,
                                               labels_state)

    train_proc = fill_and_drop_feature(train_proc)
    test_proc = fill_and_drop_feature(test_proc)

    train_proc = add_feature(train_proc)
    test_proc = add_feature(test_proc)

    X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
    X_temp = X.copy()

    text_columns = [
        'Description', 'metadata_annots_top_desc', 'sentiment_entities'
    ]
    categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
    to_drop_columns = ['PetID', 'Name', 'RescuerID']

    rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
    rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']
    X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

    for i in categorical_columns:
        try:
            X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]
        except:
            pass

    X_text = X_temp[text_columns]
    for i in X_text.columns:
        X_text.loc[:, i] = X_text.loc[:, i].fillna('none')

    X_temp['Length_Description'] = X_text['Description'].map(len)
    X_temp['Length_metadata_annots_top_desc'] = X_text[
        'metadata_annots_top_desc'].map(len)
    X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len)

    X_temp = parse_tfidf(X_temp, X_text)

    X_temp = X_temp.merge(img_features, how='left', on='PetID')

    agg_train_imgs = agg_img_feature(train_image_files)
    agg_test_imgs = agg_img_feature(test_image_files)
    agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs],
                         axis=0).reset_index(drop=True)
    X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')

    # ### Drop ID, name and rescuerID
    X_temp = X_temp.drop(to_drop_columns, axis=1)

    X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
    X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]
    X_test = X_test.drop(['AdoptionSpeed'], axis=1)

    assert X_train.shape[0] == train.shape[0]
    assert X_test.shape[0] == test.shape[0]

    train_cols = X_train.columns.tolist()
    train_cols.remove('AdoptionSpeed')
    test_cols = X_test.columns.tolist()
    assert np.all(train_cols == test_cols)

    X_train_non_null = X_train.fillna(-1)
    X_test_non_null = X_test.fillna(-1)
    X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()

    xgb_params = {
        'eval_metric': 'rmse',
        'objective': 'reg:squarederror',  # key name fixed ('object' -> 'objective')
        'seed': 1337,
        'eta': 0.0123,
        'subsample': 0.8,
        'colsample_bytree': 0.85,
        'tree_method': 'gpu_hist',
        'device': 'gpu',
        'silent': 1,
    }

    X_train_non_null = fill_and_drop_feature_end(X_train_non_null)
    X_test_non_null = fill_and_drop_feature_end(X_test_non_null)
X_train_non_null.to_csv('./X_train.csv') model, oof_train, oof_test, feature_score = run_xgb( xgb_params, X_train_non_null, X_test_non_null) optR = OptimizedRounder() optR.fit(oof_train, X_train['AdoptionSpeed'].values) coefficients = optR.coefficients() valid_pred = optR.predict(oof_train, coefficients) qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred) print("QWK = ", qwk) coefficients_ = coefficients.copy() coefficients_[0] = 1.66 coefficients_[1] = 2.13 coefficients_[3] = 2.85 train_predictions = optR.predict(oof_train, coefficients_).astype(np.int8) test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_).astype(np.int8) valid_pred = optR.predict(oof_train, coefficients_) qwk_change = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred) print("QWK_change = ", qwk_change) submission = pd.DataFrame({ 'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions }) submission.to_csv('submission.csv', index=False) str_metric_score = 'qwk' + '_0' + str(int(qwk * 100000)) storage_process(submission, str_metric_score, qwk, qwk_change, feature_score)
def get_items(self, indexes): items = Parallel(n_jobs=1)(delayed(self.get_item)(i) for i in indexes) images, meta_info = zip(*items) images = torch.stack(images, dim=0) return images, meta_info
def fit( self, train_loader, epochs=100, log_interval=100, test_loader=None, save_model=True, save_dir=None, ): self._validate_parameters(epochs, log_interval) self.n_outputs = self._decide_n_outputs(train_loader) # Instantiate a pool of base estimators, optimizers, and schedulers. estimators = [] for _ in range(self.n_estimators): estimators.append(self._make_estimator()) optimizers = [] for i in range(self.n_estimators): optimizers.append( set_module.set_optimizer(estimators[i], self.optimizer_name, **self.optimizer_args)) if self.use_scheduler_: scheduler_ = set_module.set_scheduler(optimizers[0], self.scheduler_name, **self.scheduler_args) # Utils criterion = nn.CrossEntropyLoss() best_acc = 0.0 # Internal helper function on pesudo forward def _forward(estimators, data): outputs = [ F.softmax(estimator(data), dim=1) for estimator in estimators ] proba = op.average(outputs) return proba # Maintain a pool of workers with Parallel(n_jobs=self.n_jobs) as parallel: # Training loop for epoch in range(epochs): self.train() if self.use_scheduler_: cur_lr = scheduler_.get_last_lr()[0] else: cur_lr = None if self.n_jobs and self.n_jobs > 1: msg = "Parallelization on the training epoch: {:03d}" self.logger.info(msg.format(epoch)) rets = parallel( delayed(_parallel_fit_per_epoch)( train_loader, estimator, cur_lr, optimizer, criterion, idx, epoch, log_interval, self.device, True, ) for idx, ( estimator, optimizer) in enumerate(zip(estimators, optimizers))) estimators, optimizers = [], [] for estimator, optimizer in rets: estimators.append(estimator) optimizers.append(optimizer) # Validation if test_loader: self.eval() with torch.no_grad(): correct = 0 total = 0 for _, (data, target) in enumerate(test_loader): data = data.to(self.device) target = target.to(self.device) output = _forward(estimators, data) _, predicted = torch.max(output.data, 1) correct += (predicted == target).sum().item() total += target.size(0) acc = 100 * correct / total if acc > best_acc: best_acc = acc self.estimators_ = nn.ModuleList() self.estimators_.extend(estimators) if save_model: io.save(self, save_dir, self.logger) msg = ("Epoch: {:03d} | Validation Acc: {:.3f}" " % | Historical Best: {:.3f} %") self.logger.info(msg.format(epoch, acc, best_acc)) if self.tb_logger: self.tb_logger.add_scalar("voting/Validation_Acc", acc, epoch) # Update the scheduler with warnings.catch_warnings(): # UserWarning raised by PyTorch is ignored because # scheduler does not have a real effect on the optimizer. warnings.simplefilter("ignore", UserWarning) if self.use_scheduler_: scheduler_.step() self.estimators_ = nn.ModuleList() self.estimators_.extend(estimators) if save_model and not test_loader: io.save(self, save_dir, self.logger)
    data = get_data(filename)
    print('Loaded data from: {}\nLength: {}'.format(filename, len(data['mass'])))

    plot_distributions(data, scenario)
    print('[{}] Plotted Distributions (1/7)'.format(scenario))

    plot_abundances(data, scenario)
    print('[{}] Plotted Abundances (2/7)'.format(scenario))

    plot_chisq_distribution(data, scenario)
    print('[{}] Plotted Chi Squared Distributions (3/7)'.format(scenario))

    plot_mchi_omegab_contours(data, scenario, 'BBN')
    plot_mchi_omegab_contours(data, scenario, 'CMB')
    plot_mchi_omegab_contours(data, scenario, 'BBN+CMB')
    print('[{}] Plotted Omegab vs Mchi Contours (4/7)'.format(scenario))

    plot_joint_mchi_omegab(data, scenario)
    print('[{}] Plotted joint contours (5/7)'.format(scenario))

    plot_deltachisq(data, scenario, zoom=False)
    plot_deltachisq(data, scenario, zoom=True)
    print('[{}] Plotted Delta Chi curves (6/7)'.format(scenario))

    print('[{}] Saving results (7/7)'.format(scenario))
    save_results(data, scenario)


if __name__ == '__main__':
    scenarios = [
        'EE_Neutral_Scalar', 'EE_Maj', 'Nu_Complex_Scalar', 'Nu_Neutral_Scalar',
        'Nu_Zp'
    ]
    Parallel(n_jobs=-1)(delayed(run_scenario)(scenario=scenario)
                        for scenario in scenarios)
def fit( self, train_loader, epochs=100, log_interval=100, test_loader=None, save_model=True, save_dir=None, ): self._validate_parameters(epochs, log_interval) self.n_outputs = self._decide_n_outputs(train_loader) # Instantiate a pool of base estimators, optimizers, and schedulers. estimators = [] for _ in range(self.n_estimators): estimators.append(self._make_estimator()) optimizers = [] for i in range(self.n_estimators): optimizers.append( set_module.set_optimizer(estimators[i], self.optimizer_name, **self.optimizer_args)) if self.use_scheduler_: scheduler_ = set_module.set_scheduler(optimizers[0], self.scheduler_name, **self.scheduler_args) # Utils criterion = nn.MSELoss() best_mse = float("inf") # Internal helper function on pesudo forward def _forward(estimators, data): outputs = [estimator(data) for estimator in estimators] pred = op.average(outputs) return pred # Maintain a pool of workers with Parallel(n_jobs=self.n_jobs) as parallel: # Training loop for epoch in range(epochs): self.train() if self.use_scheduler_: cur_lr = scheduler_.get_last_lr()[0] else: cur_lr = None if self.n_jobs and self.n_jobs > 1: msg = "Parallelization on the training epoch: {:03d}" self.logger.info(msg.format(epoch)) rets = parallel( delayed(_parallel_fit_per_epoch)( train_loader, estimator, cur_lr, optimizer, criterion, idx, epoch, log_interval, self.device, False, ) for idx, ( estimator, optimizer) in enumerate(zip(estimators, optimizers))) estimators, optimizers = [], [] for estimator, optimizer in rets: estimators.append(estimator) optimizers.append(optimizer) # Validation if test_loader: self.eval() with torch.no_grad(): mse = 0.0 for _, (data, target) in enumerate(test_loader): data = data.to(self.device) target = target.to(self.device) output = _forward(estimators, data) mse += criterion(output, target) mse /= len(test_loader) if mse < best_mse: best_mse = mse self.estimators_ = nn.ModuleList() self.estimators_.extend(estimators) if save_model: io.save(self, save_dir, self.logger) msg = ("Epoch: {:03d} | Validation MSE:" " {:.5f} | Historical Best: {:.5f}") self.logger.info(msg.format(epoch, mse, best_mse)) if self.tb_logger: self.tb_logger.add_scalar("voting/Validation_MSE", mse, epoch) # Update the scheduler with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) if self.use_scheduler_: scheduler_.step() self.estimators_ = nn.ModuleList() self.estimators_.extend(estimators) if save_model and not test_loader: io.save(self, save_dir, self.logger)
pi.set_epsilon(epsilon) mdp.set_episode_end(True) core.learn(n_steps=evaluation_frequency, n_steps_per_fit=train_frequency, quiet=args.quiet) if args.save: agent.approximator.model.save() print('- Evaluation:') # evaluation step pi.set_eval(True) pi.set_epsilon(epsilon_test) mdp.set_episode_end(False) dataset = core.evaluate(n_steps=test_samples, render=args.render, quiet=args.quiet) scores.append(get_stats(dataset)) np.save(folder_name + '/scores.npy', scores) return scores if __name__ == '__main__': n_experiments = 1 out = Parallel(n_jobs=-1)(delayed(experiment)() for _ in range(n_experiments)) tf.reset_default_graph()
# Output raster paths: the assigned values are missing from this snippet.
homoFile = energyFile = corrFile = ASMFile = None

merge, xx, yy, gt = read_raster(in_raster)
merge[np.isnan(merge)] = 0

Z, ind = sliding_window(merge, (win, win), (win, win))

Ny, Nx = np.shape(merge)

w = Parallel(n_jobs=cpu_count(), verbose=0)(
    delayed(p_me)(Z[k]) for k in range(len(Z)))

cont = [a[0] for a in w]
diss = [a[1] for a in w]
homo = [a[2] for a in w]
eng = [a[3] for a in w]
corr = [a[4] for a in w]
ASM = [a[5] for a in w]

# Reshape to match number of windows
plt_cont = np.reshape(cont, (ind[0], ind[1]))
plt_diss = np.reshape(diss, (ind[0], ind[1]))
plt_homo = np.reshape(homo, (ind[0], ind[1]))
plt_eng = np.reshape(eng, (ind[0], ind[1]))
plt_corr = np.reshape(corr, (ind[0], ind[1]))
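# Added sketch (the original p_me is not shown in this snippet): one plausible
# implementation returning the six GLCM properties unpacked above. It assumes
# scikit-image >= 0.19 (skimage.feature.graycomatrix / graycoprops) and that
# each window Z[k] is a 2-D array whose values fit in uint8.
import numpy as np
from skimage.feature import graycomatrix, graycoprops


def p_me_sketch(window, levels=256):
    """Return (contrast, dissimilarity, homogeneity, energy, correlation, ASM)."""
    window = np.asarray(window, dtype=np.uint8)
    glcm = graycomatrix(window, distances=[1], angles=[0],
                        levels=levels, symmetric=True, normed=True)
    return tuple(graycoprops(glcm, prop)[0, 0]
                 for prop in ('contrast', 'dissimilarity', 'homogeneity',
                              'energy', 'correlation', 'ASM'))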
# Script for grabbing congressional graph data from joblib import Parallel, delayed import multiprocessing import time from src import write_graph data_dir = 'graphs' num_cores = multiprocessing.cpu_count() print('Running on {} cores.'.format(num_cores)) def grab_data(congress): start = time.time() write_graph(congress) end = time.time() print( 'Finished graph for congress {}.\nElapsed time {:.03f} s.\n\n'.format( congress, end - start)) Parallel(n_jobs=num_cores)(delayed(grab_data)(congress) for congress in range(101, 115))
"yRange": (0, YRAN[1] * 1) } # print('YRAN: {}'.format(STYLE)) ############################################################################### # Plot ############################################################################### (fNum, digs) = monet.lenAndDigits(subset) Parallel(n_jobs=JOB)( delayed(dbg.exportPstTracesParallel)(exIx, fNum, aux.STABLE_T, 0, QNT, STYLE, pt_img, digs=digs, border=True, autoAspect=True, labelPos=(.8, .15), poePrint=False, mnfPrint=False, ticksHide=TICKS_HIDE, transparent=True, sampRate=aux.SAMP_RATE) for exIx in subset) # Export gene legend ------------------------------------------------------ # repDta = pkl.load(expsIter[0][1]) # monet.exportGeneLegend( # repDta['genotypes'], [i[:-2]+'cc' for i in CLR], # PT_IMG+'/legend_{}.png'.format(TRC), 500 # )
def fit( self, train_loader, epochs=100, use_reduction_sum=True, log_interval=100, test_loader=None, save_model=True, save_dir=None, ): # Instantiate base estimators and set attributes for _ in range(self.n_estimators): self.estimators_.append(self._make_estimator()) self._validate_parameters(epochs, log_interval) self.n_outputs = self._decide_n_outputs(train_loader) # Utils criterion = (nn.MSELoss( reduction="sum") if use_reduction_sum else nn.MSELoss()) total_iters = 0 # Set up optimizer and learning rate scheduler optimizer = set_module.set_optimizer(self, self.optimizer_name, **self.optimizer_args) if self.use_scheduler_: scheduler = set_module.set_scheduler( optimizer, self.scheduler_name, **self.scheduler_args # noqa: E501 ) for epoch in range(epochs): self.train() for batch_idx, elem in enumerate(train_loader): data, target = io.split_data_target(elem, self.device) output = [estimator(*data) for estimator in self.estimators_] # Compute pseudo residuals in parallel rets = Parallel(n_jobs=self.n_jobs)( delayed(_parallel_compute_pseudo_residual)( output, target, i, self.shrinkage_rate, self.n_outputs, self.is_classification, ) for i in range(self.n_estimators)) # Compute sGBM loss loss = torch.tensor(0.0, device=self.device) for idx, estimator in enumerate(self.estimators_): loss += criterion(output[idx], rets[idx]) optimizer.zero_grad() loss.backward() optimizer.step() # Print training status if batch_idx % log_interval == 0: with torch.no_grad(): msg = "Epoch: {:03d} | Batch: {:03d} | RegLoss: {:.5f}" self.logger.info(msg.format(epoch, batch_idx, loss)) if self.tb_logger: self.tb_logger.add_scalar("sGBM/Train_Loss", loss, total_iters) total_iters += 1 # Validation if test_loader: flag = self._evaluate_during_fit(test_loader, epoch) if save_model and flag: io.save(self, save_dir, self.logger) # Update the scheduler if self.use_scheduler_: scheduler.step() if save_model and not test_loader: io.save(self, save_dir, self.logger)
def returnMatchTable(rootDir, visit, ccdList, outfile=None, fakeCat=None, overwrite=False, filt=None, tol=1.0, pixMatch=False, multiband=False, reffMatch=False, pix=0.168, multijobs=1, includeMissing=True, minRad=None, raCol='RA', decCol='Dec'): """ Driver (main function) for return match to fakes. INPUT: rootDir = rerun directory visit = visit id (int) (or tracts) ccdList = list of ccds to look at (or patches) outdir = output directory for matched file, None means no output written fakeCat = fake catalog to match to, None means the fake sources are just extracted from the header of the CCDs based on position but no matching is done overwrite = whether to overwrite the existing output file, default is False pixMatch = do pixel matching instead of ra/dec matching even if there is a catalog supplied multiband = whether match to forced photometry catalogs from multiband process reffMatch = whether match fake sources in pixel radius or using tol x Reff (Only for Ra, Dec match) OUTPUT: returns an astropy.table.Table with all the entries from the source catalog for objects which match in pixel position to the fake sources """ butler = dafPersist.Butler(rootDir) slist = None if multijobs > 1: try: from joblib import Parallel, delayed mlist = Parallel(n_jobs=multijobs)( delayed(returnMatchSingle)(butler, None, visit, ccd, filt=filt, fakeCat=fakeCat, includeMissing=includeMissing, pixMatch=pixMatch, reffMatch=reffMatch, tol=tol, multiband=multiband, minRad=minRad, pix=pix, decCol=decCol, raCol=raCol) for ccd in ccdList) for m in mlist: if m is not None: if slist is None: slist = m.copy(True) else: slist.extend(m, True) del m except ImportError: print("# Can not import joblib, stop multiprocessing!") for ccd in ccdList: slist = returnMatchSingle(butler, slist, visit, ccd, filt=filt, fakeCat=fakeCat, includeMissing=includeMissing, pixMatch=pixMatch, reffMatch=reffMatch, tol=tol, pix=pix, multiband=multiband, minRad=minRad, raCol=raCol, decCol=decCol) else: for ccd in ccdList: slist = returnMatchSingle(butler, slist, visit, ccd, filt=filt, fakeCat=fakeCat, includeMissing=includeMissing, pixMatch=pixMatch, reffMatch=reffMatch, tol=tol, pix=pix, multiband=multiband, minRad=minRad, raCol=raCol, decCol=decCol) if slist is None: print("Returns no match....!") return None else: astroTable = getAstroTable(slist, mags=True) if fakeCat is not None: astroTable = matchToFakeCatalog(astroTable, fakeCat) if outfile is not None: try: astroTable.write(outfile + '.fits', format='fits', overwrite=overwrite) except IOError: print("Try setting the option -w to overwrite the file.") raise return astroTable
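# Added usage sketch for returnMatchTable (paths, visit number, CCD list and
# filter below are illustrative only): match fake sources on a few CCDs in
# parallel and write the merged table to disk.
if __name__ == "__main__":
    matched = returnMatchTable(
        rootDir="/path/to/rerun",           # hypothetical rerun directory
        visit=1234,                         # hypothetical visit id
        ccdList=[0, 1, 2, 3],
        outfile="fake_matches_visit1234",   # written as fake_matches_visit1234.fits
        fakeCat="fakes.fits",               # hypothetical fake-source catalog
        filt="HSC-I",
        tol=2.0,
        multijobs=4,
    )
    if matched is not None:
        print(len(matched), "matched rows")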
def create_tarenc_agg_features(mode_target_persons, mode_target_cols): logger.info(f"tarenc_agg_tp{mode_target_persons}_tc{mode_target_cols}") mprof_timestamp(f"tarenc_agg_tp{mode_target_persons}_tc{mode_target_cols}") _ = Parallel(n_jobs=args.n_jobs//2, verbose=args.verbose_joblib) \ ([delayed(create_tarenc_agg_features_1fold)(fold, mode_target_persons, mode_target_cols, "tarenc") for fold in range(args.FOLD_NUM)])
os.makedirs(savedir) size = 512,512 colors = ['red','green','blue','yellow'] imgList = pd.read_csv("../input/HPAv18/HPAv18RBGY_wodpl.csv") print(len(imgList)) #for i in tqdm(imgList['Id']): def save_image(i): for color in colors: img_path = i + "_" + color + ".jpg" img_name = i + "_" + color + ".png" img_full_path = img_dir + img_path fname = os.path.join(savedir,img_name) if not os.path.exists(fname): x = Image.open(img_full_path, 'r') x = x.convert('L') # makes it greyscale y = np.asarray(x.getdata(), dtype=np.float64).reshape((x.size[1], x.size[0])) y = np.asarray(y, dtype=np.uint8) # if values still in range 0-255! w = Image.fromarray(y, mode='L') w.thumbnail(size, Image.ANTIALIAS) w.save(os.path.join(savedir,img_name)) num_cores = 6 Parallel(n_jobs=num_cores, prefer="threads")(delayed(save_image)(i) \ for i in tqdm(imgList['Id'])) print('done')
import subprocess as sp from joblib import Parallel, delayed def job(i): sp.call([ "python", "/home/mszul/git/DANC_MEG_learning_beta/pipeline_08_epoch_qc.py", str(i), "settings_hdd.json" ]) Parallel(n_jobs=-1)(delayed(job)(i) for i in range(0, 38))
mask_2d = np.zeros_like(np_fix) ellipse = opt_ellipse(np_fix, ellipse_tuple, sigma=sigma, steps=steps) ((cx, cy), (M, m), theta) = ellipse (M, m) = (M + dilate, m + dilate) ellipse = ((cx, cy), (M, m), theta) np_cnt_help = np.zeros_like(np_fix, dtype=np.uint8) cv2.ellipse(np_cnt_help, ellipse, 255, -1) contours, _ = cv2.findContours(np_cnt_help, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) cnt = contours[0] cv2.drawContours(mask_2d, [cnt], 0, 1, -1) mask_2d = cv2.resize(mask_2d, (xS, yS), cv2.INTER_NEAREST) mask_2d = np.expand_dims(mask_2d, -1) mask = mask + mask_2d # broadcast # print(mask.shape) nrrd.write(dstpath, mask) filelist = os.listdir(srcdir) if parallel: Parallel(n_jobs=n_jobs, backend="multiprocessing")( delayed(draw_ellipse)(filename, ellipse_tuple) for filename in filelist) else: for filename in filelist: draw_ellipse(filename, ellipse_tuple)
def Prediction(best_parameters_dct, df_dict, n_jobs=-1):
    """Prediction of the model for the best parameters.

    Takes as argument a dictionary of the best parameters for the three datasets.
    """
    # Extract the best model
    data_aug = best_parameters_dct["Data Augmentation"]["Function"]
    embedding = best_parameters_dct["Embedding"]["Function"]
    model = best_parameters_dct["Model"]["Function"]

    # Array of predictions
    pred = np.zeros((3 * 1000, 2), dtype=int)

    def subPredictions(k, df_dct=df_dict):
        """Compute predictions for a given dataset."""
        # Extraction of the data
        X_train_k = df_dct[k][0]["seq"].values
        y_train_k = df_dct[k][2]["Bound"].values
        X_test_k = df_dct[k][1]["seq"].values

        # Data augmentation
        X_train_k, y_train_k = data_aug.call(X_train_k, y_train_k)

        # Embedding
        X_train_k = embedding.call(X_train_k, train=True)

        # Training of the model
        model.fit(X_train_k, y_train_k)

        # Compute average score
        score = model.score(X_train_k, y_train_k) / 3

        # Prediction on the test data
        X_test_k = embedding.call(X_test_k, train=False,
                                  X_train=df_dct[k][0]["seq"].values)
        y_pred_k = model.predict(X_test_k)
        return y_pred_k, score

    # Parallelisation of the predictions over the three datasets
    if n_jobs != 1:
        preds_score = Parallel(n_jobs=n_jobs)(delayed(subPredictions)(k)
                                              for k in tqdm(range(3)))
    else:
        preds_score = [subPredictions(k) for k in range(3)]

    # Initialisation of the average score
    final_score = 0

    # Loop to extract the predictions and scores
    for k in range(3):
        # Update pred
        pred[1000 * k: 1000 * (k + 1), 0] = df_dict[k][1]["Id"].values
        pred[1000 * k: 1000 * (k + 1), 1] = preds_score[k][0].reshape(-1)
        # Update final_score
        final_score += preds_score[k][1]

    # Display the average score
    print(final_score)

    return pred
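# Added sketch: the shape of the inputs Prediction expects, inferred from the
# indexing above (three datasets, each a (train, test, labels) triple with
# "seq", "Id" and "Bound" columns). The file names are illustrative only.
import pandas as pd

df_dict_example = {
    k: (
        pd.read_csv(f"data/Xtr{k}.csv"),  # training sequences, column "seq"
        pd.read_csv(f"data/Xte{k}.csv"),  # test sequences, columns "Id" and "seq"
        pd.read_csv(f"data/Ytr{k}.csv"),  # training labels, column "Bound"
    )
    for k in range(3)
}
# pred = Prediction(best_parameters_dct, df_dict_example, n_jobs=3)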
def preprocessData_v3(img_size, RGB):
    # %%
    train_perc = 0.7
    filelist = glob.glob('data/train/*')
    filelist_train = filelist[:int(len(filelist) * train_perc)]
    filelist_val = filelist[int(len(filelist) * train_perc):]

    def processImageBN(fname, store_dir, store_dir_full, source):
        fname = fname.split('/')[2].split('.')[0]
        img = Image.open(source + fname + '.jpg').resize(
            img_size, Image.ANTIALIAS).convert('L')
        img.save(store_dir + fname + '.png')
        if store_dir_full:
            img.save(store_dir_full + fname + '.png')

    def processImageRGB(fname, store_dir, store_dir_full, source):
        fname = fname.split('/')[2].split('.')[0]
        img = Image.open(source + fname + '.jpg').resize(img_size, Image.ANTIALIAS)
        img.save(store_dir + fname + '.png')
        if store_dir_full:
            img.save(store_dir_full + fname + '.png')

    # Argument order fixed to match the other helpers and the calls below.
    def processImageBN_mask(fname, store_dir, store_dir_full):
        fname = fname.split('/')[2].split('.')[0]
        img = Image.open('data/train_masks/' + fname + '_mask.gif').resize(
            img_size, Image.ANTIALIAS)
        img.save(store_dir + fname + '.png')
        if store_dir_full:
            img.save(store_dir_full + fname + '.png')

    data_func = processImageRGB if RGB else processImageBN
    rgb_sufix = '_RGB' if RGB else ''

    # Get training data
    t = time.time()
    store_dir = 'data/train_' + str(img_size) + rgb_sufix + '/data/'
    store_dir_full = 'data/full_' + str(img_size) + rgb_sufix + '/data/'
    if not os.path.exists(store_dir):
        os.makedirs(store_dir)
    if not os.path.exists(store_dir_full):
        os.makedirs(store_dir_full)
    Parallel(n_jobs=8)(
        delayed(data_func)(fname, store_dir, store_dir_full, 'data/train/')
        for fname in filelist_train)

    store_dir = 'data/train_mask_' + str(img_size) + '/data/'
    store_dir_full = 'data/full_mask_' + str(img_size) + '/data/'
    if not os.path.exists(store_dir):
        os.makedirs(store_dir)
    if not os.path.exists(store_dir_full):
        os.makedirs(store_dir_full)
    Parallel(n_jobs=8)(
        delayed(processImageBN_mask)(fname, store_dir, store_dir_full)
        for fname in filelist_train)
    print("Train. Time elapsed:", (time.time() - t) / 60)

    # Get validation data
    t = time.time()
    store_dir = 'data/val_' + str(img_size) + rgb_sufix + '/data/'
    store_dir_full = 'data/full_' + str(img_size) + rgb_sufix + '/data/'
    if not os.path.exists(store_dir):
        os.makedirs(store_dir)
    if not os.path.exists(store_dir_full):
        os.makedirs(store_dir_full)  # fixed: previously re-created store_dir here
    Parallel(n_jobs=8)(
        delayed(data_func)(fname, store_dir, store_dir_full, 'data/train/')
        for fname in filelist_val)

    store_dir = 'data/val_mask_' + str(img_size) + '/data/'
    store_dir_full = 'data/full_mask_' + str(img_size) + '/data/'
    if not os.path.exists(store_dir):
        os.makedirs(store_dir)
    Parallel(n_jobs=8)(
        delayed(processImageBN_mask)(fname, store_dir, store_dir_full)
        for fname in filelist_val)
    print("Validation. Time elapsed:", (time.time() - t) / 60)

    # Get test data
    t = time.time()
    print("Processing Test...")
    filelist = glob.glob('data/test/*')
    store_dir = 'data/test_' + str(img_size) + rgb_sufix + '/data/'
    if not os.path.exists(store_dir):
        os.makedirs(store_dir)
    step = len(filelist) // 10  # integer step so the slices below stay valid
    for i in np.arange(0, len(filelist), step):
        st = time.time()
        Parallel(n_jobs=8)(
            delayed(data_func)(fname, store_dir, None, 'data/test/')
            for fname in filelist[i:i + step])
        print('{0}/{1}\t{2} - {3:.2f}'.format(i, 10, time.strftime("%H:%M:%S"),
                                              (time.time() - st)))
    print("Test. Time elapsed:", (time.time() - t) / 60)