def _job_shares(n_jobs, trials): if n_jobs == -1: n_jobs = cpu_count() shares = [trials // n_jobs] * n_jobs for i in range(trials - sum(shares)): shares[i] += 1 return shares
def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
                              metric='minkowski', p=2, leaf_size=40,
                              approx_min_span_tree=True,
                              gen_min_span_tree=False,
                              core_dist_n_jobs=4, **kwargs):
    """Build a single linkage tree from ``X`` using Boruvka's algorithm on a ball tree.

    ``leaf_size`` is raised to at least 3, ``core_dist_n_jobs`` values below 1
    are resolved to ``max(cpu_count() + 1 + core_dist_n_jobs, 1)`` and ``X`` is
    converted to ``float64`` before the tree is built. ``alpha`` and ``p`` are
    accepted but not used in this function body.

    Returns
    -------
    (single_linkage_tree, min_spanning_tree)
        The output of ``label`` on the weight-sorted spanning tree edges, and
        the sorted edge array itself when ``gen_min_span_tree`` is true
        (``None`` otherwise).
    """
    leaf_size = max(leaf_size, 3)

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    ball_tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    boruvka = BallTreeBoruvkaAlgorithm(
        ball_tree,
        min_samples,
        metric=metric,
        leaf_size=leaf_size // 3,
        approx_min_span_tree=approx_min_span_tree,
        n_jobs=core_dist_n_jobs,
        **kwargs
    )
    edges = boruvka.spanning_tree()

    # Order the edges by their weight (third column).
    weight_order = np.argsort(edges[:, 2])
    edges = edges[weight_order]

    # Turn the sorted edge list into the standard hierarchical clustering format.
    linkage = label(edges)

    return linkage, (edges if gen_min_span_tree else None)
def _get_n_jobs(n_jobs): """Get number of jobs for the computation. See sklearn/utils/__init__.py for more information. This function reimplements the logic of joblib to determine the actual number of jobs depending on the cpu count. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Parameters ---------- n_jobs : int Number of jobs stated in joblib convention. Returns ------- n_jobs : int The actual number of jobs as positive integer. Examples -------- >>> from sklearn.utils import _get_n_jobs >>> _get_n_jobs(4) 4 >>> jobs = _get_n_jobs(-2) >>> assert jobs == max(cpu_count() - 1, 1) >>> _get_n_jobs(0) Traceback (most recent call last): ... ValueError: Parameter n_jobs == 0 has no meaning. """ if n_jobs < 0: return max(cpu_count() + 1 + n_jobs, 1) elif n_jobs == 0: raise ValueError('Parameter n_jobs == 0 has no meaning.') else: return n_jobs
def compute_geodesic_distance_matrix(verts, tris):
    """Compute geodesic distances from every vertex, in parallel.

    The vertex indices ``0 .. len(verts) - 1`` are cut into contiguous chunks
    (about one per CPU), each chunk is handed to
    ``compute_geodesic_distances(verts, tris, chunk)`` through joblib, and the
    per-chunk results are stacked with ``np.vstack``.

    Parameters
    ----------
    verts : sequence
        Mesh vertices; only ``len(verts)`` is used here, the object itself is
        forwarded to ``compute_geodesic_distances``.
    tris : sequence
        Mesh triangles, forwarded unchanged.

    Returns
    -------
    numpy.ndarray
        The vertically stacked chunk results (presumably one row per source
        vertex -- confirm against ``compute_geodesic_distances``). An empty
        ``(0, 0)`` array when ``verts`` is empty.
    """
    # Function-call form works on both Python 2 and Python 3.
    print("precomputing geodesic distance...")

    n_verts = len(verts)
    if n_verts == 0:
        # chunk_size would be 0 below, making the range step invalid.
        return np.empty((0, 0))

    n_chunks = cpu_count()
    chunk_size = int(np.ceil(n_verts / float(n_chunks)))
    sources = np.arange(n_verts)
    D = Parallel(n_chunks)(
        delayed(compute_geodesic_distances)(
            verts, tris, sources[start:start + chunk_size])
        for start in range(0, n_verts, chunk_size))
    return np.vstack(D)
def compute_geodesic_distance_matrix(verts, tris):
    """Compute geodesic distances from every vertex, in parallel.

    The vertex indices ``0 .. len(verts) - 1`` are cut into contiguous chunks
    (about one per CPU), each chunk is handed to
    ``compute_geodesic_distances(verts, tris, chunk)`` through joblib, and the
    per-chunk results are stacked with ``np.vstack``.

    Parameters
    ----------
    verts : sequence
        Mesh vertices; only ``len(verts)`` is used here, the object itself is
        forwarded to ``compute_geodesic_distances``.
    tris : sequence
        Mesh triangles, forwarded unchanged.

    Returns
    -------
    numpy.ndarray
        The vertically stacked chunk results (presumably one row per source
        vertex -- confirm against ``compute_geodesic_distances``). An empty
        ``(0, 0)`` array when ``verts`` is empty.
    """
    # Function-call form works on both Python 2 and Python 3.
    print("precomputing geodesic distance...")

    n_verts = len(verts)
    if n_verts == 0:
        # chunk_size would be 0 below, making the range step invalid.
        return np.empty((0, 0))

    n_chunks = cpu_count()
    chunk_size = int(np.ceil(n_verts / float(n_chunks)))
    sources = np.arange(n_verts)
    D = Parallel(n_chunks)(
        delayed(compute_geodesic_distances)(
            verts, tris, sources[start:start + chunk_size])
        for start in range(0, n_verts, chunk_size))
    return np.vstack(D)
def junction_make(config, *args, **kwargs):
    """Run the junction pipeline: junction search, BLAST, then BLAST parsing.

    Options are read from ``kwargs``: ``threads``, ``unmapped``, ``genome``,
    ``seq``, ``exclude_seq``, ``dir`` and ``interactive``. ``config`` is not
    used in this function body.

    In interactive mode the user is asked before the search/BLAST stage
    (declining skips the rest) and again before parsing (declining exits the
    process with status 1). In non-interactive mode all stages run in order.
    """
    click.echo(green_fg("\n{} Junction Make {}\n".format(">" * 10, "<" * 10)))

    # Fall back to every available CPU when no thread count was given.
    threads = kwargs['threads'] or parallel.cpu_count()

    if kwargs['unmapped']:
        input_data_folder = 'unmapped_sam_files'
    else:
        input_data_folder = 'sam_files'

    # Output folder names are managed here.
    junction_folder = 'junction_files'  # junction reads
    blast_results_folder = 'blast_results'  # blast results
    blast_results_query = 'blast_results_query'  # blast results dictionary

    def _split_sequences(text):
        return text.replace(" ", "").split(",")

    # Genome default first; an explicit --seq value overrides it.
    junction_sequence = _split_sequences(junction_sequences[kwargs['genome']])
    if kwargs['seq'] != "":
        junction_sequence = _split_sequences(kwargs['seq'])
    exclusion_sequence = kwargs['exclude_seq'].replace(" ", "")

    blast_db = blast_dbs[kwargs['genome']]
    gene_list_file = gene_lists[kwargs['genome']]

    # verify if the options provided are valid
    verify_options(*args, **kwargs)

    # create folders for junction make
    check_and_create_folders(
        kwargs['dir'],
        [junction_folder, blast_results_folder, blast_results_query],
        interactive=kwargs['interactive'])

    def _search_and_blast():
        # search for junctions
        junction_search(kwargs['dir'], junction_folder, input_data_folder,
                        blast_results_folder, junction_sequence,
                        exclusion_sequence, threads)
        # blast the junctions
        blast_search(kwargs['dir'], blast_db, blast_results_folder)

    def _parse_results():
        # parse blast results
        parse_blast_results(kwargs['dir'], blast_results_folder,
                            blast_results_query, gene_list_file, threads)

    if not kwargs['interactive']:
        _search_and_blast()
        _parse_results()
    elif click.confirm(
            magenta_fg('\nDo you want to search junctions and blast?')):
        _search_and_blast()
        if click.confirm(magenta_fg('\nDo you want to parse blast results')):
            _parse_results()
        else:
            click.echo(red_fg("ABORTING..."))
            sys.exit(1)
    else:
        click.echo(red_fg("...Skipping search junctions and blast..."))
def test_nested_parallelism_limit(backend):
    """Check the backend type reported at each nesting level under ``backend``.

    Level 0 must be ``backend``'s own backend class, level 1 is expected to be
    threading (or sequential on a single-CPU machine) and levels 2 and 3 are
    expected to be sequential.
    """
    with parallel_backend(backend, n_jobs=2):
        observed = _recursive_backend_info()

    # On a single-CPU machine the second level is expected to be sequential.
    second_level = ('SequentialBackend' if cpu_count() == 1
                    else 'ThreadingBackend')
    expected = [
        (backend.title() + 'Backend', 0),
        (second_level, 1),
        ('SequentialBackend', 2),
        ('SequentialBackend', 3),
    ]
    assert observed == expected
def _calculate_n_jobs_and_actual_iters(self): # because HpBandSter assigns n_iter jobs to each worker, we need to divide n_jobs = self.n_jobs if not n_jobs: n_jobs = 1 elif n_jobs < 0: try: import psutil cpus = int( os.environ.get("LOKY_MAX_CPU_COUNT", psutil.cpu_count(logical=False))) except: cpus = cpu_count() n_jobs = max(cpus + 1 + n_jobs, 1) if n_jobs > self.n_iter: n_jobs = self.n_iter actual_iterations = self.n_iter // n_jobs + (self.n_iter % n_jobs > 0) return (n_jobs, actual_iterations)
def blast_search(directory, db_name, blast_results_folder):
    """Run ``blastn`` for every ``.fa`` file found in the blast results folder.

    The blastn executable is looked up under ``~/.deepn/data/blast`` (with an
    ``.exe`` suffix on Windows platforms) and the database under
    ``~/.deepn/<db_name>``. For each query file the output is written next to
    it, with ``.junctions.fa`` replaced by ``.blast.txt``. An empty query file
    is reported as an error and terminates the process with exit status 1.
    """
    exe_suffix = '.exe' if _platform.startswith('win') else ''
    home_dir = os.path.expanduser('~')
    blast_path = os.path.join(home_dir, ".deepn", "data", "blast")
    db_path = os.path.join(home_dir, ".deepn", db_name)

    click.echo(green_fg("\n>>> Selected Blast DB: %s" % db_name))

    for file_name in get_file_list(directory, blast_results_folder, ".fa"):
        query_path = os.path.join(directory, blast_results_folder, file_name)

        if os.path.getsize(query_path) == 0:
            click.echo(
                red_fg("\n>>> ERROR: File %s does not have any junctions, "
                       "please check if they right genome was chosen." %
                       file_name))
            sys.exit(1)
        else:
            started = time.time()
            result_path = os.path.join(
                directory, blast_results_folder,
                file_name.replace(".junctions.fa", '.blast.txt'))
            click.echo(
                yellow_fg("\n>>> Running BLAST search for file: " + file_name))

            command = [
                os.path.join(blast_path, 'blastn' + exe_suffix),
                '-query', query_path,
                '-db', db_path,
                '-task', 'blastn',
                '-dust', 'no',
                '-num_threads', str(parallel.cpu_count()),
                '-outfmt', '7',
                '-out', result_path,
                '-evalue', '0.2',
                '-max_target_seqs', '10',
            ]
            # Block until this BLAST run has finished before timing it.
            subprocess.Popen(command, shell=False).wait()
            finished = time.time()

            hours, minutes, seconds = elapsed_time(started, finished)
            click.echo(
                cyan_fg(
                    "\nFinished blasting file %s in time %d hr, %d min, %d sec"
                    % (file_name, hours, minutes, seconds)))
num_mutations = randint(1, max_mutations) for mutation in range(num_mutations): street = choice(streets) idx = choice(street) chromosome[idx] = randint(0, 6) return key_chromosome(chromosome) if __name__ == '__main__': population_size = 50 nr_populations = 10 remain_perc = 0.2 mutate_perc = 0.01 population_size = cpu_count() * ((population_size / cpu_count()) + 1) population = set(generate_random() for _ in xrange(population_size)) evaluated = {} for generation in range(1, nr_populations + 1): results = Parallel(n_jobs=-1, verbose=100)(delayed(calculate)(chromosome, buy_all) for chromosome in population) for score, chromosome in results: evaluated[chromosome] = score winners = [] for chromosome, score in evaluated.iteritems(): winners.append((score, chromosome)) winners.sort(reverse=True)
def test_cpu_count():
    """``cpu_count`` must report a strictly positive value."""
    n_cpus = cpu_count()
    assert n_cpus > 0
for mutation in range(num_mutations): street = choice(streets) idx = choice(street) chromosome[idx] = randint(0, 6) return key_chromosome(chromosome) if __name__ == '__main__': population_size = 50 nr_populations = 25 remain_perc = 0.2 mutate_perc = 0.01 population_size = cpu_count() * ((population_size / cpu_count()) + 1) population = set(generate_random() for _ in xrange(population_size)) population.add(buy_all) # population.add(key_chromosome(map(int, '0,6,0,5,0,2,4,0,4,4,0,6,0,5,5,2,4,0,5,3,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,4,0,5,0,3'.split(",")))) # population.add(key_chromosome(map(int, '0,6,0,6,0,4,4,0,4,4,0,3,0,5,5,3,4,0,5,6,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,4,0,5,0,2'.split(",")))) for generation in range(1, nr_populations + 1): to_schedule = [] for chromosome in population: get_player_stats(chromosome) for chromosome in population: opponents = [ buy_all,
class Significance(object):
    """Test for pairwise significance between systems"""

    # Maps a method name to the function that counts randomised trials.
    METHODS = {
        'permute': count_permutation_trials,
        #'bootstrap': count_bootstrap_trials,
    }

    def __init__(self, systems, gold, trials=10000, method='permute',
                 n_jobs=1, metrics=['precision', 'recall', 'fscore'],
                 fmt='json'):
        """Store the comparison configuration.

        Parameters
        ----------
        systems : sequence of str
            Paths of the system outputs to compare; at least two are needed.
        gold : str
            Path of the gold standard.
        trials : int
            Total number of randomised trials per comparison.
        method : str
            Key into ``METHODS``.
        n_jobs : int
            Number of parallel jobs, in joblib convention.
        metrics : list of str
            Passed through to the formatter. The default list is shared
            between instances, so treat it as read-only.
        fmt : str or callable
            Either a key into ``FMTS`` or a callable invoked as
            ``fmt(results, metrics)``.

        Raises
        ------
        ValueError
            For fewer than two systems or an unknown ``method``.
        ImportError
            If joblib could not be imported.
        """
        if len(systems) < 2:
            raise ValueError('Require at least two systems to compare')
        if method not in self.METHODS:
            raise ValueError('Unsupported method: {}'.format(method))
        # Check whether import worked, generate a more useful error.
        if Parallel is None:
            raise ImportError(
                'Package: "joblib" not available, please install to run significance tests.'
            )
        self.systems = systems
        self.gold = gold
        self.method = method
        self.trials = trials
        self.n_jobs = n_jobs
        self.metrics = metrics
        # ``fmt is not callable`` compared against the builtin itself and was
        # therefore always true; test callability instead.
        self.fmt = fmt if callable(fmt) else FMTS[fmt]

    def __call__(self):
        """Evaluate every system against gold and format all pairwise tests."""
        all_counts = defaultdict(dict)
        # sorted() consumes each reader fully, so the files can be closed
        # as soon as the sorted lists exist.
        with open(self.gold) as gold_file:
            gold = sorted(Reader(gold_file))
        for path in self.systems:
            with open(path) as system_file:
                system = sorted(Reader(system_file))
            for match, per_doc, overall in Evaluate.count_all(system, gold):
                all_counts[match][path] = (per_doc, overall)

        # Present match types in the order they are listed in MATCHES.
        ordered_counts = sorted(all_counts.items(),
                                key=lambda item: MATCHES.index(item[0]))
        results = [
            {
                'sys1': sys1,
                'sys2': sys2,
                'match': match,
                'stats': self.significance(match_counts[sys1],
                                           match_counts[sys2]),
            }
            for sys1, sys2 in itertools.combinations(self.systems, 2)
            for match, match_counts in ordered_counts
        ]
        return self.fmt(results, self.metrics)

    def significance(self, counts1, counts2):
        """Estimate per-metric significance of the difference of two systems.

        Parameters
        ----------
        counts1, counts2 : tuple
            ``(per_doc, overall)`` pairs for the two systems.

        Returns
        -------
        dict
            ``{metric: {'diff': observed difference, 'p': p-value}}`` where
            the p-value is ``(sum of trial counts + 1) / (trials + 1)``.

        Raises
        ------
        ValueError
            If ``self.n_jobs`` is 0.
        """
        per_doc1, overall1 = counts1
        per_doc2, overall2 = counts2
        # TODO: limit to metrics
        base_diff = _result_diff(overall1, overall2)
        randomized_diffs = functools.partial(self.METHODS[self.method],
                                             per_doc1, per_doc2, base_diff)

        n_jobs = self.n_jobs
        if n_jobs == 0:
            raise ValueError('Parameter n_jobs == 0 has no meaning.')
        if n_jobs < 0:
            # joblib convention: -1 is all CPUs, -2 all but one, ...
            n_jobs = max(cpu_count() + 1 + n_jobs, 1)

        # Spread the trials as evenly as possible over the jobs.
        base, extra = divmod(self.trials, n_jobs)
        shares = [base + 1] * extra + [base] * (n_jobs - extra)

        results = Parallel(n_jobs=self.n_jobs)(
            delayed(randomized_diffs)(share) for share in shares)

        # Sum by key rather than by position so the totals do not depend on
        # every worker's dict iterating in the same order.
        totals = defaultdict(int)
        for result in results:
            for metric, count in result.items():
                totals[metric] += count

        return {
            metric: {
                'diff': base_diff[metric],
                # float() keeps this a true division on Python 2 as well.
                'p': float(total + 1) / (self.trials + 1),
            }
            for metric, total in totals.items()
        }
def druhg(X, max_ranking=16, limit1=None, limit2=None, exclude=None,
          fix_outliers=0, metric='minkowski', p=2, algorithm='best',
          leaf_size=40, verbose=False, core_n_jobs=None, **kwargs):
    """Perform DRUHG clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array of shape (n_samples, n_features), or
        array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``. Plain Python lists are rejected.

    max_ranking : int, optional (default=16)
        The maximum number of neighbors to search.
        Affects performance vs precision. It is capped at ``n_samples - 1``.

    limit1 : float, optional (default=sqrt(size))
        Clusters that are smaller than this limit treated as noise.
        Use 1 to find True outliers.
        Numbers under 1 treated as percentage of the dataset size

    limit2 : float, optional (default=size/2)
        Clusters with size OVER this limit treated as noise.
        Use it to break down big clusters.
        Numbers up to 1 treated as percentage of the dataset size

    exclude: list, optional (default=None)
        Clusters with these indexes would not be formed.
        Use it for surgical cluster removal.

    fix_outliers: int, optional (default=0)
        In case of 1 - all outliers will be assigned to the nearest cluster

    metric : string or callable, optional (default='minkowski')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. A callable metric is rejected by the KD-tree
        validation, so it needs ``algorithm='balltree'``.

    p : int, optional (default=2)
        p value to use if using the minkowski metric.
        NOTE(review): p is validated here but not forwarded to the tree or to
        UniversalReciprocity in this function -- confirm this is intended.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    algorithm : string, optional (default='best')
        Exactly, which algorithm to use; DRUHG has variants specialized
        for different characteristics of the data. By default, this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``kdtree``
            * ``balltree``
        If you want it to be accurate add:
            * ``slow``
        Unrecognised names fall back to the KD-tree.

    verbose : bool, optional (default=False)
        Not used in this function body.

    core_n_jobs : int, optional (default=None)
        Number of parallel jobs to run in neighbors distance computations (if
        supported by the specific algorithm).
        ``None`` uses all CPUs; a negative value uses
        (n_cpus + 1 + core_n_jobs), with a minimum of one job.

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : ndarray, shape (n_samples)
        Cluster labels for each point. Noisy samples are given the label -1.

    min_spanning_tree : ndarray, shape (2*n_samples - 2)
        The minimum spanning tree as edgepairs.

    values_edges : ndarray, shape (n_samples - 1)
        Values of the edges.

    Raises
    ------
    ValueError
        For list input, an invalid ``max_ranking``, ``leaf_size``, ``limit1``,
        ``limit2`` or negative ``p``, a non-square precomputed matrix, or a
        metric the KD-tree does not support.
    TypeError
        If the minkowski metric is requested with ``p=None``.

    References
    ----------
    None
    """
    if isinstance(X, list):
        raise ValueError('X must be array! Not a list!')

    size = X.shape[0]

    if core_n_jobs is None:
        core_n_jobs = max(cpu_count(), 1)
    elif core_n_jobs < 0:
        core_n_jobs = max(cpu_count() + 1 + core_n_jobs, 1)

    if max_ranking is not None:
        if type(max_ranking) is not int:
            raise ValueError('Max ranking must be integer!')
        if max_ranking < 0:
            raise ValueError('Max ranking must be non-negative integer!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')

    # Collects a message for every parameter that falls back to its default.
    printout = ''
    if max_ranking is None:
        max_ranking = 16
        printout += 'max_ranking is set to ' + str(max_ranking) + ', '
    # A point cannot have more neighbours than there are other points.
    max_ranking = min(size - 1, max_ranking)

    if limit1 is None:
        limit1 = int(np.sqrt(size))
        printout += 'limit1 is set to ' + str(limit1) + ', '
    else:
        if limit1 < 0:
            raise ValueError('Limit1 must be non-negative integer!')
        if limit1 < 1:
            # Fractions are a share of the dataset size.
            limit1 = int(limit1 * size)

    if limit2 is None:
        limit2 = int(size / 2 + 1)
        printout += 'limit2 is set to ' + str(limit2) + ', '
    else:
        if limit2 < 0:
            raise ValueError('Limit2 must be non-negative integer!')
        if limit2 <= 1:
            limit2 = int(limit2 * size + 1)

    if algorithm == 'best':
        algorithm = 'kd_tree'
    algorithm_name = algorithm.lower()

    if X.dtype != np.float64:
        print('Converting data to numpy float64')
        X = X.astype(np.float64)

    # A callable metric has no ``lower``; only string metrics can request
    # the precomputed path.
    metric_is_precomputed = (not callable(metric)
                             and "precomputed" in metric.lower())

    algo_code = 0
    if ("precomputed" in algorithm_name or metric_is_precomputed
            or issparse(X)):
        algo_code = 2
        if issparse(X):
            algo_code = 3
        elif len(X.shape) == 2 and X.shape[0] != X.shape[1]:
            raise ValueError('Precomputed matrix is not a square.')
        tree = X
    else:
        # The Cython routines used require contiguous arrays
        if not X.flags['C_CONTIGUOUS']:
            X = np.array(X, dtype=np.double, order='C')

        wants_kd_tree = "kd" in algorithm_name and "tree" in algorithm_name
        wants_ball_tree = ("ball" in algorithm_name
                           and "tree" in algorithm_name)
        if wants_ball_tree and not wants_kd_tree:
            algo_code = 1
            tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
        else:
            # KD-tree is both the explicit choice and the fallback for
            # unrecognised algorithm names (no error is raised for those).
            algo_code = 0
            if metric not in KDTree.valid_metrics:
                raise ValueError('Metric: %s\n'
                                 'Cannot be used with KDTree' % metric)
            tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    is_slow_and_deterministic = 1 if "slow" in algorithm_name else 0

    if printout:
        print('Druhg is using defaults for: ' + printout)

    ur = UniversalReciprocity(algo_code, tree,
                              max_neighbors_search=max_ranking,
                              metric=metric,
                              leaf_size=leaf_size // 3,
                              is_slow=is_slow_and_deterministic,
                              n_jobs=core_n_jobs,
                              **kwargs)

    pairs, values = ur.get_tree()

    labels = label(pairs, values, size,
                   exclude=exclude,
                   limit1=int(limit1),
                   limit2=int(limit2),
                   fix_outliers=fix_outliers)

    return (labels, pairs, values)