def multiprocess_performance(pos_iterable, neg_iterable, vectorizer=None,
                             estimator=None, pos_block_size=100,
                             neg_block_size=100, n_jobs=-1):
    """Collect estimator predictions over positive and negative blocks.

    Both iterables are cut into blocks with ``chunks`` and every block is
    submitted through ``apply_async`` as
    ``serial_pre_process(seqs, vectorizer)``.  The value obtained from each
    job is used as a data matrix (``.shape[0]`` rows).  Positive and
    negative blocks are consumed pairwise via ``izip``; rows coming from a
    positive block get target ``1`` and rows from a negative block ``-1``.

    Parameters
    ----------
    pos_iterable, neg_iterable : iterable
        Positive and negative instances.
    vectorizer : object
        Forwarded unchanged to ``serial_pre_process``.
    estimator : object
        Must provide ``decision_function`` and ``predict``.
    pos_block_size, neg_block_size : int
        Block sizes handed to ``chunks``.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    preds : numpy array
        ``np.hstack`` of the ``decision_function`` output of every block.
    binary_preds : numpy array
        ``np.hstack`` of the ``predict`` output of every block.
    true_targets : numpy array
        ``np.hstack`` of the per-block ``1`` / ``-1`` targets.
    """
    start_time = time.time()
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: the pool must be released even when a job or the
    # estimator raises, otherwise the worker processes are leaked.
    try:
        pos_results = [
            apply_async(pool, serial_pre_process, args=(seqs, vectorizer))
            for seqs in chunks(pos_iterable, pos_block_size)]
        neg_results = [
            apply_async(pool, serial_pre_process, args=(seqs, vectorizer))
            for seqs in chunks(neg_iterable, neg_block_size)]
        logger.debug('Setup %.2f secs', time.time() - start_time)
        logger.debug('Performance evaluation')
        start_time = time.time()
        preds = []
        binary_preds = []
        true_targets = []
        # NOTE(review): izip presumably stops at the shorter of the two
        # result lists, so unpaired trailing blocks are ignored - confirm.
        for i, (p, n) in enumerate(izip(pos_results, neg_results)):
            loc_start_time = time.time()
            pos_data_matrix = p.get()
            neg_data_matrix = n.get()
            y = np.array([1] * pos_data_matrix.shape[0] +
                         [-1] * neg_data_matrix.shape[0])
            true_targets.append(y)
            data_matrix = vstack([pos_data_matrix, neg_data_matrix])
            preds.append(estimator.decision_function(data_matrix))
            binary_preds.append(estimator.predict(data_matrix))
            d_time = time.time() - start_time
            d_loc_time = time.time() - loc_start_time
            size = pos_data_matrix.shape
            logger.debug('%d %s (%.2f secs) (delta: %.2f)',
                         i, size, d_time, d_loc_time)
    finally:
        pool.close()
        pool.join()
    preds = np.hstack(preds)
    binary_preds = np.hstack(binary_preds)
    true_targets = np.hstack(true_targets)
    return preds, binary_preds, true_targets
def multiprocess_vectorize(iterable, vectorizer=None, pos_block_size=100,
                           n_jobs=-1):
    """Build one data matrix from ``iterable`` using a process pool.

    ``iterable`` is cut into blocks with ``chunks``; each block is submitted
    through ``apply_async`` as ``serial_pre_process(seqs, vectorizer)``.
    The per-block results (objects with a ``.shape``) are stacked with
    ``vstack`` in submission order.

    Parameters
    ----------
    iterable : iterable
        Input instances.
    vectorizer : object
        Forwarded unchanged to ``serial_pre_process``.
    pos_block_size : int
        Block size handed to ``chunks``.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    data_matrix
        Result of ``vstack`` over all per-block matrices.
    """
    start_time = time.time()
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_pre_process, args=(seqs, vectorizer))
            for seqs in chunks(iterable, pos_block_size)]
        logger.debug('Setup %.2f secs', time.time() - start_time)
        logger.debug('Vectorizing')
        start_time = time.time()
        matrices = []
        for i, p in enumerate(results):
            loc_start_time = time.time()
            pos_data_matrix = p.get()
            # append the whole block: ``matrices += block`` would iterate
            # the matrix and add its rows one at a time, which is slow and
            # only works for matrix types that support row iteration.
            matrices.append(pos_data_matrix)
            d_time = time.time() - start_time
            d_loc_time = time.time() - loc_start_time
            size = pos_data_matrix.shape
            logger.debug('%d %s (%.2f secs) (delta: %.2f)',
                         i, size, d_time, d_loc_time)
    finally:
        pool.close()
        pool.join()
    data_matrix = vstack(matrices)
    return data_matrix
def multiprocess_score(iterable, vectorizer=None, estimator=None,
                       block_size=100, n_jobs=-1):
    """Score the items of ``iterable`` block by block with a process pool.

    ``iterable`` is cut into blocks with ``chunks``; each block is submitted
    through ``apply_async`` as ``serial_score(seqs, vectorizer, estimator)``.

    Parameters
    ----------
    iterable : iterable
        Input instances.
    vectorizer, estimator : object
        Forwarded unchanged to ``serial_score``.
    block_size : int
        Block size handed to ``chunks``.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    scores_items : list
        The per-block results concatenated (``+=``) in submission order.
    """
    start_time = time.time()
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_score,
                        args=(seqs, vectorizer, estimator))
            for seqs in chunks(iterable, block_size)]
        logger.debug('Setup %.2f secs', time.time() - start_time)
        logger.debug('Predicting')
        start_time = time.time()
        scores_items = []
        for i, p in enumerate(results):
            loc_start_time = time.time()
            scores_items += p.get()
            d_time = time.time() - start_time
            d_loc_time = time.time() - loc_start_time
            logger.debug('%d (%.2f secs) (delta: %.2f)',
                         i, d_time, d_loc_time)
    finally:
        pool.close()
        pool.join()
    return scores_items
def multiprocess_pre_process(iterable, pre_processor=None,
                             pre_processor_args=None, n_blocks=5,
                             block_size=None, n_jobs=8):
    """Run ``serial_pre_process`` over slices of ``iterable`` in a pool.

    The input is materialised as a list, split into ``(start, end)``
    intervals by ``compute_intervals`` and each slice is submitted through
    ``apply_async`` as
    ``serial_pre_process(slice, pre_processor, pre_processor_args)``.

    Parameters
    ----------
    iterable : iterable
        Input items; fully consumed into a list.
    pre_processor, pre_processor_args : object
        Forwarded unchanged to ``serial_pre_process``.
    n_blocks, block_size : int or None
        Forwarded to ``compute_intervals`` together with the input size.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    list
        All items of all per-slice results, flattened in interval order.
    """
    iterable = list(iterable)
    intervals = compute_intervals(
        size=len(iterable), n_blocks=n_blocks, block_size=block_size)
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_pre_process,
                        args=(iterable[start:end],
                              pre_processor,
                              pre_processor_args))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    return [item for items in output for item in items]
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance',
                     estimator=None, multi_process=False,
                     invert_score=False):
    """Annotate graphs with ``vectorizer.annotate``, optionally in parallel.

    Annotation is slow, so the first graph of an annotated batch is tagged
    with ``graph['mass_annotate_mp_was_here'] = True``; when the first graph
    of ``inputs`` already carries that tag, ``inputs`` is returned untouched
    so that fit and predict do not annotate twice.

    Parameters
    ----------
    inputs : sequence of graphs
        Must support indexing; ``None`` entries are dropped before
        annotation.
    vectorizer : object
        Must provide ``annotate(graphs, estimator=...)``.
    score_attribute : str
        Only forwarded to the recursive calls in multi-process mode; it is
        not read by the serial path.
    estimator : object
        Forwarded to ``vectorizer.annotate``.
    multi_process : bool
        When true, ``inputs`` is split with ``eden.grouper(inputs, 50)`` and
        every group is annotated by a serial call of this function in a
        pool worker.
    invert_score : bool
        Accepted for compatibility; currently ignored.

    Returns
    -------
    list
        The annotated graphs (or ``inputs`` itself when already annotated).
    """
    # Already annotated?  Guard against empty input before indexing.
    if len(inputs) > 0 and inputs[0].graph.get(
            'mass_annotate_mp_was_here', False):
        return inputs

    if not multi_process:
        # NOTE(review): groups built by eden.grouper presumably contain
        # None padding, which is why None entries are filtered - confirm.
        inputs = filter(lambda v: v is not None, inputs)
        res = list(vectorizer.annotate(inputs, estimator=estimator))
        if res:
            res[0].graph['mass_annotate_mp_was_here'] = True
        return res

    pool = mp.Pool()
    # try/finally: release the pool even when a job raises.
    try:
        mpres = [
            eden.apply_async(pool, mass_annotate_mp,
                             args=(graphs, vectorizer,
                                   score_attribute, estimator))
            for graphs in eden.grouper(inputs, 50)]
        result = []
        for res in mpres:
            result += res.get()
    finally:
        pool.close()
        pool.join()
    return result
def multiprocess_vectorize(graphs, vectorizer=None, fit_flag=False,
                           n_blocks=5, block_size=None, n_jobs=8):
    """Vectorize ``graphs`` slice by slice with a process pool.

    Fitting (when requested) is done serially on the full list before any
    job is submitted.  The list is then split into ``(start, end)``
    intervals by ``compute_intervals`` and every slice is submitted through
    ``apply_async`` as ``serial_vectorize(slice, vectorizer, False)``.

    Parameters
    ----------
    graphs : iterable
        Input graphs; fully consumed into a list.
    vectorizer : object
        Must provide ``fit`` when ``fit_flag`` is true; forwarded to
        ``serial_vectorize``.
    fit_flag : bool
        When true, ``vectorizer.fit(graphs)`` is called first.
    n_blocks, block_size : int or None
        Forwarded to ``compute_intervals`` together with the input size.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    data_matrix
        ``vstack(..., format="csr")`` of the per-slice results.
    """
    import multiprocessing as mp

    graphs = list(graphs)
    # fitting happens in a serial fashion
    if fit_flag:
        vectorizer.fit(graphs)
    intervals = compute_intervals(
        size=len(graphs), n_blocks=n_blocks, block_size=block_size)
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_vectorize,
                        args=(graphs[start:end], vectorizer, False))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    data_matrix = vstack(output, format="csr")
    return data_matrix
def multiprocess_subarray(
    iterable, vectorizer=None, estimator=None, min_subarray_size=5,
    max_subarray_size=10, block_size=100, n_jobs=-1
):
    """Run ``serial_subarray`` over blocks of ``iterable`` in a pool.

    ``iterable`` is cut into blocks with ``chunks``; each block is submitted
    through ``apply_async`` as ``serial_subarray(seqs, vectorizer,
    estimator, min_subarray_size, max_subarray_size)``.

    Parameters
    ----------
    iterable : iterable
        Input instances.
    vectorizer, estimator : object
        Forwarded unchanged to ``serial_subarray``.
    min_subarray_size, max_subarray_size : int
        Forwarded unchanged to ``serial_subarray``.
    block_size : int
        Block size handed to ``chunks``.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    subarrays_items : list
        The per-block results concatenated (``+=``) in submission order.
    """
    start_time = time.time()
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_subarray,
                        args=(seqs, vectorizer, estimator,
                              min_subarray_size, max_subarray_size))
            for seqs in chunks(iterable, block_size)
        ]
        logger.debug("Setup %.2f secs", time.time() - start_time)
        logger.debug("Annotating")
        start_time = time.time()
        subarrays_items = []
        for i, p in enumerate(results):
            loc_start_time = time.time()
            subarrays_items += p.get()
            d_time = time.time() - start_time
            d_loc_time = time.time() - loc_start_time
            logger.debug("%d (%.2f secs) (delta: %.2f)",
                         i, d_time, d_loc_time)
    finally:
        pool.close()
        pool.join()
    return subarrays_items
def multiprocess_pre_process(iterable, pre_processor=None,
                             pre_processor_args=None, n_blocks=5,
                             block_size=None, n_jobs=8):
    """Run ``serial_pre_process`` over slices of ``iterable`` in a pool.

    The input is materialised as a list, split into ``(start, end)``
    intervals by ``compute_intervals`` and each slice is submitted through
    ``apply_async`` as
    ``serial_pre_process(slice, pre_processor, pre_processor_args)``.

    Parameters
    ----------
    iterable : iterable
        Input items; fully consumed into a list.
    pre_processor, pre_processor_args : object
        Forwarded unchanged to ``serial_pre_process``.
    n_blocks, block_size : int or None
        Forwarded to ``compute_intervals`` together with the input size.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    list
        All items of all per-slice results, flattened in interval order.
    """
    import multiprocessing as mp

    iterable = list(iterable)
    intervals = compute_intervals(
        size=len(iterable), n_blocks=n_blocks, block_size=block_size)
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_pre_process,
                        args=(iterable[start:end],
                              pre_processor,
                              pre_processor_args))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    return [item for items in output for item in items]
def multiprocess_fit(pos_iterable, neg_iterable, vectorizer=None,
                     estimator=None, pos_block_size=100,
                     neg_block_size=100, n_jobs=-1):
    """Incrementally fit ``estimator`` on positive and negative blocks.

    Both iterables are cut into blocks with ``chunks`` and every block is
    submitted through ``apply_async`` as
    ``serial_pre_process(seqs, vectorizer)``.  Positive and negative blocks
    are consumed pairwise via ``izip``; each pair is stacked into one data
    matrix and passed to ``estimator.partial_fit`` with targets ``1``
    (positive rows) and ``-1`` (negative rows).

    Parameters
    ----------
    pos_iterable, neg_iterable : iterable
        Positive and negative instances.
    vectorizer : object
        Forwarded unchanged to ``serial_pre_process``.
    estimator : object
        Must provide ``partial_fit(X, y, classes=...)``.
    pos_block_size, neg_block_size : int
        Block sizes handed to ``chunks``.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    estimator
        The same estimator object after all ``partial_fit`` calls.
    """
    start_time = time.time()
    classes = np.array([1, -1])
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job or partial_fit raises.
    try:
        pos_results = [
            apply_async(pool, serial_pre_process, args=(seqs, vectorizer))
            for seqs in chunks(pos_iterable, pos_block_size)]
        neg_results = [
            apply_async(pool, serial_pre_process, args=(seqs, vectorizer))
            for seqs in chunks(neg_iterable, neg_block_size)]
        logger.debug('Setup %.2f secs', time.time() - start_time)
        logger.debug('Fitting')
        start_time = time.time()
        # NOTE(review): izip presumably stops at the shorter of the two
        # result lists, so unpaired trailing blocks are ignored - confirm.
        for i, (p, n) in enumerate(izip(pos_results, neg_results)):
            loc_start_time = time.time()
            pos_data_matrix = p.get()
            neg_data_matrix = n.get()
            y = np.array([1] * pos_data_matrix.shape[0] +
                         [-1] * neg_data_matrix.shape[0])
            data_matrix = vstack([pos_data_matrix, neg_data_matrix])
            estimator.partial_fit(data_matrix, y, classes=classes)
            d_time = time.time() - start_time
            d_loc_time = time.time() - loc_start_time
            size = pos_data_matrix.shape
            logger.debug('%d %s (%.2f secs) (delta: %.2f)',
                         i, size, d_time, d_loc_time)
    finally:
        pool.close()
        pool.join()
    return estimator
def _multiprocess_graph_motif(self, seqs):
    """Run ``self._serial_graph_motif`` over slices of ``seqs`` in a pool.

    ``seqs`` is split into ``(start, end)`` intervals by
    ``compute_intervals`` (using ``self.n_blocks`` and ``self.block_size``)
    and every slice is submitted through ``apply_async`` as
    ``self._serial_graph_motif(slice, True)``.

    Parameters
    ----------
    seqs : sequence
        Must support ``len`` and slicing.

    Returns
    -------
    list
        The per-slice results chained together in interval order.
    """
    intervals = compute_intervals(size=len(seqs),
                                  n_blocks=self.n_blocks,
                                  block_size=self.block_size)
    if self.n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(processes=self.n_jobs)
    # The pool was previously never closed, leaking its worker processes
    # on every call; close and join it whether or not a job raises.
    try:
        results = [
            apply_async(pool, self._serial_graph_motif,
                        args=(seqs[start:end], True))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    return list(chain.from_iterable(output))
def transform(self, graphs):
    """Transform a list of networkx graphs into a sparse matrix.

    With ``self.n_jobs == 1`` the work is delegated to
    ``self._transform_serial``.  Otherwise ``graphs`` is cut into blocks of
    ``self.block_size`` with ``chunks``, each block is submitted through
    ``apply_async`` to ``self._transform_serial`` and the per-block
    matrices are stacked in submission order.

    Parameters
    ----------
    graphs : list[graphs]
        The input list of networkx graphs.

    Returns
    -------
    data_matrix : array-like, shape = [n_samples, n_features]
        Vector representation of input graphs.

    >>> # transforming the same graph (with different node-ids).
    >>> import networkx as nx
    >>> def get_path_graph(length=4):
    ...     g = nx.path_graph(length)
    ...     for n,d in g.nodes(data=True):
    ...         d['label'] = 'C'
    ...     for a,b,d in g.edges(data=True):
    ...         d['label'] = '1'
    ...     return g
    >>> g = get_path_graph(4)
    >>> g2 = get_path_graph(5)
    >>> g2.remove_node(0)
    >>> g[1][2]['label']='2'
    >>> g2[2][3]['label']='2'
    >>> v = Vectorizer()
    >>> def vec_to_hash(vec):
    ...     return hash(tuple(vec.data + vec.indices))
    >>> vec_to_hash(v.transform([g])) == vec_to_hash (v.transform([g2]))
    True
    """
    if self.n_jobs == 1:
        return self._transform_serial(graphs)

    if self.n_jobs == -1:
        pool = mp.Pool(mp.cpu_count())
    else:
        pool = mp.Pool(self.n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            # one positional argument: the block of graphs
            apply_async(pool, self._transform_serial, args=[subset_graphs])
            for subset_graphs in chunks(graphs, self.block_size)]
        blocks = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    # A single block is returned as-is, exactly as before.
    if len(blocks) == 1:
        return blocks[0]
    # Stack once instead of re-stacking the growing matrix per block.
    # With no blocks at all, vstack receives an empty list (the previous
    # code failed on an unbound local in that case).
    return vstack(blocks)
def transform(self, graphs):
    """Transform a list of networkx graphs into a sparse matrix.

    With ``self.n_jobs == 1`` the work is delegated to
    ``self._transform_serial``.  Otherwise ``graphs`` is cut into blocks of
    ``self.block_size`` with ``chunks``, each block is submitted through
    ``apply_async`` to ``self._transform_serial`` and the per-block
    matrices are stacked in submission order.

    Parameters
    ----------
    graphs : list[graphs]
        The input list of networkx graphs.

    Returns
    -------
    data_matrix : array-like, shape = [n_samples, n_features]
        Vector representation of input graphs.

    >>> # transforming the same graph (with different node-ids).
    >>> import networkx as nx
    >>> def get_path_graph(length=4):
    ...     g = nx.path_graph(length)
    ...     for n,d in g.nodes(data=True):
    ...         d['label'] = 'C'
    ...     for a,b,d in g.edges(data=True):
    ...         d['label'] = '1'
    ...     return g
    >>> g = get_path_graph(4)
    >>> g2 = get_path_graph(5)
    >>> g2.remove_node(0)
    >>> g[1][2]['label']='2'
    >>> g2[2][3]['label']='2'
    >>> v = Vectorizer()
    >>> def vec_to_hash(vec):
    ...     return hash(tuple(vec.data + vec.indices))
    >>> vec_to_hash(v.transform([g])) == vec_to_hash (v.transform([g2]))
    True
    """
    if self.n_jobs == 1:
        return self._transform_serial(graphs)

    if self.n_jobs == -1:
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
    else:
        pool = multiprocessing.Pool(self.n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            # one positional argument: the block of graphs
            apply_async(pool, self._transform_serial, args=[subset_graphs])
            for subset_graphs in chunks(graphs, self.block_size)
        ]
        blocks = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    # A single block is returned as-is, exactly as before.
    if len(blocks) == 1:
        return blocks[0]
    # Stack once instead of re-stacking the growing matrix per block.
    # With no blocks at all, vstack receives an empty list (the previous
    # code failed on an unbound local in that case).
    return vstack(blocks)
def _optimize_parallel(self, reference_graphs):
    """Run ``self._optimize_single`` on every reference graph in a pool.

    Each graph ``g`` is submitted through ``apply_async`` as
    ``self._optimize_single(g)`` to a ``multiprocessing.Pool()`` created
    without a size argument.

    Parameters
    ----------
    reference_graphs : iterable
        Graphs to optimize, one job per graph.

    Returns
    -------
    pareto_set_graphs_list : list
        One ``_optimize_single`` result per input graph, in input order.
    """
    pool = multiprocessing.Pool()
    # try/finally: release the pool even when a job raises.
    try:
        res = [
            apply_async(pool, self._optimize_single, args=(g, ))
            for g in reference_graphs
        ]
        pareto_set_graphs_list = [p.get() for p in res]
    finally:
        pool.close()
        pool.join()
    return pareto_set_graphs_list
def multiprocess_vectorize(graphs, vectorizer=None, n_blocks=5,
                           block_size=None, n_jobs=8):
    """Vectorize ``graphs`` slice by slice with a process pool.

    The input is materialised as a list, split into ``(start, end)``
    intervals by ``compute_intervals`` and every slice is submitted through
    ``apply_async`` as ``serial_vectorize(slice, vectorizer)``.

    Parameters
    ----------
    graphs : iterable
        Input graphs; fully consumed into a list.
    vectorizer : object
        Forwarded unchanged to ``serial_vectorize``.
    n_blocks, block_size : int or None
        Forwarded to ``compute_intervals`` together with the input size.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    X
        ``vstack(..., format="csr")`` of the per-slice results.
    """
    import multiprocessing as mp

    graphs = list(graphs)
    intervals = compute_intervals(
        size=len(graphs), n_blocks=n_blocks, block_size=block_size)
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_vectorize,
                        args=(graphs[start:end], vectorizer))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    X = vstack(output, format="csr")
    return X
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance',
                     estimator=None, multi_process=False,
                     annotate_dilude_scores=False):
    """Annotate graphs with ``vectorizer.annotate``, optionally in parallel.

    Annotation is slow; the first graph of an annotated batch is tagged
    with ``graph['mass_annotate_mp_was_here'] = True``.  The early return
    that used to test this tag is disabled in this version, so every call
    annotates again.

    Parameters
    ----------
    inputs : iterable of graphs
        ``None`` entries are dropped before annotation.
    vectorizer : object
        Must provide ``annotate(graphs, estimator=...)``.
    score_attribute : str
        Node attribute read and rewritten when scores are diluted; the
        code indexes it with ``[0]``, so it must hold a sequence.
    estimator : object
        Forwarded to ``vectorizer.annotate``.
    multi_process : bool
        When true, ``inputs`` is split with ``eden.grouper(inputs, 50)`` and
        every group is annotated by a serial call of this function in a
        pool worker.
    annotate_dilude_scores : bool
        When true, every node score is replaced by an average that mixes
        the node's own score with the scores of its neighbours.

    Returns
    -------
    list
        The annotated graphs.
    """
    def dilute_graph(graph):
        # First pass: compute all new scores from the *original* scores,
        # parking them under 'tmpscore' so that later nodes still see
        # their neighbours' unmodified values.
        for n, d in graph.nodes(data=True):
            neighsum = [graph.node[other][score_attribute][0]
                        for other in graph.neighbors(n)]
            if neighsum:
                # the node's own score is weighted once per neighbour
                own = [graph.node[n][score_attribute][0]] * len(neighsum)
                allfacs = neighsum + own
                score = sum(allfacs) / float(len(allfacs))
            else:
                score = d[score_attribute][0]
            d['tmpscore'] = score
        # Second pass: publish the new scores.
        for n, d in graph.nodes(data=True):
            d[score_attribute] = [d['tmpscore'], 0]

    if not multi_process:
        # NOTE(review): groups built by eden.grouper presumably contain
        # None padding, which is why None entries are filtered - confirm.
        inputs = filter(lambda v: v is not None, inputs)
        res = list(vectorizer.annotate(inputs, estimator=estimator))
        if annotate_dilude_scores:
            # Explicit loop: a bare map() is lazy on Python 3 and would
            # silently skip the dilution.
            for graph in res:
                dilute_graph(graph)
        if res:
            res[0].graph['mass_annotate_mp_was_here'] = True
        return res

    pool = mp.Pool()
    # try/finally: release the pool even when a job raises.
    try:
        mpres = [
            # Workers run the serial path (multi_process=False) and must
            # honour annotate_dilude_scores, which was previously dropped.
            eden.apply_async(pool, mass_annotate_mp,
                             args=(graphs, vectorizer, score_attribute,
                                   estimator, False,
                                   annotate_dilude_scores))
            for graphs in eden.grouper(inputs, 50)]
        result = []
        for res in mpres:
            result += res.get()
    finally:
        pool.close()
        pool.join()
    return result
def _multiprocess_graph_motif(self, seqs):
    """Run ``self._serial_graph_motif`` over slices of ``seqs`` in a pool.

    ``seqs`` is split into ``(start, end)`` intervals by
    ``compute_intervals`` (using ``self.n_blocks`` and ``self.block_size``)
    and every slice is submitted through ``apply_async`` as
    ``self._serial_graph_motif(slice, True)``.

    Parameters
    ----------
    seqs : sequence
        Must support ``len`` and slicing.

    Returns
    -------
    list
        The per-slice results chained together in interval order.
    """
    intervals = compute_intervals(size=len(seqs),
                                  n_blocks=self.n_blocks,
                                  block_size=self.block_size)
    if self.n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(processes=self.n_jobs)
    # The pool was previously never closed, leaking its worker processes
    # on every call; close and join it whether or not a job raises.
    try:
        results = [
            apply_async(pool, self._serial_graph_motif,
                        args=(seqs[start:end], True))
            for start, end in intervals
        ]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    return list(chain.from_iterable(output))
def multiprocess_vectorize(iterators, vectorizer=None, pre_processor=None,
                           pre_processor_args=None, fit_flag=False,
                           n_blocks=5, block_size=None, n_jobs=8):
    """Pre-process and vectorize ``iterators`` slice by slice in a pool.

    Fitting (when requested) is done serially before any job is submitted:
    the full input is run through ``pre_processor`` (if given) and the
    result is passed to ``vectorizer.fit``.  The input list is then split
    into ``(start, end)`` intervals by ``compute_intervals`` and every
    slice is submitted through ``apply_async`` as ``serial_vectorize(slice,
    vectorizer, pre_processor, pre_processor_args, False)``.

    Parameters
    ----------
    iterators : iterable
        Input items; fully consumed into a list.
    vectorizer : object
        Must provide ``fit`` when ``fit_flag`` is true.
    pre_processor : callable or None
        Called as ``pre_processor(items, **pre_processor_args)`` or
        ``pre_processor(items)`` for fitting; also forwarded to the jobs.
    pre_processor_args : dict or None
        Keyword arguments for ``pre_processor``.
    fit_flag : bool
        When true, the vectorizer is fitted first.
    n_blocks, block_size : int or None
        Forwarded to ``compute_intervals`` together with the input size.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    data_matrix
        ``vstack(..., format="csr")`` of the per-slice results.
    """
    iterators = list(iterators)
    # fitting happens in a serial fashion
    if fit_flag:
        if pre_processor is None:
            graphs = iterators
        elif pre_processor_args is not None:
            graphs = pre_processor(iterators, **pre_processor_args)
        else:
            graphs = pre_processor(iterators)
        vectorizer.fit(graphs)
    intervals = compute_intervals(
        size=len(iterators), n_blocks=n_blocks, block_size=block_size)
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_vectorize,
                        args=(iterators[start:end],
                              vectorizer,
                              pre_processor,
                              pre_processor_args,
                              False))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    data_matrix = vstack(output, format="csr")
    return data_matrix
def multiprocess_subarray(iterable, vectorizer=None, estimator=None,
                          min_subarray_size=5, max_subarray_size=10,
                          block_size=100, n_jobs=-1):
    """Run ``serial_subarray`` over blocks of ``iterable`` in a pool.

    ``iterable`` is cut into blocks with ``chunks``; each block is submitted
    through ``apply_async`` as ``serial_subarray(seqs, vectorizer,
    estimator, min_subarray_size, max_subarray_size)``.  Progress is
    reported through ``logging.debug``.

    Parameters
    ----------
    iterable : iterable
        Input instances.
    vectorizer, estimator : object
        Forwarded unchanged to ``serial_subarray``.
    min_subarray_size, max_subarray_size : int
        Forwarded unchanged to ``serial_subarray``.
    block_size : int
        Block size handed to ``chunks``.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    subarrays_items : list
        The per-block results concatenated (``+=``) in submission order.
    """
    start_time = time.time()
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_subarray,
                        args=(seqs, vectorizer, estimator,
                              min_subarray_size, max_subarray_size))
            for seqs in chunks(iterable, block_size)]
        logging.debug('Setup %.2f secs', time.time() - start_time)
        logging.debug('Annotating')
        start_time = time.time()
        subarrays_items = []
        for i, p in enumerate(results):
            loc_start_time = time.time()
            subarrays_items += p.get()
            d_time = time.time() - start_time
            d_loc_time = time.time() - loc_start_time
            logging.debug('%d (%.2f secs) (delta: %.2f)',
                          i, d_time, d_loc_time)
    finally:
        pool.close()
        pool.join()
    return subarrays_items
def vertex_transform(self, graphs):
    """Transform a list of networkx graphs into a list of sparse matrices.

    Each matrix has dimension n_nodes x n_features, i.e. each vertex is
    associated to a sparse vector that encodes the neighborhood of the
    vertex up to radius + distance.

    With ``self.n_jobs == 1`` the work is delegated to
    ``self._vertex_transform_serial``.  Otherwise ``graphs`` is cut into
    blocks of ``self.block_size`` with ``chunks`` and each block is
    submitted through ``apply_async`` to ``self._vertex_transform_serial``.

    Parameters
    ----------
    graphs : list[graphs]
        The input list of networkx graphs.

    Returns
    -------
    matrix_list : array-like, shape = [n_samples, [n_nodes, n_features]]
        Vector representation of each vertex in the input graphs.
    """
    if self.n_jobs == 1:
        return self._vertex_transform_serial(graphs)

    if self.n_jobs == -1:
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
    else:
        pool = multiprocessing.Pool(self.n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            # one positional argument: the block of graphs
            apply_async(pool, self._vertex_transform_serial,
                        args=[subset_graphs])
            for subset_graphs in chunks(graphs, self.block_size)
        ]
        matrix_list = []
        for p in results:
            matrix_list += p.get()
    finally:
        pool.close()
        pool.join()
    return matrix_list
def multiprocess_vectorize(iterators, vectorizer=None, pre_processor=None,
                           pre_processor_args=None, fit_flag=False,
                           n_blocks=5, block_size=None, n_jobs=8):
    """Pre-process and vectorize ``iterators`` slice by slice in a pool.

    Fitting (when requested) is done serially before any job is submitted:
    the full input is run through ``pre_processor`` (if given) and the
    result is passed to ``vectorizer.fit``.  The input list is then split
    into ``(start, end)`` intervals by ``compute_intervals`` and every
    slice is submitted through ``apply_async`` as ``serial_vectorize(slice,
    vectorizer, pre_processor, pre_processor_args, False)``.

    Parameters
    ----------
    iterators : iterable
        Input items; fully consumed into a list.
    vectorizer : object
        Must provide ``fit`` when ``fit_flag`` is true.
    pre_processor : callable or None
        Called as ``pre_processor(items, **pre_processor_args)`` or
        ``pre_processor(items)`` for fitting; also forwarded to the jobs.
    pre_processor_args : dict or None
        Keyword arguments for ``pre_processor``.
    fit_flag : bool
        When true, the vectorizer is fitted first.
    n_blocks, block_size : int or None
        Forwarded to ``compute_intervals`` together with the input size.
    n_jobs : int
        Pool size; ``-1`` creates ``mp.Pool()`` without a size argument.

    Returns
    -------
    data_matrix
        ``vstack(..., format="csr")`` of the per-slice results.
    """
    iterators = list(iterators)
    # fitting happens in a serial fashion
    if fit_flag:
        if pre_processor is None:
            graphs = iterators
        elif pre_processor_args is not None:
            graphs = pre_processor(iterators, **pre_processor_args)
        else:
            graphs = pre_processor(iterators)
        vectorizer.fit(graphs)
    intervals = compute_intervals(
        size=len(iterators), n_blocks=n_blocks, block_size=block_size)
    pool = mp.Pool() if n_jobs == -1 else mp.Pool(n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            apply_async(pool, serial_vectorize,
                        args=(iterators[start:end],
                              vectorizer,
                              pre_processor,
                              pre_processor_args,
                              False))
            for start, end in intervals]
        output = [p.get() for p in results]
    finally:
        pool.close()
        pool.join()
    data_matrix = vstack(output, format="csr")
    return data_matrix
def vertex_transform(self, graphs):
    """Transform a list of networkx graphs into a list of sparse matrices.

    Each matrix has dimension n_nodes x n_features, i.e. each vertex is
    associated to a sparse vector that encodes the neighborhood of the
    vertex up to radius + distance.

    With ``self.n_jobs == 1`` the work is delegated to
    ``self._vertex_transform_serial``.  Otherwise ``graphs`` is cut into
    blocks of ``self.block_size`` with ``chunks`` and each block is
    submitted through ``apply_async`` to ``self._vertex_transform_serial``.

    Parameters
    ----------
    graphs : list[graphs]
        The input list of networkx graphs.

    Returns
    -------
    matrix_list : array-like, shape = [n_samples, [n_nodes, n_features]]
        Vector representation of each vertex in the input graphs.
    """
    if self.n_jobs == 1:
        return self._vertex_transform_serial(graphs)

    if self.n_jobs == -1:
        pool = mp.Pool(mp.cpu_count())
    else:
        pool = mp.Pool(self.n_jobs)
    # try/finally: release the pool even when a job raises.
    try:
        results = [
            # one positional argument: the block of graphs
            apply_async(pool, self._vertex_transform_serial,
                        args=[subset_graphs])
            for subset_graphs in chunks(graphs, self.block_size)]
        matrix_list = []
        for p in results:
            matrix_list += p.get()
    finally:
        pool.close()
        pool.join()
    return matrix_list