def multiprocess_performance(pos_iterable, neg_iterable,
                             vectorizer=None,
                             estimator=None,
                             pos_block_size=100,
                             neg_block_size=100,
                             n_jobs=-1):
    """multiprocess_performance."""
    start_time = time.time()
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)

    pos_results = [apply_async(
        pool, serial_pre_process,
        args=(seqs, vectorizer))
        for seqs in chunks(pos_iterable, pos_block_size)]
    neg_results = [apply_async(
        pool, serial_pre_process,
        args=(seqs, vectorizer))
        for seqs in chunks(neg_iterable, neg_block_size)]
    logger.debug('Setup %.2f secs' % (time.time() - start_time))
    logger.debug('Performance evaluation')

    start_time = time.time()
    preds = []
    binary_preds = []
    true_targets = []
    for i, (p, n) in enumerate(izip(pos_results, neg_results)):
        loc_start_time = time.time()
        pos_data_matrix = p.get()
        y = [1] * pos_data_matrix.shape[0]
        neg_data_matrix = n.get()
        y += [-1] * neg_data_matrix.shape[0]
        y = np.array(y)
        true_targets.append(y)
        data_matrix = vstack([pos_data_matrix, neg_data_matrix])
        pred = estimator.decision_function(data_matrix)
        preds.append(pred)
        binary_pred = estimator.predict(data_matrix)
        binary_preds.append(binary_pred)
        d_time = time.time() - start_time
        d_loc_time = time.time() - loc_start_time
        size = pos_data_matrix.shape
        logger.debug('%d %s (%.2f secs) (delta: %.2f)' %
                     (i, size, d_time, d_loc_time))

    pool.close()
    pool.join()
    preds = np.hstack(preds)
    binary_preds = np.hstack(binary_preds)
    true_targets = np.hstack(true_targets)
    return preds, binary_preds, true_targets
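
Every example on this page leans on the same scaffolding: a module-level `logger`, the `time`/`mp`/`np` imports, scipy's sparse `vstack`, and the EDeN helpers `apply_async` and `chunks`. A minimal sketch of that scaffolding, assuming the simplest possible implementations rather than the exact EDeN ones (the real `apply_async`, for instance, may serialize the callable differently):

import time
import logging
import multiprocessing as mp

import numpy as np
from itertools import izip  # Python 2; on Python 3 use the builtin zip
from scipy.sparse import vstack

logger = logging.getLogger(__name__)


def apply_async(pool, fun, args):
    # submit fun(*args) to the pool and return the AsyncResult
    return pool.apply_async(fun, args)


def chunks(iterable, n):
    # yield successive lists of up to n items from iterable
    block = []
    for item in iterable:
        block.append(item)
        if len(block) == n:
            yield block
            block = []
    if block:
        yield block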
def multiprocess_vectorize(iterable,
                           vectorizer=None,
                           pos_block_size=100,
                           n_jobs=-1):
    """multiprocess_vectorize."""
    start_time = time.time()
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)

    results = [apply_async(
        pool, serial_pre_process,
        args=(seqs, vectorizer))
        for seqs in chunks(iterable, pos_block_size)]
    logger.debug('Setup %.2f secs' % (time.time() - start_time))
    logger.debug('Vectorizing')

    start_time = time.time()
    matrices = []
    for i, p in enumerate(results):
        loc_start_time = time.time()
        pos_data_matrix = p.get()
        # append the whole block; using += would iterate the sparse matrix
        # and extend the list with individual rows
        matrices.append(pos_data_matrix)
        d_time = time.time() - start_time
        d_loc_time = time.time() - loc_start_time
        size = pos_data_matrix.shape
        logger.debug('%d %s (%.2f secs) (delta: %.2f)' %
                     (i, size, d_time, d_loc_time))

    pool.close()
    pool.join()
    data_matrix = vstack(matrices)
    return data_matrix
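
The worker that every `apply_async` call above dispatches, `serial_pre_process`, is not shown on this page, and its signature varies between project versions (Examples #5 and #11 pass a `pre_processor` instead of a `vectorizer`). A plausible minimal version for the vectorizer-based calls, assuming a scikit-learn-style `transform`:

def serial_pre_process(iterable, vectorizer=None):
    # runs inside a worker process: vectorize one block of sequences
    data_matrix = vectorizer.transform(iterable)
    return data_matrix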
Example #4
def multiprocess_score(iterable,
                       vectorizer=None,
                       estimator=None,
                       block_size=100,
                       n_jobs=-1):
    """multiprocess_score."""
    start_time = time.time()
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)

    results = [apply_async(
        pool, serial_score,
        args=(seqs,
              vectorizer,
              estimator))
        for seqs in chunks(iterable, block_size)]
    logger.debug('Setup %.2f secs' % (time.time() - start_time))
    logger.debug('Predicting')

    start_time = time.time()
    scores_items = []
    for i, p in enumerate(results):
        loc_start_time = time.time()
        scores = p.get()
        scores_items += scores
        d_time = time.time() - start_time
        d_loc_time = time.time() - loc_start_time
        logger.debug('%d (%.2f secs) (delta: %.2f)' %
                     (i, d_time, d_loc_time))

    pool.close()
    pool.join()
    return scores_items
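
`serial_score` is likewise assumed rather than shown. Whatever it does internally, each `p.get()` must return a list so that `scores_items += scores` concatenates blocks; a hedged sketch consistent with that:

def serial_score(iterable, vectorizer=None, estimator=None):
    # hypothetical worker: vectorize one block and score each item
    data_matrix = vectorizer.transform(iterable)
    return list(estimator.decision_function(data_matrix))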
Example #5
def multiprocess_pre_process(iterable,
                             pre_processor=None,
                             pre_processor_args=None,
                             n_blocks=5,
                             block_size=None,
                             n_jobs=8):
    """multiprocess_pre_process."""
    iterable = list(iterable)
    size = len(iterable)
    intervals = compute_intervals(
        size=size, n_blocks=n_blocks, block_size=block_size)
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)
    results = [apply_async(pool, serial_pre_process,
                           args=(iterable[start:end],
                                 pre_processor,
                                 pre_processor_args))
               for start, end in intervals]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    return_list = []
    for items in output:
        for item in items:
            return_list.append(item)
    return return_list
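
`compute_intervals` splits `size` items into `(start, end)` index pairs, either as roughly `n_blocks` equal blocks or as fixed-size blocks when `block_size` is given. A minimal sketch under that assumption (not the exact EDeN implementation):

def compute_intervals(size, n_blocks=None, block_size=None):
    # an explicit block_size wins; otherwise split into n_blocks
    # near-equal parts, rounding the block size up
    if block_size is None:
        block_size = size // n_blocks + (1 if size % n_blocks else 0)
    return [(start, min(start + block_size, size))
            for start in range(0, size, block_size)]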
Example #8
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance', estimator=None, multi_process=False, invert_score=False):
    '''
    Graph annotation is slow; we don't want to do it twice in fit and predict.
    '''
    #  1st check if already annotated
    if inputs[0].graph.get('mass_annotate_mp_was_here', False):
        return inputs

    if not multi_process:
        inputs = filter(lambda v: v is not None, inputs)
        res = list(vectorizer.annotate(inputs, estimator=estimator))
        #if invert_score:
        #    def f(n,d): d['importance'] = -d['importance']
        #    res=utils.map_node_operation(res,f)

        res[0].graph['mass_annotate_mp_was_here'] = True
        return res
    else:
        pool = mp.Pool()
        mpres = [eden.apply_async(pool, mass_annotate_mp, args=(graphs, vectorizer, score_attribute, estimator)) for
                 graphs in eden.grouper(inputs, 50)]
        result = []
        for res in mpres:
            result += res.get()
        pool.close()
        pool.join()
        return result
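
A usage sketch with hypothetical names: note that the recursive call in the `else` branch leaves `multi_process` at its default `False`, so each worker annotates its block of 50 graphs serially and only one pool is ever created.

annotated_graphs = mass_annotate_mp(graphs, vectorizer,
                                    estimator=estimator,
                                    multi_process=True)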
Example #9
File: __init__.py Project: teresa-m/EDeN
def multiprocess_vectorize(graphs,
                           vectorizer=None,
                           fit_flag=False,
                           n_blocks=5,
                           block_size=None,
                           n_jobs=8):
    graphs = list(graphs)
    # fitting happens in a serial fashion
    if fit_flag:
        vectorizer.fit(graphs)
    import multiprocessing as mp
    size = len(graphs)
    intervals = compute_intervals(size=size,
                                  n_blocks=n_blocks,
                                  block_size=block_size)
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)
    results = [
        apply_async(pool,
                    serial_vectorize,
                    args=(graphs[start:end], vectorizer, False))
        for start, end in intervals
    ]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    data_matrix = vstack(output, format="csr")
    return data_matrix
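
The `serial_vectorize` worker assumed here takes `(graphs, vectorizer, fit_flag)`; other versions on this page take different arguments (compare Examples #18 and #21). A minimal sketch:

def serial_vectorize(graphs, vectorizer, fit_flag):
    # fitting already happened serially in the caller, so fit_flag
    # arrives as False here
    if fit_flag:
        vectorizer.fit(graphs)
    return vectorizer.transform(graphs)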
Example #10
def multiprocess_subarray(
    iterable, vectorizer=None, estimator=None, min_subarray_size=5, max_subarray_size=10, block_size=100, n_jobs=-1
):
    """multiprocess_subarray."""
    start_time = time.time()
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)

    results = [
        apply_async(pool, serial_subarray, args=(seqs, vectorizer, estimator, min_subarray_size, max_subarray_size))
        for seqs in chunks(iterable, block_size)
    ]
    logger.debug("Setup %.2f secs" % (time.time() - start_time))
    logger.debug("Annotating")

    start_time = time.time()
    subarrays_items = []
    for i, p in enumerate(results):
        loc_start_time = time.time()
        subarrays_item = p.get()
        subarrays_items += subarrays_item
        d_time = time.time() - start_time
        d_loc_time = time.time() - loc_start_time
        logger.debug("%d (%.2f secs) (delta: %.2f)" % (i, d_time, d_loc_time))

    pool.close()
    pool.join()
    return subarrays_items
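
A usage sketch with hypothetical names, assuming `serial_subarray` returns one list per block; the function concatenates those lists in submission order:

subarrays = multiprocess_subarray(seqs,
                                  vectorizer=vectorizer,
                                  estimator=estimator,
                                  min_subarray_size=5,
                                  max_subarray_size=10)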
Example #11
File: __init__.py Project: teresa-m/EDeN
def multiprocess_pre_process(iterable,
                             pre_processor=None,
                             pre_processor_args=None,
                             n_blocks=5,
                             block_size=None,
                             n_jobs=8):
    iterable = list(iterable)
    import multiprocessing as mp
    size = len(iterable)
    intervals = compute_intervals(
        size=size, n_blocks=n_blocks, block_size=block_size)
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)
    results = [apply_async(pool, serial_pre_process,
                           args=(iterable[start:end], pre_processor, pre_processor_args))
               for start, end in intervals]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    return_list = []
    for items in output:
        for item in items:
            return_list.append(item)
    return return_list
Example #12
def multiprocess_fit(pos_iterable, neg_iterable,
                     vectorizer=None,
                     estimator=None,
                     pos_block_size=100,
                     neg_block_size=100,
                     n_jobs=-1):
    """multiprocess_fit."""
    start_time = time.time()
    classes = np.array([1, -1])
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)

    pos_results = [apply_async(
        pool, serial_pre_process,
        args=(seqs, vectorizer))
        for seqs in chunks(pos_iterable, pos_block_size)]
    neg_results = [apply_async(
        pool, serial_pre_process,
        args=(seqs, vectorizer))
        for seqs in chunks(neg_iterable, neg_block_size)]
    logger.debug('Setup %.2f secs' % (time.time() - start_time))
    logger.debug('Fitting')

    start_time = time.time()
    for i, (p, n) in enumerate(izip(pos_results, neg_results)):
        loc_start_time = time.time()
        pos_data_matrix = p.get()
        y = [1] * pos_data_matrix.shape[0]
        neg_data_matrix = n.get()
        y += [-1] * neg_data_matrix.shape[0]
        y = np.array(y)
        data_matrix = vstack([pos_data_matrix, neg_data_matrix])
        estimator.partial_fit(data_matrix, y, classes=classes)
        d_time = time.time() - start_time
        d_loc_time = time.time() - loc_start_time
        size = pos_data_matrix.shape
        logger.debug('%d %s (%.2f secs) (delta: %.2f)' %
                     (i, size, d_time, d_loc_time))

    pool.close()
    pool.join()

    return estimator
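
Because each block is fed to `estimator.partial_fit`, the estimator must support out-of-core learning. A usage sketch with scikit-learn's `SGDClassifier`, which does (sequence names are hypothetical):

from sklearn.linear_model import SGDClassifier

estimator = SGDClassifier(average=True, shuffle=True)
estimator = multiprocess_fit(pos_seqs, neg_seqs,
                             vectorizer=vectorizer,
                             estimator=estimator,
                             pos_block_size=100,
                             neg_block_size=100)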
Example #14
File: motif.py Project: bgruening/EDeN
    def _multiprocess_graph_motif(self, seqs):
        size = len(seqs)
        intervals = compute_intervals(size=size, n_blocks=self.n_blocks, block_size=self.block_size)
        if self.n_jobs == -1:
            pool = mp.Pool()
        else:
            pool = mp.Pool(processes=self.n_jobs)
        results = [apply_async(pool, self._serial_graph_motif, args=(seqs[start:end], True)) for start, end in intervals]
        output = [p.get() for p in results]
        # release the workers; the original snippet never closed the pool
        pool.close()
        pool.join()
        return list(chain(*output))
Example #15
File: graph.py Project: smautner/EDeN
    def transform(self, graphs):
        """Transform a list of networkx graphs into a sparse matrix.

        Parameters
        ----------
        graphs : list[graphs]
            The input list of networkx graphs.

        Returns
        -------
        data_matrix : array-like, shape = [n_samples, n_features]
            Vector representation of input graphs.

        >>> # transforming the same graph (with different node-ids).
        >>> import networkx as nx
        >>> def get_path_graph(length=4):
        ...     g = nx.path_graph(length)
        ...     for n,d in g.nodes(data=True):
        ...         d['label'] = 'C'
        ...     for a,b,d in g.edges(data=True):
        ...         d['label'] = '1'
        ...     return g
        >>> g = get_path_graph(4)
        >>> g2 = get_path_graph(5)
        >>> g2.remove_node(0)
        >>> g[1][2]['label']='2'
        >>> g2[2][3]['label']='2'
        >>> v = Vectorizer()
        >>> def vec_to_hash(vec):
        ...     return hash(tuple(vec.data + vec.indices))
        >>> vec_to_hash(v.transform([g])) == vec_to_hash(v.transform([g2]))
        True
        """
        if self.n_jobs == 1:
            return self._transform_serial(graphs)

        if self.n_jobs == -1:
            pool = mp.Pool(mp.cpu_count())
        else:
            pool = mp.Pool(self.n_jobs)

        results = [apply_async(
            pool, self._transform_serial,
            args=([subset_graphs]))
            for subset_graphs in chunks(graphs, self.block_size)]
        for i, p in enumerate(results):
            pos_data_matrix = p.get()
            if i == 0:
                data_matrix = pos_data_matrix
            else:
                data_matrix = vstack([data_matrix, pos_data_matrix])
        pool.close()
        pool.join()
        return data_matrix
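
One design note on the loop above: stacking incrementally copies the accumulated matrix on every iteration, which is quadratic in the number of blocks. Collecting the blocks first and stacking once is linear; a sketch of a drop-in replacement for the loop:

matrices = [p.get() for p in results]  # preserves submission order
data_matrix = vstack(matrices)         # one copy instead of one per block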
Example #16
File: graph.py Project: latticetower/EDeN
    def transform(self, graphs):
        """Transform a list of networkx graphs into a sparse matrix.

        Parameters
        ----------
        graphs : list[graphs]
            The input list of networkx graphs.

        Returns
        -------
        data_matrix : array-like, shape = [n_samples, n_features]
            Vector representation of input graphs.

        >>> # transforming the same graph (with different node-ids).
        >>> import networkx as nx
        >>> def get_path_graph(length=4):
        ...     g = nx.path_graph(length)
        ...     for n,d in g.nodes(data=True):
        ...         d['label'] = 'C'
        ...     for a,b,d in g.edges(data=True):
        ...         d['label'] = '1'
        ...     return g
        >>> g = get_path_graph(4)
        >>> g2 = get_path_graph(5)
        >>> g2.remove_node(0)
        >>> g[1][2]['label']='2'
        >>> g2[2][3]['label']='2'
        >>> v = Vectorizer()
        >>> def vec_to_hash(vec):
        ...     return hash(tuple(vec.data + vec.indices))
        >>> vec_to_hash(v.transform([g])) == vec_to_hash(v.transform([g2]))
        True
        """
        if self.n_jobs == 1:
            return self._transform_serial(graphs)

        if self.n_jobs == -1:
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
        else:
            pool = multiprocessing.Pool(self.n_jobs)

        results = [
            apply_async(pool, self._transform_serial, args=([subset_graphs]))
            for subset_graphs in chunks(graphs, self.block_size)
        ]
        for i, p in enumerate(results):
            pos_data_matrix = p.get()
            if i == 0:
                data_matrix = pos_data_matrix
            else:
                data_matrix = vstack([data_matrix, pos_data_matrix])
        pool.close()
        pool.join()
        return data_matrix
Example #17
    def _optimize_parallel(self, reference_graphs):
        """optimize_parallel."""
        pool = multiprocessing.Pool()
        res = [
            apply_async(pool, self._optimize_single, args=(g, ))
            for g in reference_graphs
        ]
        pareto_set_graphs_list = [p.get() for p in res]
        pool.close()
        pool.join()
        return pareto_set_graphs_list
Example #18
File: __init__.py Project: bgruening/EDeN
def multiprocess_vectorize(graphs, vectorizer=None, n_blocks=5, block_size=None, n_jobs=8):
    graphs = list(graphs)
    import multiprocessing as mp
    size = len(graphs)
    intervals = compute_intervals(size=size, n_blocks=n_blocks, block_size=block_size)
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)
    results = [apply_async(pool, serial_vectorize, args=(graphs[start:end], vectorizer)) for start, end in intervals]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    X = vstack(output, format="csr")
    return X
Example #19
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance', estimator=None, multi_process=False, annotate_dilude_scores=False):
    '''
    Graph annotation is slow; we don't want to do it twice in fit and predict.
    '''

    #  1st check if already annotated
    #if inputs[0].graph.get('mass_annotate_mp_was_here', False):
    #    return inputs

    if not multi_process:
        inputs = filter(lambda v: v is not None, inputs)


        res = list(vectorizer.annotate(inputs, estimator=estimator))

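        # dilute_graph smooths each node's score toward its neighbourhood:
        # with k neighbours, allfacs holds the k neighbour scores plus k
        # copies of the node's own score, so the mean works out to
        # (own_score + mean(neighbour_scores)) / 2.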
        def dilute_graph(graph):
            for n, d in graph.nodes(data=True):
                neighsum = [graph.node[other][score_attribute][0] for other in graph.neighbors(n)]
                if neighsum != []:
                    allfacs = neighsum + [graph.node[n][score_attribute][0]] * len(neighsum)
                    score = sum(allfacs) / float(len(allfacs))
                else:
                    score = d[score_attribute][0]
                d['tmpscore'] = score

            for n, d in graph.nodes(data=True):
                d[score_attribute] = [d['tmpscore'], 0]
                # self.attribute =  lambda x: x['tmpscore']
        if annotate_dilude_scores:
            map(dilute_graph, res)

        #if invert_score:
        #    def f(n,d): d['importance'] = -d['importance']
        #    res=utils.map_node_operation(res,f)

        res[0].graph['mass_annotate_mp_was_here'] = True
        return res
    else:
        pool = mp.Pool()
        mpres = [eden.apply_async(pool, mass_annotate_mp, args=(graphs, vectorizer, score_attribute, estimator)) for
                 graphs in eden.grouper(inputs, 50)]
        result = []
        for res in mpres:
            result += res.get()
        pool.close()
        pool.join()
        return result
Example #20
    def _multiprocess_graph_motif(self, seqs):
        size = len(seqs)
        intervals = compute_intervals(size=size,
                                      n_blocks=self.n_blocks,
                                      block_size=self.block_size)
        if self.n_jobs == -1:
            pool = mp.Pool()
        else:
            pool = mp.Pool(processes=self.n_jobs)
        results = [
            apply_async(pool,
                        self._serial_graph_motif,
                        args=(seqs[start:end], True))
            for start, end in intervals
        ]
        output = [p.get() for p in results]
        # release the workers; the original snippet never closed the pool
        pool.close()
        pool.join()
        return list(chain(*output))
Example #21
def multiprocess_vectorize(iterators,
                           vectorizer=None,
                           pre_processor=None,
                           pre_processor_args=None,
                           fit_flag=False,
                           n_blocks=5,
                           block_size=None,
                           n_jobs=8):
    """multiprocess_vectorize."""
    iterators = list(iterators)
    # fitting happens in a serial fashion
    if fit_flag:
        if pre_processor is not None:
            if pre_processor_args is not None:
                graphs = pre_processor(iterators, **pre_processor_args)
            else:
                graphs = pre_processor(iterators)
        else:
            graphs = iterators
        vectorizer.fit(graphs)
    size = len(iterators)
    intervals = compute_intervals(size=size,
                                  n_blocks=n_blocks,
                                  block_size=block_size)
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)
    results = [apply_async(pool, serial_vectorize,
                           args=(iterators[start:end],
                                 vectorizer,
                                 pre_processor,
                                 pre_processor_args,
                                 False))
               for start, end in intervals]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    data_matrix = vstack(output, format="csr")
    return data_matrix
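
A usage sketch with hypothetical names (`raw_items`, `sequence_to_graph`): raw items go in, the `pre_processor` turns each block into graphs inside the workers, and the blocks come back stacked as a single CSR matrix:

data_matrix = multiprocess_vectorize(raw_items,
                                     vectorizer=vectorizer,
                                     pre_processor=sequence_to_graph,
                                     pre_processor_args=None,
                                     fit_flag=False,
                                     n_jobs=8)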
Example #22
def multiprocess_subarray(iterable,
                          vectorizer=None,
                          estimator=None,
                          min_subarray_size=5,
                          max_subarray_size=10,
                          block_size=100,
                          n_jobs=-1):
    """multiprocess_subarray."""
    start_time = time.time()
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)

    results = [apply_async(
        pool, serial_subarray,
        args=(seqs,
              vectorizer,
              estimator,
              min_subarray_size,
              max_subarray_size))
        for seqs in chunks(iterable, block_size)]
    logging.debug('Setup %.2f secs' % (time.time() - start_time))
    logging.debug('Annotating')

    start_time = time.time()
    subarrays_items = []
    for i, p in enumerate(results):
        loc_start_time = time.time()
        subarrays_item = p.get()
        subarrays_items += subarrays_item
        d_time = time.time() - start_time
        d_loc_time = time.time() - loc_start_time
        logging.debug('%d (%.2f secs) (delta: %.2f)' %
                      (i, d_time, d_loc_time))

    pool.close()
    pool.join()
    return subarrays_items
Example #23
File: graph.py Project: latticetower/EDeN
    def vertex_transform(self, graphs):
        """Transform a list of networkx graphs into a list of sparse matrices.

        Each matrix has dimension n_nodes x n_features, i.e. each vertex is
        associated to a sparse vector that encodes the neighborhood of the
        vertex up to radius + distance.

        Parameters
        ----------
        graphs : list[graphs]
            The input list of networkx graphs.

        Returns
        -------
        matrix_list : array-like, shape = [n_samples, [n_nodes, n_features]]
            Vector representation of each vertex in the input graphs.

        """
        if self.n_jobs == 1:
            return self._vertex_transform_serial(graphs)

        if self.n_jobs == -1:
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
        else:
            pool = multiprocessing.Pool(self.n_jobs)

        results = [
            apply_async(pool,
                        self._vertex_transform_serial,
                        args=([subset_graphs]))
            for subset_graphs in chunks(graphs, self.block_size)
        ]
        matrix_list = []
        for i, p in enumerate(results):
            matrix_list += p.get()
        pool.close()
        pool.join()
        return matrix_list
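
Unlike `transform`, which returns one row per input graph, `vertex_transform` returns one matrix per graph with one row per node. A usage sketch with hypothetical names:

node_matrices = vectorizer.vertex_transform(graphs)
# len(node_matrices) == len(graphs)
# node_matrices[i].shape == (graphs[i].number_of_nodes(), n_features)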
Example #24
File: __init__.py Project: smautner/EDeN
def multiprocess_vectorize(iterators,
                           vectorizer=None,
                           pre_processor=None,
                           pre_processor_args=None,
                           fit_flag=False,
                           n_blocks=5,
                           block_size=None,
                           n_jobs=8):
    iterators = list(iterators)
    # fitting happens in a serial fashion
    if fit_flag:
        if pre_processor is not None:
            if pre_processor_args is not None:
                graphs = pre_processor(iterators, **pre_processor_args)
            else:
                graphs = pre_processor(iterators)
        else:
            graphs = iterators
        vectorizer.fit(graphs)
    size = len(iterators)
    intervals = compute_intervals(size=size, n_blocks=n_blocks, block_size=block_size)
    if n_jobs == -1:
        pool = mp.Pool()
    else:
        pool = mp.Pool(n_jobs)
    results = [apply_async(pool, serial_vectorize,
                           args=(iterators[start:end],
                                 vectorizer,
                                 pre_processor,
                                 pre_processor_args,
                                 False))
               for start, end in intervals]
    output = [p.get() for p in results]
    pool.close()
    pool.join()
    data_matrix = vstack(output, format="csr")
    return data_matrix
Example #25
File: graph.py Project: smautner/EDeN
    def vertex_transform(self, graphs):
        """Transform a list of networkx graphs into a list of sparse matrices.

        Each matrix has dimension n_nodes x n_features, i.e. each vertex is
        associated to a sparse vector that encodes the neighborhood of the
        vertex up to radius + distance.

        Parameters
        ----------
        graphs : list[graphs]
            The input list of networkx graphs.

        Returns
        -------
        matrix_list : array-like, shape = [n_samples, [n_nodes, n_features]]
            Vector representation of each vertex in the input graphs.

        """
        if self.n_jobs == 1:
            return self._vertex_transform_serial(graphs)

        if self.n_jobs == -1:
            pool = mp.Pool(mp.cpu_count())
        else:
            pool = mp.Pool(self.n_jobs)

        results = [apply_async(
            pool, self._vertex_transform_serial,
            args=([subset_graphs]))
            for subset_graphs in chunks(graphs, self.block_size)]
        matrix_list = []
        for i, p in enumerate(results):
            matrix_list += p.get()
        pool.close()
        pool.join()
        return matrix_list