Example #1
def call_cv_train_parallel(train_func, args_iterator=None):
    if args_iterator is None:
        args_iterator = get_kfold_ids()
    from multiprocessing import Pool
    pool = Pool(get_core_count())
    retval = unlist_dataframe(pool.starmap(train_func, args_iterator))
    pool.terminate()
    return retval
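Most of the examples on this page share one shape: build an iterable of argument tuples, hand it to Pool.starmap, and collect the results, which come back in input order. Below is a minimal, self-contained sketch of that pattern; the worker scale and its data are hypothetical, and the pool is used as a context manager so it is always cleaned up.

from multiprocessing import Pool


def scale(value, factor):
    # Hypothetical worker: starmap unpacks each tuple into (value, factor).
    return value * factor


if __name__ == "__main__":
    args = [(1, 10), (2, 10), (3, 10)]
    # starmap blocks until every task finishes and preserves input order;
    # the context manager terminates the pool on exit.
    with Pool(processes=4) as pool:
        results = pool.starmap(scale, args)
    print(results)  # [10, 20, 30]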
Example #2
 def run_all_games(self, multi_thread=True):
     if multi_thread:
         pool = Pool(3)
         games = pool.starmap(self.run_game, [(Bandit(self._n_arms), \
         self._agent_cls(self._n_arms, **self._cls_args), self._n_plays) for i in range(0, self._n_games)])
         self._games = games
     else:
         games = []
         for i in range(0, self._n_games):
             games.append(self.run_game(Bandit(self._n_arms), self._agent_cls(self._n_arms, **self._cls_args), self._n_plays))
         self._games = games
Example #3
def process_image_list(l):
    db.close_old_connections()

    pool = Pool()
    pool.starmap(migrate_image_resize, l)
    pool.close()
    pool.join()
Example #4
def main():
    pool = Pool()
    a_args = [1,2,3]
    second_arg = 1
    argzip=zip(a_args, itertools.repeat(second_arg))
    #pool.map(func, argzip)
    pool.starmap(func, [(1,2),(2,3)])
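Example #4 builds argzip with itertools.repeat but then passes a hard-coded list to starmap. A small sketch of the repeat/zip idiom actually driving starmap is shown below; the worker func is hypothetical.

import itertools
from multiprocessing import Pool


def func(a, b):
    # Hypothetical worker taking one varying and one constant argument.
    return a + b


if __name__ == "__main__":
    a_args = [1, 2, 3]
    second_arg = 1
    # zip pairs each element of a_args with the repeated constant,
    # producing (1, 1), (2, 1), (3, 1) for starmap to unpack.
    with Pool() as pool:
        results = pool.starmap(func, zip(a_args, itertools.repeat(second_arg)))
    print(results)  # [2, 3, 4]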
Example #5
def call_func_parallel(func, args_iterator, workers=-1):
    from multiprocessing import Pool
    if workers == -1:
        workers = get_core_count()
    pool = Pool(workers)
    pool.starmap(func, args_iterator)
    pool.terminate()
Example #6
def run_sub_folders2(MotherFolder, DaisyFileName, DaisyExecutabl, NumberOfProcesses=6, recursive=False):
    """
    Runs all the Daisy simulations found below the MotherFolder
    """
    pp = Pool(NumberOfProcesses)
    input=[]
    for i in range(NumberOfProcesses):
        input.append( (MotherFolder, DaisyFileName, DaisyExecutabl, i, recursive, None)  )
    pp.starmap(run_single2, input)
    pp.terminate()
Example #7
def main():
    num_process = 4
    pool = Pool(num_process)

    num_post = post_detail_infos.find({'segmented':{'$ne':1}}).count()
    print('Number to segment: ' + str(num_post))

    batch_size = int(num_post/num_process)
    print('Batch size: ' + str(batch_size))
    args = [(idx*batch_size, (idx+1)*batch_size) for idx in range(num_process)]
    pool.starmap(segmentation, args)
    pool.close()
    pool.join()
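Examples #7 and #13 split the work into index ranges of size int(num_post/num_process), so up to num_process - 1 trailing items are never assigned. Below is a sketch of range batching that also covers the remainder; process_range is a hypothetical worker.

from multiprocessing import Pool


def process_range(start, end):
    # Hypothetical worker handling items with indices in [start, end).
    return list(range(start, end))


if __name__ == "__main__":
    total, workers = 10, 4
    base, extra = divmod(total, workers)
    ranges, start = [], 0
    for i in range(workers):
        # The first `extra` batches take one extra item, so all `total`
        # items are covered with no remainder dropped.
        end = start + base + (1 if i < extra else 0)
        ranges.append((start, end))
        start = end
    with Pool(workers) as pool:
        batches = pool.starmap(process_range, ranges)
    print(batches)  # [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]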
Example #8
def repeat_databases(source_path, database_count, exploration_count=0, exploitation_count=0, random_count=0, processes=7,
                     add_new=False, exploit_method='furthest', record_errors=True, start_number=0):
    # Check source path
    if not os.path.isdir(source_path):
        print('Unable to find source:' + source_path)
        return

    # Generate threading lists
    if add_new:
        items = os.listdir(source_path)
        databases = []
        for item in items:
            try:
                databases.append(int(item))
            except ValueError:
                continue
        if len(databases) != 0:
            start_number = max(databases) + 1
    end_number = start_number + database_count
    paths = PathNaming(os.name, database_path=source_path)
    database_paths = [source_path + str(i) + paths.slash for i in range(start_number, end_number)]
    explorations = [exploration_count for i in range(database_count)]
    exploitations = [exploitation_count for i in range(database_count)]
    randoms = [random_count for i in range(database_count)]
    exploit_method = [exploit_method for i in range(database_count)]
    record_errors = [record_errors for i in range(database_count)]

    # Make a new folder for each database and place the input files in it
    for i in range(database_count):
        if not os.path.isdir(database_paths[i]):
            os.mkdir(database_paths[i])
            shutil.copy(source_path + paths.base_input, database_paths[i])
            shutil.copy(source_path + paths.dbase_input, database_paths[i])

    # If there's existing libraries to begin with, copy them as well
    if os.path.isdir(source_path + paths.slash + paths.FR_Input_folder + paths.slash):
        source_dir = source_path + paths.slash + paths.FR_Input_folder + paths.slash
        for i in range(database_count):
            copy_tree(source_dir, database_paths[i] + paths.slash + paths.FR_Input_folder)
    if os.path.isdir(source_path + paths.slash + paths.FR_Output_folder + paths.slash):
        source_dir = source_path + paths.slash + paths.FR_Output_folder + paths.slash
        for i in range(database_count):
            copy_tree(source_dir, database_paths[i] + paths.slash + paths.FR_Output_folder)

    # Run databases
    pool = Pool(processes=processes)
    pool.starmap(database_thread, zip(database_paths, explorations, exploitations, randoms, exploit_method,
                                      record_errors))

    return
Example #9
def multi_main(proc, FILENAME, FUN, **kargs):
    pool = Pool(proc)
    multiargs = []

    # For FPGrowth_LERS
    if FUN == updateConfidenceSupport :
        min_sup_range = kargs['min_sup_range']
        for iter1, iter2, min_sup in product(range(1,2), range(1,11), min_sup_range):
            multiargs.append((FILENAME, iter1, iter2, min_sup))
        print(multiargs)
        pool.starmap(FUN, multiargs)

    else :
        print("I dont' know the function.")        
Example #10
    def write_cost_matrix(self):
        begin = timeit.default_timer()

        pool = Pool(processes=cpu_count())
        iterable = []
        for i in range(self.n):
            for j in range(i+1,self.n):
                iterable.append((i,j))
        pool.starmap(self.set_total_costs_matrix, iterable)

        self.total_costs_matrix.dump(os.getcwd()+'/memoria/outputs/cost_matrix')

        end = timeit.default_timer()
        print(end - begin)
Example #11
    def pool_decode(self, data, callback):
        """
        Decode mz and i values in parallel.

        Args:
            data (): ...

        Keyword Args:
            callback (:obj:`func`): Callback function to call if decoding is
                finished. Should be :py:meth:`~pymzml.spec.Spectrum._register`.
        """
        ZE_POOL = Pool(processes=2)

        ZE_POOL.starmap(_decode, data)
Example #12
def parse_wsj(processes=8):
    ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ portions
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')

    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))

    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))
Example #13
def main():
    # user_statistic(0, 50)

    num_process = 4
    pool = Pool(num_process)

    num_post = post_detail_infos.find({'crawled_user_info':{'$ne':1}}).count()
    print('Number to get user infos: ' + str(num_post))

    batch_size = int(num_post/num_process)
    print('Batch size: ' + str(batch_size))
    args = [(idx*batch_size, (idx+1)*batch_size) for idx in range(num_process)]
    pool.starmap(user_statistic, args)
    pool.close()
    pool.join()
Example #14
def main(processes=1):
    ptb = Constants().ptb
    fileids = list(ptb.fileids())

    params = []
    for fileid in fileids[:10]:
        for sent_num, parse_tree in enumerate(ptb.parsed_sents(fileid)):
            params.append((fileid, sent_num, parse_tree))

    if processes > 1:
        p = Pool(processes)
        p.starmap(score, sorted(params, key=lambda x: (x[0], x[1])))
    else:
        for param in params:
            score(*param)
Example #15
def main():
    if len(sys.argv) != 4:
        print("Usage: python3 tweetTokenize.py <tweets_folder> <dest_folder> <num_process>")
        sys.exit(-1)
    tweets_folder = sys.argv[1]
    dest_folder = sys.argv[2]
    num_process = int(sys.argv[3])
    tweets_filenames = glob.glob(os.path.join(tweets_folder, '*'))
    tweets_filenames = [(f, dest_folder) for f in tweets_filenames]
    if num_process == 1:
        for f, dest_folder in tweets_filenames:
            tokenize_tweets(f, dest_folder)
    else:
        pool = Pool(num_process)
        pool.starmap(tokenize_tweets, tweets_filenames)
Example #16
File: ctag.py Project: stg7/ctag
def ctag(inputfiles, output_file, remove_stop_words, language, min_freq, min_len, pdf_output, debug, cpu_count):
    startTime = time.time()
    lInfo("process {} files".format(len(inputfiles)))

    pool = Pool(processes=cpu_count)
    params = [(inputfile, remove_stop_words, language, min_len) for inputfile in inputfiles]
    results = pool.starmap(build_word_histogram, params)

    global_histogram = {}
    for histogram in results:
        global_histogram = add_dict(global_histogram, histogram)

    # filter out words with a frequency that is not >= min_freq
    global_histogram = {t: global_histogram[t] for t in global_histogram if global_histogram[t] >= min_freq}

    if debug:
        histogram_file = output_file.replace(os.path.splitext(output_file)[1], ".debug.json")
        lInfo("for debugging write out intermediate histogram to: {}".format(histogram_file))
        with open(histogram_file, "w") as hist:
            hist.write(json.dumps(global_histogram, indent=4, sort_keys=True))

    with open(output_file, "w") as outfile:
        outfile.write(svg_cloud(global_histogram))
    if pdf_output:
        pdf_file_name = output_file.replace(os.path.splitext(output_file)[1], ".pdf")
        lInfo("create pdf graphic: {}".format(pdf_file_name))
        if shutil.which("inkscape") is None:
            lError("inkscape is not installed, therefore no pdf export is available.")
        else:
            os.system("""inkscape --without-gui --export-pdf="{pdffile}" {svgfile}""".format(svgfile=output_file, pdffile=pdf_file_name))
    lInfo("done: {} s".format(time.time() - startTime))
Example #17
def gekko_search(**parameters):

    parallel = settings['parallel']
    num_rounds = settings['num_rounds']

    # remake CS & HS variability;
    candleSize= settings['candleSize']
    historySize= settings['historySize']

    if parallel:
        p = Pool(mp.cpu_count())
        param_list = list([(Strategy, parameters),] * num_rounds)
        scores = p.starmap(Evaluate, param_list)
        p.close()
        p.join()
    else:
        scores = [Evaluate(Strategy, parameters) for n in range(num_rounds)]

    series = pd.Series(scores)
    mean = series.mean()
    stats.append([series.count(), mean, series.std(), series.min()] +
         [series.quantile(x) for x in percentiles] + [series.max()])
    all_val.append(mean)
    write_evolution_logs(len(all_val), stats[-1])
    return mean
Example #18
def gene_lcs(genomes,base_dir):
    #create substring folder, check if substrings have previously been calculated
    if not os.path.exists(base_dir+'substrings/'):
        os.makedirs(base_dir+'substrings/')
    os.chdir(base_dir+'substrings/')

    #import previous substring file if it exists
    substring_file = glob.glob('*.csv')
    orgstring = []
    if len(substring_file) == 1:
        with open('substrings.csv', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in reader:
                orgstring.append(row[0])

    if len(orgstring) == len(genomes):
        print('Organism substrings already calculated')
    else:
        print('Finding common substrings...')
        pool = ThreadPool(len(genomes))
        orgstring = pool.starmap(extract,zip(genomes,repeat(base_dir)))
        pool.close()
        pool.join()

        #write orgstring file
        os.chdir(base_dir+'substrings/')
        with open('substrings.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for line in orgstring:
                writer.writerow([line])

    return orgstring
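Example #18 uses ThreadPool, the thread-backed pool from multiprocessing.pool (also available as multiprocessing.dummy.Pool); its starmap has the same signature as the process pool's and avoids pickling, which suits I/O-bound work. A minimal sketch with a hypothetical worker fetch:

from itertools import repeat
from multiprocessing.pool import ThreadPool


def fetch(name, base_dir):
    # Hypothetical I/O-bound worker; with threads, arguments and results
    # are shared directly instead of being pickled.
    return base_dir + "/" + name


if __name__ == "__main__":
    names = ["a", "b", "c"]
    with ThreadPool(len(names)) as pool:
        paths = pool.starmap(fetch, zip(names, repeat("/tmp/data")))
    print(paths)  # ['/tmp/data/a', '/tmp/data/b', '/tmp/data/c']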
Example #19
def applyGrid(_geo_crimes, _n,_grid, _column):
    if _n > 128:
        _n = 128
        print("n was too big. Set to 128.")
    print("splitting crimes in to smaller frames to leverage paralelization")
    _l = len(_geo_crimes.index)
    _crimes_args = []
    _covered = 0
    for _i in range(_n-1):
        _a, _b = int(round(_i*(_l/_n))), int(round((_i+1)*(_l/_n)))
        _crimes_args.append(_geo_crimes[_a:_b])
        _covered = _covered + (_b - _a)
    _crimes_args.append(_geo_crimes[_covered:len(_geo_crimes.index)])
    print("{} data-chunks created.".format(len(_crimes_args)))
    
    
    print("Trying to start {} parallel processes.".format(_n))
    _pool = Pool(processes=_n)
    print("{} parallel process started.".format(_n))
    _result = _pool.starmap(_para_crimes_in_cell, zip(_crimes_args, 
                                          repeat(_grid), repeat(_column)))
    _pool.terminate()
    print("Process terminated.")
    _df = _result.pop(0)
    for _frame in _result:
        _df = _df.append(_frame)
    print("{} crimes where spatialised to their cell.".format(len(_df.index)))
    return _df
Example #20
    def reviews_to_sentences(self, review_list):
        """For a list of reviews, clean each review, and tranform
        each review to a list of sentences (each sentence is a list of words).
        Finally add all the list to a single list, each elment of the list
        is a sentence

        :param review_list: a list of reviews
        :returns: the list of all sentences from all reviews
        :rtype: a list of list of words (string)

        """

        if self.use_pool:
            pool = Pool(self.pool_size)
            sentences_tmp = pool.starmap(
                self.review_to_sentences_static,
                zip(review_list, repeat(self.tokenizer),
                    repeat(self.clean_method), repeat(self.remove_numbers),
                    repeat(self.remove_punct), repeat(self.remove_stopwords)))
            pool.close()

            sentences = []
            for sentence in sentences_tmp:
                sentences.extend(sentence)
        else:
            sentences = []
            for review in review_list:
                sentences += self.review_to_sentences(review)

        return sentences
Example #21
 def getAllMovesLegalConcurrent (self, side) :
     p = Pool(8)
     unfilteredMovesWithBoard = [(move, copy.deepcopy(self.board)) for move in self.board.getAllMovesUnfiltered(side)]
     legalMoves = p.starmap(self.returnMoveIfLegal, unfilteredMovesWithBoard)
     p.close()
     p.join()
     return list(filter(None, legalMoves))
Example #22
def error_rate(method, level, offset):
    pool = Pool()
    error_counts = pool.starmap(
        error_count,
        zip(refs, repeat(method), repeat(level), repeat(offset)))
    pool.close()
    n_values_total = sum(error_count_[0] for error_count_ in error_counts)
    n_errors_total = sum(error_count_[1] for error_count_ in error_counts)
    return n_errors_total/n_values_total
Example #23
def fit_all_ase(ase, func, xs, colnames=None, pool=None, progress=False,
                median_in=None):
    if colnames is None:
        colnames = inspect.getargspec(func).args
        colnames.remove('x')
    if pool is not None:
        if isinstance(pool, int):
            pool = Pool(processes=pool)
        if progress:
            results = [pool.apply_async(fit_func, (func, i, ase, xs),
                                        {'median_in': median_in},)
                       for i in ase.index]
            res = []
            pbar = ProgressBar(maxval=len(results), widgets=[ "|",
                                                             Percentage(), "|",
                                                             SimpleProgress(),
                                                             Bar(),
                                                             FileTransferSpeed(), "|",
                                                             Timer(), "|",
                                                             AdaptiveETA(),
                                                            ])
            for i, r in zip(ase.index, pbar(results)):
                res.append(r.get())
        else:
            res = list(pool.starmap(
                fit_func,
                zip(
                    it.repeat(func),
                    ase.index,
                    it.repeat(ase),
                    it.repeat(xs),
                    it.repeat(None), # p0
                    it.repeat(median_in),
                ),
            ))

    else:
        res = list(it.starmap(
            fit_func,
            zip(
                it.repeat(func),
                ase.index,
                it.repeat(ase),
                it.repeat(xs),
                it.repeat(None), # p0
                it.repeat(median_in),
                )
            ))

    return pd.DataFrame(
            index=ase.index,
            columns=colnames,
            data=res
            )
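Example #23 falls back to itertools.starmap when no pool is supplied, which is a handy way to keep a single code path and toggle parallelism. A minimal sketch of that toggle; the worker power is hypothetical.

import itertools as it
from multiprocessing import Pool


def power(base, exponent):
    # Hypothetical worker shared by the serial and parallel paths.
    return base ** exponent


def run(args, pool=None):
    if pool is None:
        # itertools.starmap unpacks tuples exactly like Pool.starmap, serially.
        return list(it.starmap(power, args))
    return pool.starmap(power, args)


if __name__ == "__main__":
    args = [(2, 3), (3, 2), (4, 1)]
    print(run(args))                 # [8, 9, 4] without a pool
    with Pool(2) as pool:
        print(run(args, pool=pool))  # same result, computed in parallel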
Example #24
def run_bteq_parallel(bteq_in_file, bteq_log_file, database_name, user_name, user_password, \
                      bteq_ignore_errors, num_of_jobs=1, copy_if_fail=None, copy_to=None):
    """
    Run the same Teradata Bteq script with multiple clones and check for errors.

    Args:
        bteq_in_file (str): Bteq script with queries
        bteq_log_file (str): Name of the file that stores the output
        database_name (str): Teradata database that Bteq will connect to
        user_name (str): User that has permission to run the queries in bteq_in_file
        user_password (str): Password for user_name
        bteq_ignore_errors (Optional(list([int]))): List of errors that can be ignored
        num_of_jobs (int): Number of clone jobs that Teradata Bteq will run in parallel
        copy_if_fail (Optional[str]): The directory that bteq_log_file is copied to if the run fails
        copy_to (Optional[str]): The directory that bteq_log_file is copied to after it completes successfully

    Returns:
        False if errors are found that are not in bteq_ignore_errors, True if successful.

    Raises:
        Raising exception
    """

    def bteq_log_name(i):
        """Creates a unique logfile name with 'i' sequence number."""
        if num_of_jobs==1:
            # if running a single process don't change the name:
            return bteq_log_file
        else:
            return str(i).join(os.path.splitext(bteq_log_file))
            #return bteq_log_file + str(i)
    
    pool = Pool(processes=num_of_jobs)
    results = pool.starmap(run_bteq, [(bteq_in_file, bteq_log_name(i), database_name, user_name, \
                                       user_password, bteq_ignore_errors) for i in range(1, num_of_jobs+1)])
    bteq_pass = bteq_fail = 0
    #https://docs.python.org/2.3/whatsnew/section-enumerate.html
    for i, (bteq_run_result, bteq_errors_that_not_ignore) in enumerate(results, 1):
        
        if bteq_run_result:
            bteq_pass = bteq_pass + 1
            if copy_to is not None:
                copy_file(bteq_log_name(i), copy_to)
        else:
            logging.error("Bteq found that can not be ignored: %s, please check out this log file: %s" \
                          % (bteq_errors_that_not_ignore, bteq_log_name(i)))
            bteq_fail = bteq_fail + 1
            if copy_if_fail is not None:
                copy_file(bteq_log_name(i), copy_if_fail)
    if bteq_fail > 0:
        return False
    else:
        return True    
Example #25
def predict_interval(alpha, Xs, likelihood, basis, m, C, lparams, bparams,
                     nsamples=100, multiproc=True):
    """
    Predictive percentile interval (upper and lower quantiles) for a Bayesian
    GLM.

    Parameters
    ----------
        alpha: float
            The percentile confidence interval (e.g. 95%) to return.
        Xs: ndarray
            (Ns,d) array query input dataset (Ns samples, D dimensions).
        likelihood: Object
            A likelihood object, see the likelihoods module.
        basis: Basis
            A basis object, see the basis_functions module.
        m: ndarray
            (D,) array of regression weights (posterior).
        C: ndarray
            (D,) or (D, D) array of regression weight covariances (posterior).
        lparams: sequence
            a sequence of parameters for the likelihood object, e.g. the
            likelihoods.Gaussian object takes a variance parameter, so this
            should be :code:`[var]`.
        bparams: sequence
            A sequence of hyperparameters of the basis object.
        nsamples: int, optional
            The number of samples to draw from the posterior in order to
            approximate the predictive mean and variance.
        multiproc: bool, optional
            Use multiprocessing to parallelise this prediction computation.

    Returns
    -------
        a: ndarray
            The lower end point of the interval with shape (Ns,)
        b: ndarray
            The upper end point of the interval with shape (Ns,)
    """

    f = _sample_func(Xs, basis, m, C, bparams, nsamples)
    work = ((fn, likelihood, lparams, alpha) for fn in f)

    if multiproc:
        pool = Pool()
        res = pool.starmap(_rootfinding, work)
        pool.close()
        pool.join()
    else:
        res = [_rootfinding(*w) for w in work]
    ql, qu = zip(*res)

    return np.array(ql), np.array(qu)
Example #26
 def test(self, test_paragraphs, subgraph_strategy='greedy', order_strategy='greedy', processes=1):
     print('subgraph_strategy: %s' % subgraph_strategy)
     print('order_strategy: %s' % order_strategy)
     if processes > 1:
         p = Pool(processes)
         graphs, kendall_taus = zip(*p.starmap(self.evaluate, [(t, subgraph_strategy, order_strategy) for t in test_paragraphs]))
     else:
         graphs, kendall_taus = zip(*[self.evaluate(test_paragraph, subgraph_strategy, order_strategy) for test_paragraph in test_paragraphs])
     print('mean, std_dev, min, max kendall tau: ', summary(kendall_taus))
     print(kendall_taus)
     print()
     return graphs
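Example #26 splits the (graph, kendall_tau) pairs returned by starmap into two parallel sequences with zip(*...). The same unpacking in isolation, with a hypothetical worker divide that returns a (quotient, remainder) tuple:

from multiprocessing import Pool


def divide(a, b):
    # Hypothetical worker returning a tuple per task.
    return a // b, a % b


if __name__ == "__main__":
    args = [(7, 2), (9, 4), (10, 5)]
    with Pool(2) as pool:
        # starmap returns [(3, 1), (2, 1), (2, 0)]; zip(*...) transposes it
        # into one tuple of quotients and one of remainders.
        quotients, remainders = zip(*pool.starmap(divide, args))
    print(quotients)   # (3, 2, 2)
    print(remainders)  # (1, 1, 0)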
Example #27
def main(graph, nbk, delta_max, mu,  temp, alpha, max_eval, iter, move_operator, logsPath):

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    fh = logging.FileHandler(logsPath + "/mproc_simulated-annealing.log")
    fh.setLevel(logging.INFO)
    frmt = logging.Formatter('%(message)s')
    fh.setFormatter(frmt)
    log.addHandler(fh)

    all_num_evaluations = []
    all_best_score = []
    all_time = []
    all_temp = []

    nb_proc = cpu_count()
    pool = Pool(processes=nb_proc)
    log.info("-------MULTI_PROC SIMULATED ANNEALING-------")
    startWork = timeit.default_timer()
    results = pool.starmap(doWork, zip(range(iter), repeat(graph), repeat(move_operator), repeat(max_eval),
                                       repeat(delta_max), repeat(mu), repeat(temp), repeat(alpha), repeat(nbk)))
    stopWork = timeit.default_timer()
    timeWork = (stopWork - startWork)

    actual_best_score = sys.maxsize
    for result in results:
        num_evaluations, best_score, best, temp, time = result
        if best_score < actual_best_score:
            actual_best = best
            actual_best_score = best_score
        all_temp.append(temp)
        all_num_evaluations.append(num_evaluations)
        all_best_score.append(best_score)
        all_time.append(time)

    log.info("Running on %d proc" % nb_proc)
    log.info("nbS = %d; nbK = %d; delta_max = %d; mu = %r; alpha = %r; move_operator= %s" % (graph.get_nbVertices(), nbk,
                                                                                           delta_max, mu, alpha,  move_operator.__name__))
    log.info("for %d iteration with %d max_evaluations each, "
             "\n best score found is %d,"
             "\n total time in sec : %r"
             "\n mean time in sec : %r,"
             "\n mean best_score : %r, EcT : %r"
             "\n mean num_eval : %r,"
             "\n mean end temperature : %r" % (iter,
                                               max_eval,
                                               min(score for score in all_best_score),
                                               timeWork,
                                               statistics.mean(all_time),
                                               statistics.mean(all_best_score),
                                               statistics.stdev(all_best_score),
                                               statistics.mean(all_num_evaluations),
                                               statistics.mean(all_temp)))
Example #28
class PatternFinder(object):

	def __init__(self):
		""" Initializes the object and creates a pool of 4 processes """

		super(PatternFinder, self).__init__()
		self.pool = Pool(processes=cpu_count())

	def find(self, pattern, files):
		""" For each file, calls a function that searches for the pattern in
		that file. Each file is handled by a different process.
		To give 2 parameters to the function, a list of tuples is passed, each
		tuple contains the pattern and a file.
		Finally, the list of lists of ids is flattened and returned.
		"""

		res = self.pool.starmap(self.find_in_file, zip([pattern]*len(files), files))

		return list(itertools.chain.from_iterable(res))

	def find_in_file(self, pattern, fileName):
		""" Searches for the pattern in a file, line by line.
		If it finds it, it saves the id. Returns an array of ids.
		"""

		ids = []
		file = open(fileName, "r" )
		lines = file.readlines()

		for line in lines:
			res = re.search(pattern, line, re.DOTALL)
			if res is not None:
				ids.append(int(re.search('^(\d+)', line[:res.start()]).group(0)))
		
		file.close()

		return ids

	def __getstate__(self):
		""" Removes pool object from the instance before pickling
		since Pool objects cannot be pickled.
		"""

		self_dict = self.__dict__.copy()
		del self_dict['pool']

		return self_dict

	def __setstate__(self, state):
		""" Called upon unpickling """

		self.__dict__.update(state)	
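Example #28 passes a bound method to starmap, so the whole instance is pickled for the worker processes; the __getstate__/__setstate__ pair exists to drop the unpicklable Pool from that pickle. A minimal sketch of the same workaround with a hypothetical Squarer class:

from multiprocessing import Pool


class Squarer:
    def __init__(self, offset):
        self.offset = offset
        self.pool = Pool(processes=2)  # Pool objects cannot be pickled

    def compute(self, x, y):
        # Bound method used as the starmap worker; pickling it pickles `self`.
        return x * y + self.offset

    def run(self, pairs):
        return self.pool.starmap(self.compute, pairs)

    def __getstate__(self):
        # Drop the pool before the instance is pickled for the workers.
        state = self.__dict__.copy()
        del state['pool']
        return state

    def __setstate__(self, state):
        # Restore everything except the pool when unpickling.
        self.__dict__.update(state)


if __name__ == "__main__":
    s = Squarer(offset=1)
    print(s.run([(2, 3), (4, 5)]))  # [7, 21]
    s.pool.close()
    s.pool.join()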
Example #29
def computeNormals(PointsXYZ):

    p = Pool(4)

    f1,f2 = PointsXYZ[:int(len(PointsXYZ)/2)],PointsXYZ[int(len(PointsXYZ)/2):]
    one, two = f1[:int(len(f1)/2)],f1[int(len(f1)/2):]
    three,four = f2[:int(len(f2)/2)],f2[int(len(f2)/2):]

    jobs = [[one,PointsXYZ],[two,PointsXYZ],[three,PointsXYZ],[four,PointsXYZ]]
    
    L1,L2,L3,L4= p.starmap(Normals,jobs)
    PointsNormal = L1+L2+L3+L4
    return PointsNormal
Example #30
 def evaluate(self):
     _pool = Pool(self.n_jobs)
     # iterator of parameters for the multiprocessing objective function
     _iterable = it.product([self.parametros], np.int32(self.sample[:, :self.dimensions]),
                            [self.estimator], [self.score_cache], [self.resultados])
     print(_iterable)
     self.sample[:, -1] = _pool.starmap(self.objective_function, _iterable)
     _pool.close()
     _pool.join()
     if self.debug:
         print("evaluate")
         print(self.sample)
         print("\n")
Example #31
def main(options, args):

    time0 = time.time()
    image_dir = args[0]
    geojson_list = io_function.get_file_list_by_ext('.geojson',
                                                    image_dir,
                                                    bsub_folder=False)
    # remove some scenes, or maybe we should set bsub_folder=False
    # geojson_list = [item for item in geojson_list if 'incomplete_scenes' not in item ]  # remove those in "incomplete_scenes"
    # geojson_list = [item for item in geojson_list if 'scenes_high_cloud_cover' not in item ]  # remove those in "scenes_high_cloud_cover"

    if len(geojson_list) < 1:
        raise ValueError('There is no geojson files in %s' % image_dir)

    basic.outputlogMessage('Image Dir: %s' % image_dir)
    basic.outputlogMessage("Number of geojson files: %d" % len(geojson_list))

    grid_polygon_shp = args[
        1]  # the polygon should be in projection Cartesian coordinate system (e.g., UTM )
    basic.outputlogMessage('Image grid polygon shapefile: %s' %
                           grid_polygon_shp)
    process_num = options.process_num
    basic.outputlogMessage(
        'The number of processes for creating the mosaic is: %d' % process_num)

    # read grid polygons
    grid_polygons = vector_gpd.read_polygons_gpd(grid_polygon_shp)
    grid_ids = vector_gpd.read_attribute_values_list(grid_polygon_shp, 'id')
    if grid_ids is None:
        basic.outputlogMessage(
            'Warning, field: id is not in %s, will create default ID for each grid'
            % grid_polygon_shp)
        grid_ids = [id + 1 for id in range(len(grid_polygons))]

    shp_prj = map_projection.get_raster_or_vector_srs_info_proj4(
        grid_polygon_shp).strip()
    # print(shp_prj)
    grid_polygons_latlon = grid_polygons
    if shp_prj != '+proj=longlat +datum=WGS84 +no_defs':
        # read polygons and reproject to 4326 projection
        grid_polygons_latlon = vector_gpd.read_shape_gpd_to_NewPrj(
            grid_polygon_shp, 'EPSG:4326')
    # else:
    #     raise ValueError(' %s should be in projection of Cartesian coordinate system'%grid_polygon_shp)

    shp_prj_wkt = map_projection.get_raster_or_vector_srs_info_wkt(
        grid_polygon_shp)

    max_sr = options.max_sr
    min_sr = options.min_sr

    original_img_copy_dir = options.original_img_copy_dir
    b_to_rgb_8bit = options.to_rgb
    basic.outputlogMessage('Convert to 8bit RGB images: %s' %
                           str(b_to_rgb_8bit))

    # group planet image based on acquisition date
    b_group_date = options.group_date
    basic.outputlogMessage('Group Planet image based on acquisition date: %s' %
                           str(b_group_date))
    if b_group_date:
        # diff_days as 0, group images acquired at the same date
        geojson_groups = group_planet_images_date(geojson_list, diff_days=0)

        # sort based on yeardate in accending order : operator.itemgetter(0)
        geojson_groups = dict(
            sorted(geojson_groups.items(), key=operator.itemgetter(0)))

        save_group_txt = 'geojson_groups_input_folder.txt'
        basic.outputlogMessage(
            'images are divided into %d groups, save to %s' %
            (len(geojson_groups.keys()), save_group_txt))
        io_function.save_dict_to_txt_json(save_group_txt, geojson_groups)
    else:
        geojson_groups = {'all': geojson_list}

    # create mosaic of each grid
    cloud_cover_thr = options.cloud_cover
    cloud_cover_thr = cloud_cover_thr * 100  # for Planet image, it is percentage
    out_res = options.out_res
    cur_dir = os.getcwd()
    resampling_method = options.merged_method

    for key in geojson_groups.keys():

        # # test
        # if key != '20200701':
        #     continue

        geojson_list = geojson_groups[key]
        save_dir = os.path.basename(cur_dir) + '_mosaic_' + str(
            out_res) + '_' + key
        # print(save_dir)
        if process_num == 1:
            for id, polygon, poly_latlon in zip(grid_ids, grid_polygons,
                                                grid_polygons_latlon):
                # if id != 34:
                #     continue
                create_moasic_of_each_grid_polygon(
                    id,
                    polygon,
                    poly_latlon,
                    out_res,
                    cloud_cover_thr,
                    geojson_list,
                    save_dir,
                    new_prj_wkt=shp_prj_wkt,
                    new_prj_proj4=shp_prj,
                    sr_min=min_sr,
                    sr_max=max_sr,
                    to_rgb=b_to_rgb_8bit,
                    save_org_dir=original_img_copy_dir,
                    resampling_method=resampling_method)
        elif process_num > 1:
            theadPool = Pool(process_num)  # multi processes

            parameters_list = [
                (id, polygon, poly_latlon, out_res, cloud_cover_thr,
                 geojson_list, save_dir, shp_prj_wkt, shp_prj, min_sr, max_sr,
                 b_to_rgb_8bit, 0, original_img_copy_dir)
                for id, polygon, poly_latlon in zip(grid_ids, grid_polygons,
                                                    grid_polygons_latlon)
            ]
            results = theadPool.starmap(create_moasic_of_each_grid_polygon,
                                        parameters_list)  # need python3
            theadPool.close()
        else:
            raise ValueError('incorrect process number: %d' % process_num)

    cost_time_sec = time.time() - time0
    basic.outputlogMessage(
        'Done, total time cost %.2f seconds (%.2f minutes or %.2f hours)' %
        (cost_time_sec, cost_time_sec / 60, cost_time_sec / 3600))

    pass
Example #32
def initialize_variables(input_v, activate_parallel, num_cores):
    from evaluate_objective import evaluate_objective
    from random import uniform
    import numpy as np
    from multiprocessing import Pool, cpu_count
    N = input_v['population']
    M = input_v['M']
    V = input_v['V']

    mini = input_v['min_range']
    maxi = input_v['max_range']

    K = M + V
    my_list = []

    ##Initialize each chromosome
    f_obj = np.zeros((N, M))  ##To only contain objective function values
    f_x = np.zeros((N, V))  ##To contain only variables

    if True:
        #print("in parallel")
        for i in range(0, N):
            ##Initialize the decision variables based on the minimum and the maximum possible values.
            ##V is the number of decision variables. A random number is picked between the minimum and
            ##maximum possible values for each decision variable.

            for j in range(0, V):
                f_x[i, j] = mini[j] + ((maxi[j] - mini[j]) * uniform(0, 1))

            my_list.append((f_x[i, :], input_v))

            ##For ease of computation and handling data the chromosome also has the value of the objective
            ##function concatenated at the end. The elements V + 1 to K has the objective function values.
            ##The function evaluate_objective takes one chromosome at a time, infact only the decision variables
            ##are passed to the function along with information about the number of objective functions which are
            ##processed and returns the value for the objective functions. These values are now stored at the end
            ##of the chromosome itself.
        #print(*my_list)
        if (num_cores < cpu_count()) and (num_cores >= 1):

            p = Pool(num_cores)

        else:
            p = Pool()

        results = p.starmap(evaluate_objective, my_list)
        p.close()
        p.join()
        for i in range(0, N):
            f_obj[i, :] = results[i]
    else:

        for i in range(0, N):
            ##Initialize the decision variables based on the minimum and the maximum possible values.
            ##V is the number of decision variables. A random number is picked between the minimum and
            ##maximum possible values for each decision variable.

            for j in range(0, V):
                f_x[i, j] = mini[j] + ((maxi[j] - mini[j]) * uniform(0, 1))

            ##For ease of computation and handling data the chromosome also has the value of the objective
            ##function concatenated at the end. The elements V + 1 to K has the objective function values.
            ##The function evaluate_objective takes one chromosome at a time, infact only the decision variables
            ##are passed to the function along with information about the number of objective functions which are
            ##processed and returns the value for the objective functions. These values are now stored at the end
            ##of the chromosome itself.

            f_obj[i, :] = evaluate_objective(f_x[i, :], input_v)
            number = i + 1
            display_message = 'Initializing population ' + str(
                number) + ' of ' + str(N)
            print(display_message)

    input_v['objAll'] = f_obj
    input_v['xAll'] = f_x

    f_all = np.concatenate((f_x, f_obj), axis=1)

    return f_all, input_v


##Copyright (c) 2009, Aravind Seshadri
##All rights reserved.

##Redistribution and use in source and binary forms, with or without  modification, are permitted provided that the following
##conditions are met:

##   * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
##   * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer
##     in the documentation and/or other materials provided with the distribution
##
##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
##NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
##THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
##(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
##HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
##ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Example #33
        with open(os.path.join(output_path, title + '.json'), 'w') as f:
            json.dump(f0_labels, f)
    except:
        print(title)
        pass


if __name__ == '__main__':

    input1_path = sys.argv[1]
    input2_path = sys.argv[2]
    output_path = sys.argv[3]

    #input1_path = '../build/15_augmented_f0'
    #input2_path = '../build/16_f0_timepoints'
    #output_path = '../build/18_f0_labels'

    # This parameter is the frequency resolution, i.e. vertical sampling rate
    s = 24

    titles1 = load_titles(input1_path, '.f0')
    titles2 = load_titles(input2_path, '.json')

    titles = list(set(titles1).intersection(titles2))

    p = Pool()
    p.starmap(extract_f0_labels,
              [(title, input1_path, input2_path, output_path, s)
               for title in titles])
Example #34
                nb_all.append(bcut + n_cmin_buckets)
    rshape = (len(args.space_list), len(b_cutoffs), len(args.n_hashes_list))
    n_cm_all = np.array(nb_all) - np.array(bcut_all)

    if args.lookup_data:
        min_scut = np.min(scut_all) # no need to store elements that are smaller
        x_train = np.asarray(x_train)
        x_train_hh = x_train[y_train > min_scut]
        y_train_hh = y_train[y_train > min_scut]
        lookup_dict = dict(zip(x_train_hh, y_train_hh))

    start_t = time.time()
    pool = Pool(args.n_workers)
    if args.perfect_order:
        y_sorted = np.sort(y_valid)[::-1]
        results = pool.starmap(run_ccm, zip(repeat(y_sorted), bcut_all, nh_all, nb_all, repeat(name), repeat(sketch_type)))
    elif args.lookup_data:
        results = pool.starmap(run_ccm_lookup, zip(repeat(x_valid), repeat(y_valid), nh_all, n_cm_all, repeat(lookup_dict), scut_all, repeat(name), repeat(sketch_type)))
    else:
        results = pool.starmap(run_ccm, zip(repeat(y_valid_ordered), bcut_all, nh_all, nb_all, repeat(name), repeat(sketch_type)))
    pool.close()
    pool.join()
    valid_results, space_actual = zip(*results)
    valid_results = np.reshape(valid_results, rshape)
    space_actual = np.reshape(space_actual, rshape)
    bcut_all = np.reshape(bcut_all, rshape)
    scut_all = np.reshape(scut_all, rshape)
    nh_all = np.reshape(nh_all, rshape)
    nb_all = np.reshape(nb_all, rshape)

    log_str += '==== valid_results ====\n'
Example #35
	def fft2_calc(self,line,zone,start_time,mean_time,coef_time=1,fft_type="fft",norm=None,resamp=None,crop=None,val_sum=None,min_lim=None,max_lim=None,val_type=None,data_filter=None,pp_stack=None,pp_fftt=None,pp_ffts=None):
		""" Calcul de la transformée 2D

		pos_start,pos_end: position intiale et finale de calcul
		start_time:        Temps à partir duquel la FFT commence 
		mean_time:         Durée sur laquelle la FFT est appliquée   
		coef_time:         Nombre de set consécutif de donnée de durée mean_time à concaténer 
		fft_type:          fft,rfft,ifft,irfft
		norm:			   None or ortho
		dist_corr:		   Relation between fiber length and pipe length
		resamp:            [lsampling,timesampling]
		crop:              [kinf,ksup,finf,fsup]
		val_sum:           Intensity are summed or averaged (default averaged)
		min_lim:           Set to min_lim[1] intensity less than min_lim[0]
		max_lim:           Set to max_lim[1] intensity more than max_lim[0]
		val_type		   Output data "real" or "abs"
		pp_stack		   Functions and args to apply before data stacking after the FFTT
		pp_fftt			   Functions and args to apply after data stacking
		pp_ffts			   Functions and args to apply after the FFTS
		"""
		c = 0
		start_data = int(start_time*self.fsamp)
		dt = int(mean_time*self.fsamp)
		duration = int(mean_time*coef_time*self.fsamp)
		if fft_type == "fft" or fft_type == "ifft" or fft_type == "fft2":
			res = n.zeros((int(self.lines[line][zone][1]-self.lines[line][zone][0]),dt),dtype=complex)
		if fft_type == "rfft" or fft_type == "irfft" or fft_type == "rfft2":
			res = n.zeros((int(self.lines[line][zone][1]-self.lines[line][zone][0]),int(dt/2)+1),dtype=complex)
			# if fft_type == "rfft2":
			# 	res = n.zeros((int((self.lines[line][zone][1]-self.lines[line][zone][0])/2)+1,dt),dtype=complex)
		k = cpu_count()
		print("FFT time")
		for i in range(start_data,int(start_data+duration),k*dt):
			argsdat = list()
			print(i)
			for l in range(k):
				if int(i+(l+1)*dt) <= int(start_data+duration):
					if fft_type == "rfft2" or fft_type == "fft2":
						argsdat.append((self.get_array(self.lines[line][zone][0],self.lines[line][zone][1],i,int(i+dt)),None,(-2,-1),norm))
					else:
						argsdat.append((self.get_array(self.lines[line][zone][0],self.lines[line][zone][1],i,int(i+dt)),None,1,norm))
			pool = Pool(k)
			t=time.time()
			if fft_type == "fft": restemp = pool.starmap(n.fft.fft,argsdat)
			if fft_type == "rfft": restemp = pool.starmap(n.fft.rfft,argsdat)
			if fft_type == "ifft": restemp = pool.starmap(n.fft.ifft,argsdat)
			if fft_type == "irfft": restemp = pool.starmap(n.fft.irfft,argsdat)
			pool.close()
			pool.join()
			print("FFT time finished in ",time.time()-t,"s")
			del pool
			del argsdat
			c+= len(restemp)
			restemp = n.array(restemp)
			for j in range(len(restemp)):
				aa = vt.set_to_lim(restemp[j,:,:],min_lim,max_lim)
				if pp_stack is not None:
					for i,j in pp_stack.items():
						if j["args"] is not None: aa = j["func"](aa,**j["args"])
						else: aa = j["func"](aa)
				res += aa 
			del restemp
		# X and Y vec calc
		if fft_type == "fft" or fft_type == "ifft" or fft_type == "fft2": 
			self.fvec = n.fft.fftfreq(n.size(res,1),1/self.fsamp)
		if fft_type == "rfft" or fft_type == "irfft" or fft_type == "rfft2":
			self.fvec = n.fft.rfftfreq((n.size(res,1)-1)*2+1,1/self.fsamp)
		res,[k,self.fvec] = vt.sort_vecs(res,[None,self.fvec])
		dx = self.spatial_res/self.lines[line]["dist_ratio"]
		self.kvec = n.fft.fftfreq(n.size(res,0),dx)
		# Post fft time processing
		if pp_fftt is not None:
			for i,j in pp_fftt.items():
				if j["args"] is not None: res = j["func"](res,**j["args"])
				else: res = j["func"](res)
		print("FFT time calc in progress")
		t=time.time()
		# if fft_type != "fft2" and fft_type != "rfft2":
		res = n.fft.fft(res,None,0,norm)
		print("FFT space calc finished in",time.time()-t,"s")
		# Post fft space processing
		if pp_ffts is not None:
			for i,j in pp_ffts.items():
				if j["args"] is not None: res = j["func"](res,**j["args"])
				else: res = j["func"](res)

		# Post processing
		if val_type == "real": res = n.real(res)
		if val_type == "abs": res = n.abs(res)
		if val_sum is not None: res = res/c
		if crop is not None:
			res,[self.kvec,self.fvec] = vt.crop_vecs(res,[self.kvec,self.fvec],crop)
		if resamp is not None:
			if resamp[0] == "auto": resamp[0] = len(self.fvec)
			if resamp[1] == "auto": resamp[1] = len(self.kvec)
			res,[self.kvec,self.fvec] = vt.resamp_vecs(res,[self.kvec,self.fvec],[resamp[0],resamp[1]],False)
		res,[self.kvec,self.fvec] = vt.sort_vecs(res,[self.kvec,self.fvec])
		return res,self.kvec,self.fvec
Example #36
def test_calc_events():
    test_data_dir = '/u/casey/scratch/work/microlens/galaxia_test/OGLE672/'
    hdf5_file = test_data_dir + 'OGLE672.h5'
    output_root2 = 'OGLE672_TEST'
    obs_time = 1000
    n_obs = 101
    radius_cut = 2
    theta_frac = 2
    blend_rad = 0.8
    microlens_path = '/u/casey/scratch/code/PopSyCLE'
    n_proc = 1
    overwrite = False

    t0 = time.time()

    # Initialize events_tmp and blends_tmp.
    events_tmp = None
    blends_tmp = None

    # Get the l and b from the HDF5 file.
    hf = h5py.File(hdf5_file, 'r')
    l_array = np.array(hf['long_bin_edges'])
    b_array = np.array(hf['lat_bin_edges'])
    hf.close()

    # Converts radius_cut from arcseconds into milliarcseconds
    radius_cut *= 1000.0

    # Set up the multiprocessing
    pool = Pool(n_proc)

    # Set up inputs to be able to be read by pool.map
    # Only do a few of the bins.
    nll = 7
    nbb = 7

    llbb = itertools.product(range(nll), range(nbb))

    reps = nll * nbb

    hd = itertools.repeat(hdf5_file, reps)
    ot = itertools.repeat(obs_time, reps)
    no = itertools.repeat(n_obs, reps)
    rc = itertools.repeat(radius_cut, reps)
    tf = itertools.repeat(theta_frac, reps)
    br = itertools.repeat(blend_rad, reps)

    inputs = zip(llbb, hd, ot, no, rc, tf, br)

    ##########
    # Loop through galactic latitude and longitude bins. For each bin vertex, take
    # the nearest 4 bin samples and calculate microlensing events. We do this
    # to properly handle bin edges (i.e. a sliding window analysis of 2x2 bins).
    # Duplicate events are removed.
    ##########
    # Should I use starmap_async?
    results = pool.starmap(synthetic._calc_event_time_loop, inputs)

    pool.close()
    pool.join()

    # Remove all the None values
    # (occurs for patches with less than 10 objects)
    results = [i for i in results if i is not None]

    # Remove all the None values
    # (occurs for patches with less than 10 objects)
    results = [i for i in results if i is not None]

    results_ev = []
    results_bl = []

    for ii in range(len(results)):
        if results[ii] is not None:
            if results[ii][0] is not None:
                results_ev.append(results[ii][0])
            if results[ii][1] is not None:
                results_bl.append(results[ii][1])

    events_tmp = np.concatenate(results_ev, axis=1)
    blends_tmp = np.concatenate(results_bl, axis=1)

    # Convert the events numpy array into an Astropy Table for easier consumption.
    # The dimensions of events_final_table are 52 x Nevents
    if events_tmp is not None:
        events_tmp = synthetic.unique_events(events_tmp)
        events_final = Table(
            events_tmp.T,
            names=('zams_mass_L', 'rem_id_L', 'mass_L', 'px_L', 'py_L', 'pz_L',
                   'vx_L', 'vy_L', 'vz_L', 'rad_L', 'glat_L', 'glon_L', 'vr_L',
                   'mu_b_L', 'mu_lcosb_L', 'age_L', 'popid_L', 'ubv_k_L',
                   'ubv_i_L', 'exbv_L', 'obj_id_L', 'ubv_j_L', 'ubv_u_L',
                   'ubv_r_L', 'ubv_b_L', 'ubv_h_L', 'ubv_v_L', 'zams_mass_S',
                   'rem_id_S', 'mass_S', 'px_S', 'py_S', 'pz_S', 'vx_S',
                   'vy_S', 'vz_S', 'rad_S', 'glat_S', 'glon_S', 'vr_S',
                   'mu_b_S', 'mu_lcosb_S', 'age_S', 'popid_S', 'ubv_k_S',
                   'ubv_i_S', 'exbv_S', 'obj_id_S', 'ubv_j_S', 'ubv_u_S',
                   'ubv_r_S', 'ubv_b_S', 'ubv_h_S', 'ubv_v_S', 'theta_E', 'u0',
                   'mu_rel', 't0'))

    if blends_tmp is not None:
        blends_tmp = synthetic.unique_blends(blends_tmp)
        blends_final = Table(
            blends_tmp.T,
            names=('obj_id_L', 'obj_id_S', 'zams_mass_N', 'rem_id_N', 'mass_N',
                   'px_N', 'py_N', 'pz_N', 'vx_N', 'vy_N', 'vz_N', 'rad_N',
                   'glat_N', 'glon_N', 'vr_N', 'mu_b_N', 'mu_lcosb_N', 'age_N',
                   'popid_N', 'ubv_k_N', 'ubv_i_N', 'exbv_N', 'obj_id_N',
                   'ubv_j_N', 'ubv_u_N', 'ubv_r_N', 'ubv_b_N', 'ubv_h_N',
                   'ubv_v_N', 'sep_LN'))

    if events_tmp is None:
        print('No events!')
        return

    t1 = time.time()

    # Save out file
    events_final.write(output_root2 + '_events.fits', overwrite=overwrite)
    blends_final.write(output_root2 + '_blends.fits', overwrite=overwrite)

    print('Total runtime: {0:f} s'.format(t1 - t0))

    return
Example #37
def analyze_recon(sbml_folder, output_stat_file, padmet_folder=None, padmet_bool=None, nb_cpu=1):
    """Analyze the sbml and/or the padmet files after metabolic network reconstruction.
    And write the result in a file.

    Args:
        sbml_folder (str): directory of SBML files
        output_stat_file (str): path to output stat file
        padmet_folder (str): directory of PADMET files
        padmet_bool (bool): use or not the padmet files
        nb_cpu (int): number of CPU to use
    """
    analyze_pool = Pool(processes=nb_cpu)
    if padmet_bool and padmet_folder:
        genes = {}
        reactions = {}
        gene_associated_reactions = {}
        compounds = {}
        pathways = {}

        multiprocessing_data = []

        if len(os.listdir(padmet_folder)) == 0:
            logger.critical("No padmet in " + padmet_folder)
            sys.exit(1)

        for padmet in os.listdir(padmet_folder):
            padmet_file = os.path.join(padmet_folder, padmet)
            species_name = padmet.replace('.padmet', '')
            multiprocessing_data.append((species_name, padmet_file))
        recon_stats = analyze_pool.starmap(create_padmet_stat, multiprocessing_data)

        with open(output_stat_file, 'w') as micro_file:
            csvwriter = csv.writer(micro_file, delimiter='\t')
            csvwriter.writerow(['species', 'nb_reactions', 'nb_reactions_with_genes', 'nb_genes', 'nb_compounds', 'nb_pathways'])
            for recon_stat in recon_stats:
                species_name = recon_stat[0]
                genes[species_name] = recon_stat[1]
                reactions[species_name] = recon_stat[2]
                gene_associated_reactions[species_name] = recon_stat[3]
                compounds[species_name] = recon_stat[4]
                pathways[species_name] = recon_stat[5]
                csvwriter.writerow([species_name, len(reactions[species_name]), len(gene_associated_reactions[species_name]),
                                    len(genes[species_name]), len(compounds[species_name]), len(pathways[species_name])])
    else:
        genes = {}
        reactions = {}
        compounds = {}
        pathways = None
        gene_associated_reactions = {}

        multiprocessing_data = []

        if len(os.listdir(sbml_folder)) == 0:
            logger.critical("No sbml in " + sbml_folder)
            sys.exit(1)

        for sbml in os.listdir(sbml_folder):
            species_name = sbml.replace('.sbml','')
            sbml_file = os.path.join(sbml_folder, sbml)
            multiprocessing_data.append((species_name, sbml_file))
        sbml_stats = analyze_pool.starmap(create_sbml_stat, multiprocessing_data)

        with open(output_stat_file, 'w') as micro_file:
            csvwriter = csv.writer(micro_file, delimiter='\t')
            csvwriter.writerow(['species', 'nb_reactions', 'nb_reactions_with_genes', 'nb_genes', 'nb_compounds'])
            for sbml_stat in sbml_stats:
                species_name = sbml_stat[0]
                genes[species_name] = set(sbml_stat[1])
                reactions[species_name] = set(sbml_stat[2])
                gene_associated_reactions[species_name] = set(sbml_stat[3])
                compounds[species_name] = set(sbml_stat[4])
                csvwriter.writerow([species_name, len(reactions[species_name]), len(gene_associated_reactions[species_name]),
                                    len(genes[species_name]), len(compounds[species_name])])

    analyze_pool.close()
    analyze_pool.join()

    logger.info("######### Stats GSMN reconstruction #########")

    if len(genes) == len(reactions) and len(genes) == len(compounds) and len(reactions) == len(compounds):
        logger.info("Number of genomes: " + str(len(genes)))

    dataset_all_reactions = set([reaction for species_name in reactions for reaction in reactions[species_name]])
    logger.info("Number of reactions in all GSMN: " + str(len(dataset_all_reactions)))

    dataset_all_compounds = set([compound for species_name in compounds for compound in compounds[species_name]])
    logger.info("Number of compounds in all GSMN: " + str(len(dataset_all_compounds)))

    species_reactions = [len(reactions[species_name]) for species_name in reactions]
    if len(species_reactions) > 1:
        mean_species_reactions, sd_species_reactions = mean_sd_data(species_reactions)
        if mean_species_reactions and sd_species_reactions:
            logger.info("Average reactions per GSMN: " + mean_species_reactions + sd_species_reactions)
    else:
        logger.info("Number of reactions in GSMN: " + str(species_reactions[0]))

    species_compounds = [len(compounds[species_name]) for species_name in compounds]
    if len(species_compounds) > 1:
        mean_species_compounds, sd_species_compounds = mean_sd_data(species_compounds)
        if mean_species_compounds and sd_species_compounds:
            logger.info("Average compounds per GSMN: " + mean_species_compounds + sd_species_compounds)
    else:
        logger.info("Number of compounds in GSMN: " + str(species_compounds[0]))

    species_genes = [len(genes[species_name]) for species_name in genes]
    if len(species_genes) > 1:
        mean_species_genes, sd_species_genes = mean_sd_data(species_genes)
        if mean_species_genes and sd_species_genes:
            logger.info("Average genes per GSMN: " + mean_species_genes + sd_species_genes)
    else:
        logger.info("Number of genes in GSMN: " + str(species_genes[0]))

    if pathways:
        species_pathways = [len(pathways[species_name]) for species_name in pathways]
        if len(species_pathways) > 1:
            mean_species_pathways, sd_species_pathways = mean_sd_data(species_pathways)
            if mean_species_pathways and sd_species_pathways:
                logger.info("Average pathways per GSMN: " + mean_species_pathways + sd_species_pathways)
        else:
            logger.info("Number of pathways in GSMN: " + str(species_pathways[0]))

    gene_reactions_assoc_percentages = []
    for species_name in reactions:
        if len(reactions[species_name]) > 0:
            gene_reactions_assoc_percentages.append(((len(gene_associated_reactions[species_name]) / len(reactions[species_name]))*100))
        else:
            gene_reactions_assoc_percentages.append(0)
            logger.info('Warning: ' + species_name + ' metabolic network contains 0 reactions.')
    if len(gene_reactions_assoc_percentages) > 1:
        mean_gene_reactions_assoc_percentages, sd_gene_reactions_assoc_percentages = mean_sd_data(gene_reactions_assoc_percentages)
        if mean_gene_reactions_assoc_percentages and sd_gene_reactions_assoc_percentages:
            logger.info('Percentage of reactions associated with genes: ' + mean_gene_reactions_assoc_percentages + sd_gene_reactions_assoc_percentages)
    else:
        logger.info('Percentage of reactions associated with genes: ' + str(gene_reactions_assoc_percentages[0]))
Example #38
def main():
    parser = ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model",
                        type=str,
                        required=True,
                        choices=[
                            "bert-base-uncased", "bert-large-uncased",
                            "bert-base-cased",
                            "bert-base-multilingual-uncased",
                            "bert-base-chinese", "bert-base-multilingual-cased"
                        ])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--do_whole_word_mask",
        action="store_true",
        help=
        "Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Reduce memory usage for large datasets by keeping data on disc rather than in memory"
    )

    parser.add_argument("--num_workers",
                        type=int,
                        default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate",
                        type=int,
                        default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument(
        "--short_seq_prob",
        type=float,
        default=0.1,
        help="Probability of making a short sentence as a training example")
    parser.add_argument(
        "--masked_lm_prob",
        type=float,
        default=0.15,
        help="Probability of masking each token for the LM task")
    parser.add_argument(
        "--max_predictions_per_seq",
        type=int,
        default=20,
        help="Maximum number of tokens to mask in each sequence")

    args = parser.parse_args()

    print('Using seed', args.seed)
    random.seed(args.seed)

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    if tokens:  # Sometimes the stuff is really weird and tokenization will fail
                        doc.append(tokens)
            if doc:
                docs.add_document(
                    doc
                )  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            exit(
                "ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                "sections or paragraphs.")

        args.output_dir.mkdir(exist_ok=True)

        if args.num_workers > 1:
            writer_workers = Pool(
                min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx)
                         for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch)
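
The loading loop above treats a blank line as a document boundary, which the error message explains is required so that random NextSentence pairs are not drawn from the same document. A purely illustrative sketch (file name and sentences are made up) of writing a valid two-document corpus:

from pathlib import Path

# Two documents, one sentence per line, separated by a blank line
# (the blank line is what triggers docs.add_document() in the loop above).
toy_corpus = "\n".join([
    "The first document starts here.",
    "It contains two sentences.",
    "",
    "The second document is completely separate.",
    "Blank lines mark the boundary between documents.",
])
Path("toy_corpus.txt").write_text(toy_corpus + "\n")
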
Пример #39
0
    train_titles = sorted(
        set(titles['train']).intersection(merged_labels_titles))
    test_titles = sorted(
        set(titles['test']).intersection(merged_labels_titles))
    valid_titles = sorted(
        set(titles['valid']).intersection(merged_labels_titles))

    print(train_titles)

    unary_batch = True

    if unary_batch:

        p = Pool()
        p.starmap(unary_batching, [(title, input1_path, train_output_path)
                                   for title in train_titles])
        p.starmap(unary_batching, [(title, input1_path, test_output_path)
                                   for title in test_titles])
        p.starmap(unary_batching, [(title, input1_path, valid_output_path)
                                   for title in valid_titles])

    else:

        # this option puts all the files with the same length in the same batch.
        # this makes training faster, but it also produces lower quality models

        # This is a limit on how long/large a batch will be
        # A batch is n_time_steps*n_files,
        # where all the files have the same number of time steps
        size_limit = 10000
Пример #40
0
def run_overlay_resources_score_motifs(motif_sites_dir,
                                       all_chromatin_makrs_all_cells_combined_dir_path, 
                                       motifs_overlapping_tracks_output_dir,
                                       run_in_parallel_param,
                                       number_processes_to_run_in_parallel,
                                       normal_expression_per_tissue_origin_per_TF,
                                       matching_cell_name_representative_dict, 
                                       motifTFName_TFNames_matches_dict, 
                                       cells_assays_dict, 
                                       cell_tfs, 
                                       tf_cells, 
                                       assay_cells_datatypes, 
                                       header):
    """pairs matching chromosomes in motif_sites_input_dir and all_chromatin_makrs_all_cells_input_dir and calls overlay_resources_score_motifs
    Input: motif instances input dir (one file per chr)
           chromatin data collection dir (one file per chr, bed4 format; track pos, track cell#assaytype#value or cell#TFname in case of chip-seq) 
    Return: a list of motif_overlapping_track files
    Precondition: files in motif_sites_input_dir and chromatin_tracks_input_dir should have the same names 
                  Recommended: name files in both dirs as chrNumber, chrX or chrY (where number is between 1-22)
    """
    
    motif_files = []
    if not os.path.isdir(motif_sites_dir) and os.path.isfile(motif_sites_dir):
        motif_files = [motif_sites_dir]
        motif_sites_dir = "."
    else:
        motif_files = os.listdir(motif_sites_dir)

   
    motif_files_full_path = [motif_sites_dir + '/' + s for s in motif_files]
    print(motif_files_full_path)
    
    chromatin_tracks_files = os.listdir(all_chromatin_makrs_all_cells_combined_dir_path)
    if not os.path.exists(motifs_overlapping_tracks_output_dir):
        os.makedirs(motifs_overlapping_tracks_output_dir)
    #scored_motifs_chromatin_tracks_output_file = '.'.join(motifs_overlapping_tracks_file.split('.')[0:-1]) + '_scored.bed10' 
    #if not (os.path.exists(motifs_overlapping_tracks_file) and os.path.exists(scored_motifs_chromatin_tracks_output_file)):
    if run_in_parallel_param and len(motif_files)>1:
        p = Pool(int(number_processes_to_run_in_parallel))
        motifs_overlapping_tracks_files = p.starmap(overlay_resources_score_motifs, product(motif_files_full_path , 
                                                                    [motifs_overlapping_tracks_output_dir],
                                                                     [all_chromatin_makrs_all_cells_combined_dir_path],
                                                                    [chromatin_tracks_files]))
        p.close()
        p.join()
    else:
    
        motifs_overlapping_tracks_files = overlay_resources_score_motifs(motif_files_full_path, 
                                                motifs_overlapping_tracks_output_dir,
                                                all_chromatin_makrs_all_cells_combined_dir_path, 
                                                chromatin_tracks_files)

    #score intersected track files
    print(motifs_overlapping_tracks_files)
    scored_motifs_overlapping_tracks_files =[]
    for motifs_overlapping_tracks_file in motifs_overlapping_tracks_files:
        scored_motifs_chromatin_tracks_output_file = '.'.join(motifs_overlapping_tracks_file.split('.')[0:-1]) + '_scored.bed10' 
        with open(motifs_overlapping_tracks_file) as f:
                    count = sum(1 for _ in f)
        if not os.path.exists(scored_motifs_chromatin_tracks_output_file):  # score each motif-track_overlapping file
            print("computing scores to: " + scored_motifs_chromatin_tracks_output_file)
            index_track_names=7
            index_motif_name=3
            with open(scored_motifs_chromatin_tracks_output_file, 'w') as scored_motifs_writefile:
                if header:
                    header_line = ['posrange', 'chr', 'motifstart', 'motifend', 'name', 'score', 'pval','strand']
                    for cell in sorted(cells_assays_dict.keys()):
                        for assay in sorted(cells_assays_dict[cell].keys()):
                            if cell[0].isdigit():
                            #if cell=='22Rv1' or cell=='8988T':
                                cell='a'+cell
                                
                            #if cell=="Ammon's horn":
                            #    cell="Ammons horn"
                            #if cell=="Peyer's patch":
                            #    cell="Peyers patch"
                            cell_name ='_'.join(((cell + "___" + assay).replace('(','').replace(')','')
                                                 .replace('-','__').replace('.','').replace("'","")).split())
                            header_line.append('"'+cell_name+'"')
                    #print(header_line)
                    scored_motifs_writefile.write('\t'.join(header_line) + '\n')
            if (run_in_parallel_param):
                os.system( """split -l 200000 {} {}""" .format(motifs_overlapping_tracks_file,motifs_overlapping_tracks_file+'_tmp'))
                motifs_overlapping_tracks_file_splitted = glob.glob(motifs_overlapping_tracks_file+'_tmp*')
                p = Pool(int(number_processes_to_run_in_parallel))
                p.starmap(score_motifs_per_cell, product(motifs_overlapping_tracks_file_splitted, 
                                      [normal_expression_per_tissue_origin_per_TF], 
                                      [matching_cell_name_representative_dict], 
                                      [motifTFName_TFNames_matches_dict], 
                                      [cells_assays_dict], 
                                      [cell_tfs], 
                                      [tf_cells], 
                                      [assay_cells_datatypes], 
                                      [index_track_names], 
                                      [index_motif_name]))
                p.close()
                p.join() 
                # append the scored chunks to the output file, then remove the temporary split files
                with open(scored_motifs_chromatin_tracks_output_file, 'a') as scored_motifs_writefile:
                    for f in motifs_overlapping_tracks_file_splitted:
                        print(f + '_scored')
                        with open(f + '_scored', 'r') as f_score_ifile:
                            for line in f_score_ifile:
                                scored_motifs_writefile.write(line)
                        os.remove(f)
                        os.remove(f + '_scored')
            else:
                score_motifs_per_cell(motifs_overlapping_tracks_file, 
                                      normal_expression_per_tissue_origin_per_TF, 
                                      matching_cell_name_representative_dict, 
                                      motifTFName_TFNames_matches_dict, 
                                      cells_assays_dict, 
                                      cell_tfs, 
                                      tf_cells, 
                                      assay_cells_datatypes, 
                                      index_track_names, 
                                      index_motif_name)
                
                    
        scored_motifs_overlapping_tracks_files.append(scored_motifs_chromatin_tracks_output_file)   
    print(scored_motifs_overlapping_tracks_files)
    return motifs_overlapping_tracks_files, scored_motifs_overlapping_tracks_files
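
The starmap call above uses itertools.product to pair every per-chromosome motif file with the same shared arguments: because each shared value is wrapped in a one-element list, the product yields exactly one argument tuple per motif file. A small self-contained sketch of that pattern, with an illustrative worker and made-up paths:

from itertools import product
from multiprocessing import Pool


def work(chrom_file, output_dir, tracks_dir, track_files):
    # Stand-in worker: one call per chromosome file, shared settings repeated.
    return (chrom_file, output_dir, tracks_dir, len(track_files))


if __name__ == '__main__':
    chrom_files = ['chr1.bed', 'chr2.bed', 'chrX.bed']
    shared_tracks = ['chr1.bed4', 'chr2.bed4', 'chrX.bed4']
    args = product(chrom_files, ['overlap_out'], ['tracks_dir'], [shared_tracks])
    with Pool(3) as p:
        print(p.starmap(work, args))
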
Пример #41
0
					# Establish iterators
					it1 = batch_iterator(FastqGeneralIterator(f1), n)
					it2 = batch_iterator(FastqGeneralIterator(f2), n)
					it3 = batch_iterator(FastqGeneralIterator(f3), n)

					# iterate over batches of length n
				
					for i, batch1 in enumerate(it1):
						batch2 = it2.__next__()
						batch3 = it3.__next__()
						output = o 
			
						# parallel process the barcode processing and accounting of failures.
						pool = Pool(processes=cpu)
						pm = pool.map(debarcode_trio, zip(batch1, batch2, batch3))
						pool.close()
			
						# Aggregate output
						fastq1 = [item[0] for item in pm]
						fastq2 = [item[1] for item in pm]

						# Export one chunk in parallel
						filename1 = output +'_R1.fastq.gz'
						filename2 = output +'_R2.fastq.gz'
			
						pool = Pool(processes=2)
						toke = pool.starmap(chunk_writer_gzip, [(filename1, fastq1), (filename2, fastq2)])
						pool.close()
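
The fragment above assumes a batch_iterator helper that slices an arbitrary iterator into fixed-size chunks so each chunk can be debarcoded in parallel. Its implementation is not shown here; a minimal sketch, modelled on the usual islice recipe and offered only as an assumption, could be:

from itertools import islice


def batch_iterator(iterator, batch_size):
    """Yield successive lists of up to batch_size records from any iterator."""
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            break
        yield batch
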
			
	
def main(magneticum_snap_directories,
         bahamas_snap_directories,
         multi_processing=False,
         addition=False,
         check_clusters=False,
         number_of_virtual_nodes=500,
         number_of_projections=26,
         exposure_time=5000.,
         redshift=0.20,
         plotting=0,
         cores=16,
         number_of_neighbours=6,
         move_to_front=None):
    yt.funcs.mylog.setLevel(40)  # Suppresses yt status output.
    soxs.utils.soxsLogger.setLevel(40)  # Suppresses soxs status output.
    pyxsim.utils.pyxsimLogger.setLevel(40)  # Suppresses pyxsim status output.

    # Define the directories containing the data
    if os.getcwd().split('/')[2] == 's2675544':
        base_data_dir = '/home/s2675544/data'
        print('Running on ALICE')
    else:
        base_data_dir = '/home/matthijs/Documents/Studie/Master_Astronomy/1st_Research_Project/Data'
        print('Running at home')

    my_magneticum_data_dir = os.path.join(base_data_dir, 'Magneticum/Box2_hr')
    my_bahamas_data_dir = os.path.join(base_data_dir, 'Bahamas')
    my_tf_records_dir = os.path.join(base_data_dir, 'tf_records')

    magneticum_snap_paths = [
        os.path.join(my_magneticum_data_dir, snap_dir)
        for snap_dir in magneticum_snap_directories
    ]
    bahamas_snap_paths = [
        os.path.join(my_bahamas_data_dir, snap_dir)
        for snap_dir in bahamas_snap_directories
    ]

    defect_clusters = {
        'snap_128': {
            'split': [109, 16, 72, 48],
            'photon_max': [53, 78],
            'too_small': []
        },
        'snap_132': {
            'split': [75, 50, 110, 18],
            'photon_max': [8, 52, 55, 93, 139, 289],
            'too_small': []
        },
        'snap_136': {
            'split': [75, 107, 52, 15],
            'photon_max': [96, 137, 51, 315, 216, 55, 102, 101, 20, 3],
            'too_small': []
        },
        'AGN_TUNED_nu0_L100N256_WMAP9': {
            'split': [],
            'photon_max': [3],
            'too_small':
            [4, 10] + list(set(np.arange(20, 200)) - {20, 21, 22, 28})
        },
        'AGN_TUNED_nu0_L400N1024_WMAP9': {
            'split': [62, 89, 108, 125, 130, 191],
            'photon_max': [],
            'too_small': []
        }
    }

    for snap_idx, snap_path in enumerate(magneticum_snap_paths +
                                         bahamas_snap_paths):
        print(f'Snapshot path : {snap_path}')
        snap_dir = os.path.basename(snap_path)

        if snap_dir[0:3] == 'AGN':
            cluster_dirs = glob.glob(os.path.join(snap_path, '*'))

            snapnum = 32
            gdata = g.Gadget(os.path.join(base_data_dir, snap_dir),
                             'subh',
                             snapnum,
                             sim='BAHAMAS')

            subhalo_ids = [
                int(id)
                for id in gdata.read_var('FOF/FirstSubhaloID', verbose=False)
            ]

            centers = gdata.read_var('Subhalo/CentreOfPotential',
                                     verbose=False)
            centers = centers[subhalo_ids[:-1]]
            # Convert to codelength by going from cm to Mpc and from Mpc to codelength
            centers /= gdata.cm_per_mpc / 0.7
        else:
            cluster_dirs = glob.glob(os.path.join(snap_path, '*/*/*'))
            centers = []

        tfrecord_dir = os.path.join(my_tf_records_dir,
                                    snap_dir + '_tf_records')

        print(f'Tensorflow records will be saved in : {tfrecord_dir}')
        print(f'Number of clusters : {len(cluster_dirs)}')

        bad_cluster_idx = defect_clusters[snap_dir]['split'] + \
                          defect_clusters[snap_dir]['too_small'] + \
                          defect_clusters[snap_dir]['photon_max']

        print(
            f'Number of viable clusters : {len(cluster_dirs) - len(bad_cluster_idx)}'
        )

        if check_clusters:
            if os.path.isdir(tfrecord_dir):
                tfrecords = glob.glob(os.path.join(tfrecord_dir, '*'))

                if os.path.isdir(tfrecords[0]):
                    tfrecords = glob.glob(os.path.join(tfrecord_dir, 'train', '*')) + \
                                glob.glob(os.path.join(tfrecord_dir, 'test', '*'))

                processed_cluster_idxs = []

                # Why does the downsample function not run if this is executed AND multiprocessing is on????
                existing_cluster_datasets = tf.data.TFRecordDataset(
                    tfrecords).map(
                        lambda record_bytes: existing_clusters(record_bytes))

                os.makedirs(os.path.join(base_data_dir, 'images', snap_dir),
                            exist_ok=True)

                for cluster_idx, projection_idx, image in tqdm(
                        iter(existing_cluster_datasets)):
                    processed_cluster_idxs.append(cluster_idx.numpy())
                    if projection_idx.numpy() < 3:
                        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
                        ax.imshow(image.numpy()[:, :, 0])
                        plt.savefig(
                            os.path.join(
                                base_data_dir, 'images', snap_dir, 'xray_' +
                                '{:03d}'.format(cluster_idx.numpy()) + '_' +
                                '{:02d}'.format(projection_idx.numpy()) +
                                '.png'))
                        plt.close(fig=fig)

                bad_cluster_dirs = []

                for cluster_dir in cluster_dirs:
                    if get_index(cluster_dir) in bad_cluster_idx:
                        bad_cluster_dirs.append(cluster_dir)

                for bad_cluster_dir in bad_cluster_dirs:
                    cluster_dirs.remove(bad_cluster_dir)

                for cluster_dir in sorted(cluster_dirs):
                    if processed_cluster_idxs.count(
                            get_index(cluster_dir)) != 26:
                        print(
                            'Unfinished cluster ', cluster_dir, ' : ',
                            processed_cluster_idxs.count(
                                get_index(cluster_dir)), ' images')

                print(f'Number of bad clusters: {len(bad_cluster_dirs)}')
                print(
                    f'Already processed : {len(set(processed_cluster_idxs))}')
                print(
                    f'Remaining cluster indices : {list(set([get_index(cluster) for cluster in cluster_dirs]) - set(processed_cluster_idxs))}'
                )
            else:
                print(
                    'Cannot check clusters because the tfrecord directory does not exist yet!'
                )
        else:
            bad_cluster_dirs = []

            # print(f'All cluster indices : {[get_index(cluster) for cluster in cluster_dirs]}')

            if move_to_front is not None:
                cluster_dirs.insert(
                    0,
                    cluster_dirs.pop([
                        get_index(cluster) for cluster in cluster_dirs
                    ].index(move_to_front)))
            if addition:
                # Remove clusters which are already processed, lie on a periodic boundary
                # or will take too long to process
                # good_cluster_idxs = [53, 78]  # snap_128
                # good_cluster_idxs = [139, 289, 1]  # snap_132
                # good_cluster_idxs = [0, 7, 28, 59, 2, 53, 46, 152]  # snap_132
                # AGN 400 remaining big clusters [0, 1, 2, 3, 4, 40]
                good_cluster_idxs = [0, 1, 2, 3, 4, 40]

                for cluster_dir in cluster_dirs:
                    if get_index(cluster_dir) not in good_cluster_idxs:
                        bad_cluster_dirs.append(cluster_dir)
            else:
                for cluster_dir in cluster_dirs:
                    if get_index(cluster_dir) in bad_cluster_idx:
                        bad_cluster_dirs.append(cluster_dir)

            for bad_cluster_dir in bad_cluster_dirs:
                cluster_dirs.remove(bad_cluster_dir)

            # print(f'Remaining cluster indices : {[get_index(cluster) for cluster in cluster_dirs]}')

            if multi_processing:
                params = [(cluster, tfrecord_dir, base_data_dir, cluster_dirs,
                           snap_dir, centers, number_of_projections,
                           exposure_time, redshift, number_of_virtual_nodes,
                           number_of_neighbours, plotting)
                          for cluster in cluster_dirs]

                pool = Pool(cores)
                pool.starmap(generate_data, params)
                pool.close()
                pool.join()
            else:
                for cluster in cluster_dirs:
                    generate_data(
                        cluster=cluster,
                        tfrecord_dir=tfrecord_dir,
                        base_data_dir=base_data_dir,
                        cluster_dirs=cluster_dirs,
                        snap_dir=snap_dir,
                        centers=centers,
                        number_of_projections=number_of_projections,
                        exp_time=exposure_time,
                        redshift=redshift,
                        number_of_virtual_nodes=number_of_virtual_nodes,
                        number_of_neighbours=number_of_neighbours,
                        plotting=plotting)
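
In the multiprocessing branch above, params holds one tuple of positional arguments per cluster, so pool.starmap(generate_data, params) behaves like calling generate_data(*p) for every tuple p, just spread across the worker processes. A generic sketch of that equivalence, with an illustrative stand-in worker:

from multiprocessing import Pool


def generate(item, out_dir, n_projections):
    # Placeholder worker standing in for generate_data above.
    return f'{item} -> {out_dir} ({n_projections} projections)'


if __name__ == '__main__':
    params = [(f'cluster_{i}', 'tfrecords', 26) for i in range(4)]
    with Pool(2) as pool:
        parallel = pool.starmap(generate, params)
    sequential = [generate(*p) for p in params]
    assert parallel == sequential
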
Пример #43
0
from multiprocessing import Pool


def el(row, col):
    # Dot product of one row of m1 with one column of m2.
    return sum(i * j for i, j in zip(row, col))


if __name__ == '__main__':
    m1 = [[1, 1],
          [2, 2]]

    m2 = [[3, 4],
          [3, 4]]

    # Columns of m2.
    m22 = list(zip(*m2))

    with Pool(3) as pool:
        # One task per (row, column) pair; the flat result is m1 @ m2 in row-major order.
        res = pool.starmap(el, [(row, col) for row in m1 for col in m22])

    # Reshape the flat list into the product matrix.
    M = [res[i * len(m22):(i + 1) * len(m22)] for i in range(len(m1))]
    print(M)
Пример #44
0
def getVersions(config):
    p = Pool(len(config['targets']))
    configs = [config for t in config['targets']]
    p.starmap(getVersion, zip(configs, config['targets']))
Пример #45
0
from itertools import repeat
import pandas as pd

nodes = search_algos.parse_graph()
# PARAMS
# iterative experiments
MAX_ITER = 10000
RATE = 0.004  # to be determined
N_CORES = 12

# GLS++ iterative
pool = Pool(12)
max_iters = [MAX_ITER] * N_CORES
# copies =  repeat(nodes.copy())
args = list(zip(repeat(nodes, 12), repeat(MAX_ITER, 12)))
MLS_iter_res12 = pool.starmap(search_algos.GLS_own_iter, args)
pool.close()
pool = Pool(13)
args = list(zip(repeat(nodes, 13), repeat(MAX_ITER, 13)))
MLS_iter_res13 = pool.starmap(search_algos.GLS_own_iter, args)
pool.close()
total = MLS_iter_res12 + MLS_iter_res13
results = []
for i, dict_ in enumerate(total):
    dict_['iteration'] = repeat(i, len(dict_[list(dict_.keys())[0]]))
    results.append(pd.DataFrame(dict_))
results_total = pd.concat(results)
results_total.to_csv('GLS_plusplus.csv')

del results_total, MLS_iter_res12, MLS_iter_res13, total, results
Пример #46
0
def getOEMUpdateStatuses(config):
    p = Pool(len(config['targets']))
    configs = [config for t in config['targets']]
    p.starmap(getOEMUpdateStatus, zip(configs, config['targets']))
Пример #47
0
class Tangle:
    def __init__(self, transactions, genesis):
        self.transactions = transactions
        self.genesis = genesis
        if current_process().name == 'MainProcess':
            os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
            self.process_pool = Pool(10)

    def add_transaction(self, tip):
        self.transactions[tip.name()] = tip

    def run_nodes(self, train_fn, clients, rnd, num_epochs=1, batch_size=10, malicious_clients=None, poison_type=PoisonType.NONE):
        norm_this_round = []
        new_transactions = []

        sys_metrics = {
            c.id: {BYTES_WRITTEN_KEY: 0,
                   BYTES_READ_KEY: 0,
                   LOCAL_COMPUTATIONS_KEY: 0} for c in clients}

        train_params = [[client.id, client.group, client.model.flops, random.randint(0, 4294967295), client.train_data, client.eval_data, self.name, (malicious_clients is not None and client.id in malicious_clients), poison_type] for client in clients]

        results = self.process_pool.starmap(train_fn, train_params)

        for tx, metrics, client_id, client_sys_metrics in results:
            if tx is None:
                continue

            sys_metrics[client_id][BYTES_READ_KEY] += client_sys_metrics[BYTES_READ_KEY]
            sys_metrics[client_id][BYTES_WRITTEN_KEY] += client_sys_metrics[BYTES_WRITTEN_KEY]
            sys_metrics[client_id][LOCAL_COMPUTATIONS_KEY] = client_sys_metrics[LOCAL_COMPUTATIONS_KEY]

            tx.tag = rnd
            new_transactions.append(tx)

        for tx in new_transactions:
            self.add_transaction(tx)

        return sys_metrics

    def test_model(self, test_fn, clients_to_test, set_to_use='test'):
        metrics = {}

        test_params = [[client.id, client.group, client.model.flops, random.randint(0, 4294967295), client.train_data, client.eval_data, self.name, set_to_use] for client in clients_to_test]
        results = self.process_pool.starmap(test_fn, test_params)

        for client, c_metrics in results:
            metrics[client] = c_metrics

        return metrics

    def save(self, tangle_name, global_loss, global_accuracy, norm):
        n = [{'name': t.name(), 'time': t.tag, 'malicious': t.malicious, 'parents': list(t.parents)} for _, t in self.transactions.items()]

        with open(f'tangle_data/tangle_{tangle_name}.json', 'w') as outfile:
            json.dump({'nodes': n, 'genesis': self.genesis, 'global_loss': global_loss, 'global_accuracy': global_accuracy, 'norm': norm}, outfile)

        self.name = tangle_name

    @classmethod
    def fromfile(cls, tangle_name):
        with open(f'tangle_data/tangle_{tangle_name}.json', 'r') as tanglefile:
            t = json.load(tanglefile)

        transactions = {n['name']: Transaction(None, set(n['parents']), n['name'], n['time'], n['malicious'] if 'malicious' in n else False) for n in t['nodes']}
        tangle = cls(transactions, t['genesis'])
        tangle.name = tangle_name
        return tangle
Пример #48
0
def updateBIOSfirmwares(config):
    p = Pool(len(config['targets']))
    configs = [config for t in config['targets']]
    p.starmap(updateBIOSfirmware, zip(configs, config['targets']))
Пример #49
0
def parallel_simu_help(mode,
                       num_obs,
                       p,
                       generating_mode,
                       individual_level=True,
                       num_iterations=50,
                       noise_type='G'):
    assert generating_mode in ['ma', 'var']
    print("now doing simulation with setting p = {}, mode = {}".format(
        p, mode))
    print("================")
    weights = fetch_weights(p, mode, generating_mode)
    stdev = 1
    span = fetch_span(num_obs, generating_mode)

    model_info = {}
    model_info['model'] = generating_mode
    model_info['weights'] = weights
    model_info['span'] = span
    model_info['stdev'] = stdev

    if generating_mode == 'ma':
        ts = generate_ma(weights,
                         num_obs=num_obs,
                         stdev=stdev,
                         noise_type=noise_type)
    elif generating_mode == 'var':
        ts = generate_mvar(weights,
                           num_obs=num_obs,
                           stdev=stdev,
                           noise_type=noise_type)

    arguments = list(
        zip(cycle([num_obs]), [model_info for _ in range(num_iterations)]))
    #print(arguments)

    iteration_pool = Pool(10)

    res = iteration_pool.starmap(evaluate_iteration, arguments)

    errs_dict_al,  errs_dict_th, errs_dict_so, errs_dict_sh, errs_dict_sm , precision_th, recall_th, F1_th, \
        precision_so, recall_so, F1_so, precision_al, recall_al, F1_al, true_spectral_norm_square = list(zip(*res))

    true_spectral_norm_square = true_spectral_norm_square[0]
    result = {}

    result['raw_error'] = {
        'al': errs_dict_al,
        'th': errs_dict_th,
        'so': errs_dict_so,
        'sh': errs_dict_sh,
        'sm': errs_dict_sm,
        'true': true_spectral_norm_square
    }
    result['precision'] = {
        'so': (np.mean(precision_so), np.std(precision_so)),
        'al': (np.mean(precision_al), np.std(precision_al)),
        'th': (np.mean(precision_th), np.std(precision_th))
    }
    result['recall'] = {
        'so': (np.mean(recall_so), np.std(recall_so)),
        'al': (np.mean(recall_al), np.std(recall_al)),
        'th': (np.mean(recall_th), np.std(recall_th))
    }
    result['F1'] = {
        'so': (np.mean(F1_so), np.std(F1_so)),
        'al': (np.mean(F1_al), np.std(F1_al)),
        'th': (np.mean(F1_th), np.std(F1_th))
    }

    append_relative_err(result)

    return result, simu_setting_2_str(p, mode)
Пример #50
0
def main(args=None):

    if args is None:
        # Top level arg parser
        parser = argparse.ArgumentParser()

        # Allow adding of subparsers - one for each model type
        subparsers = parser.add_subparsers(help='first argument should be the model to use e.g. BIMPM')

        # Add the parsers for the different model types
        ms = [i for i in dir(m_args) if '__' not in i]
        ms = [i for i in ms if i != 'str2bool']

        for m in ms:
            p = subparsers.add_parser(m.split('_args')[0])
            p = getattr(m_args, m)(p)

        args = parser.parse_args()

    kwargs = {}

    # Match problems to the loss fns they'll be trained with
    prob_losses = probs_to_losses(args.problems, args.loss_fn, args.loss_weights)

    vocab = None

    # Prep all the datasets
    prob_iterators = {}

    # Temporarily store the preprocessed data iterators
    temp_store = './data/data_temp/' + args.save_id + '/'
    if os.path.exists(temp_store):
        shutil.rmtree(temp_store)
    os.makedirs(temp_store)

    v_load = True

    for prob, dd in prob_losses.items():

        if problems[prob]['preload']:

            prob_temp = temp_store + prob + '/'

            # Read in the datasets - run in separate process so can free up memory when done
            print('Reading datasets and building vocab for {} problem...'.format(prob))
            p = Pool(processes=1)
            v_load = p.starmap(prep_data, [(args, prob, dd, vocab, False, prob_temp, temp_store)])
            p.close()

            v_load = v_load[0]

            # Load the vocab object as too large to pass back from multiprocess
            if v_load:
                vocab = joblib.load(temp_store + '/vocab')

            # Read the val iterator into memory and pass list of train iterator names
            fs = os.listdir(prob_temp)
            prob_iterators[prob] = {}
            prob_iterators[prob]['train'] = [prob_temp + t for t in fs if 'train' in t]
            if 'val' in fs:
                prob_iterators[prob]['val'] = joblib.load(prob_temp + 'val')

        else:
            base = './data/problems/' + prob + '/data/'
            prob_iterators[prob] = {'train': [base + f for f in os.listdir(base)]}

        if dd['ent']:
            setattr(args, 'num_ent_types', problems[prob]['ent_types'])

    if v_load:
        args.vocab_size = len(vocab.word_to_index)

    # Save the args and the vocab object
    if args.save:
        save_path = './saved_models/' + '_'.join(args.problems) + '/' + args.model + '/' + args.save_id + '/'
        os.makedirs(save_path, exist_ok=True)
        args.save_path = save_path
        # Save the params
        pickle.dump(args, open(save_path + 'args.p', 'wb'))
        # Save the vocab
        if v_load:
            pickle.dump(vocab, open(save_path + 'vocab.p', 'wb'))

    # Create the model
    model_class = getattr(models, args.model)

    print('Building model...')
    model = model_class(args, vocab, **kwargs)

    if args.gpu:
        model.cuda()

    # Train the model
    print('Training...')
    r = trainer(model, prob_iterators, args, vocab, prob_losses)

    return r
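
The comment above ('run in separate process so can free up memory when done') describes a common trick: push a memory-hungry preprocessing step into a single-worker Pool so that everything it allocates is released when the worker exits, and keep only the small return value (plus whatever was written to disk). A minimal sketch of that pattern, with made-up file names and a fake workload:

from multiprocessing import Pool


def heavy_preprocess(path, out_path):
    # Pretend this loads a large dataset, writes the processed result
    # to out_path, and returns only a small flag.
    big = list(range(1_000_000))
    with open(out_path, 'w') as f:
        f.write(str(len(big)))
    return True


if __name__ == '__main__':
    with Pool(processes=1) as p:
        ok = p.starmap(heavy_preprocess, [('corpus.txt', 'corpus.processed')])[0]
    # The worker process has exited here, so its memory is back with the OS.
    print(ok)
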
Пример #51
0
class S2Ranker:
    """A class to encapsulate the Semantic Scholar search ranker.

    Arguments:
        data_dir {str} -- where the language models and lightgbm model live.
        use_posthoc_correction {bool} -- whether to use posthoc correction
    """
    def __init__(self, data_dir, use_posthoc_correction=True):
        self.use_posthoc_correction = use_posthoc_correction
        self.data_dir = data_dir
        with open(os.path.join(data_dir, 'lightgbm_model.pickle'), 'rb') as f:
            self.model = pickle.load(f)

        self.pool = Pool(cpu_count())

    def score(self, query, papers):
        """Score each pair of (query, paper) for all papers

        Arguments:
            query {str} -- plain text search query 
            papers {list of dicts} -- A list of candidate papers, each of which
                                      is a dictionary.

        Returns:
            scores {np.array} -- an array of scores, one per paper in papers
        """
        query = str(query)
        #presults = [self.prepare_result(paper) for paper in papers]
        presults = self.pool.map(self.prepare_result, papers)
        args = [(query, pr) for pr in presults]
        feats = self.pool.starmap(make_features, args)
        X = np.array(feats)

        scores = self.model.predict(X)
        if self.use_posthoc_correction:
            scores = posthoc_score_adjust(scores, X, query)
        return scores

    @classmethod
    def prepare_result(cls, paper):
        """Prepare the raw text result for featurization

        Arguments:
            paper {dict} -- A dictionary that has the required paper fields:
                            'title', 'abstract', 'authors', 'venue', 'year',
                            'n_citations', 'n_key_citations'
        Returns:
            out {dict} -- A dictionary where the paper fields have been pre-processed.
        """
        out = {'paper_year': paper.get('year', np.nan)}
        out['n_citations'] = paper.get('n_citations', 0)
        # if n_key_citations aren't available, we can get a quick estimate of what they are from the n_citations
        out['n_key_citations'] = paper.get(
            'n_key_citations', int(-1.4 + np.log1p(out['n_citations'])))
        if out['n_key_citations'] < 0:
            out['n_key_citations'] = 0
        out['paper_title_cleaned'] = fix_text(paper.get('title', ''))
        out['paper_abstract_cleaned'] = fix_text(paper.get('abstract', ''))
        out['paper_venue_cleaned'] = fix_text(paper.get('venue', ''))
        out['author_name'] = [
            fix_author_text(i) for i in paper.get('authors', [])
        ]
        return out
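
A hypothetical usage sketch of the class above. It assumes data_dir points at a directory containing the pickled LightGBM model and language-model files the ranker expects, and that make_features and posthoc_score_adjust are importable alongside it; the query and the paper record are made up:

ranker = S2Ranker('/path/to/s2_ranker_data')   # illustrative path
papers = [{
    'title': 'An Example Paper About Search Ranking',
    'abstract': 'We describe a toy abstract used only for illustration.',
    'authors': ['Ada Lovelace', 'Alan Turing'],
    'venue': 'Example Conference',
    'year': 2020,
    'n_citations': 12,
}]
print(ranker.score('search ranking', papers))   # one score per paper
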
            cv2.RETR_TREE, \
            cv2.CHAIN_APPROX_SIMPLE)
    mask_shape_lv_ = img.shape
    print(mask_shape_lv_)
    tissue_mask_lv_ = make_tumor_mask(mask_shape_lv_, contours_tissue_lv_)
    print('at save')
    cv2.imwrite(save_location, tissue_mask_lv_)


if __name__ == '__main__':
    tiff_path = './data/svs/'
    print('start')
    t1 = time.time()
    save_location = './data/tissue_mask/'
    print('start')
    list_file_name = [f for f in listdir(tiff_path)]
    opt_list = []
    for i, file_name in enumerate(list_file_name):
        if file_name.split('.')[-1] == 'svs':
            cur_name = file_name.split('.svs')[0]
            file_path = tiff_path + file_name
            img = choose_level(file_path)
            save_path = save_location + cur_name + '_tissue_mask_' + '64.png'
            opt_list.append((img, save_path))
    pool = Pool(5)
    pool.starmap(make_mask, opt_list)
    pool.close()
    pool.join()
    t2 = time.time()
    print((t2 - t1) / 60)
def extract(a_range, a_dirs, a_tensor, size):
    for i in a_range:
        a_data = np.load(DATASET_POSITIVE_CHIRPS_PATH + a_dirs[i] +
                         '/X.npy').astype(np.float32)
        a_data = F.interpolate(torch.tensor(a_data), size=[224, 224])
        try:
            a_tensor[i % size] = a_data
        except Exception:
            # Skip samples whose shape does not fit the preallocated tensor.
            continue


if __name__ == '__main__':
    MAX_CORES = os.cpu_count()
    PATH_TO_BUCKET = '../../data-step/processed_data/'
    p = Pool(MAX_CORES)

    p_args = []

    b = size * int(sys.argv[1])

    for i in range(MAX_CORES):
        s = b + i * (size // MAX_CORES)
        e = b + (i + 1) * (size // MAX_CORES)
        p_args.append((np.arange(s, e), dirs, tensor, size))

    p.starmap(extract, p_args)
    p.close()

    torch.save(tensor, PATH_TO_BUCKET + f'positive/chirps/chirps{b}.pt')
Пример #54
0
def test_calc_event_time_loop(n_proc):
    t0 = time.time()

    test_data_dir = '/u/casey/scratch/work/microlens/galaxia_test/OGLE672/'
    hdf5_file = test_data_dir + 'OGLE672.h5'
    obs_time = 1000
    n_obs = 101
    radius_cut = 2
    theta_frac = 2
    blend_rad = 0.8

    # Initialize events_tmp and blends_tmp.
    events_tmp = None
    blends_tmp = None

    # Get the l and b from the HDF5 file.
    hf = h5py.File(hdf5_file, 'r')
    l_array = np.array(hf['long_bin_edges'])
    b_array = np.array(hf['lat_bin_edges'])
    hf.close()

    # Converts radius_cut from arcseconds into milliarcseconds
    radius_cut *= 1000.0

    # Set up the multiprocessing
    pool = Pool(n_proc)

    # Set up inputs to be able to be read by pool.starmap.
    # Only do a few of the bins.
    nll = range(10, 18, 1)
    nbb = range(10, 11, 1)

    llbb = itertools.product(nll, nbb)

    reps = len(nll) * len(nbb)

    hd = itertools.repeat(hdf5_file, reps)
    ot = itertools.repeat(obs_time, reps)
    no = itertools.repeat(n_obs, reps)
    rc = itertools.repeat(radius_cut, reps)
    tf = itertools.repeat(theta_frac, reps)
    br = itertools.repeat(blend_rad, reps)

    inputs = zip(llbb, hd, ot, no, rc, tf, br)

    ##########
    # Loop through galactic latitude and longitude bins. For each bin vertex, take
    # the nearest 4 bin samples and calculate microlensing events. We do this
    # to properly handle bin edges (i.e. a sliding window analysis of 2x2 bins).
    # Duplicate events are removed.
    ##########
    # map_async? Make it wait till they are all done.
    results = pool.starmap(synthetic._calc_event_time_loop, inputs)

    pool.close()
    pool.join()

    results = [i for i in results if i is not None]

    # Is there a way for this to NOT involve a loop?????
    if len(results) > 0:
        for ii in range(len(results)):
            if events_tmp is not None:
                events_tmp = np.concatenate((events_tmp, results[ii][0]),
                                            axis=1)
            else:
                events_tmp = results[ii][0]

        for ii in range(len(results)):
            if blends_tmp is not None:
                blends_tmp = np.concatenate((blends_tmp, results[ii][1]),
                                            axis=1)
            else:
                blends_tmp = results[ii][1]

    t1 = time.time()

    runtime = t1 - t0

    return runtime
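
The itertools.repeat calls above broadcast the scalar settings so that zip can line one copy of each up with every (l, b) bin pair before the tuples are handed to starmap. A compact, self-contained sketch of that broadcasting pattern, with an illustrative worker and values:

import itertools
from multiprocessing import Pool


def task(bin_pair, hdf5_file, obs_time):
    # Stand-in for synthetic._calc_event_time_loop: one call per bin pair.
    return (bin_pair, hdf5_file, obs_time)


if __name__ == '__main__':
    bins = list(itertools.product(range(3), range(2)))       # all (l, b) pairs
    reps = len(bins)
    inputs = zip(bins,
                 itertools.repeat('OGLE672.h5', reps),
                 itertools.repeat(1000, reps))
    with Pool(2) as pool:
        print(pool.starmap(task, inputs))
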
def main():
    config = parseArgs()
    print(f'config >{config}<')
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    #print(f'pickle-files we use to build >{pickle_files}<')
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    print('Building return-type dict, vocabulary and max-sequence-length')

    p = Pool(nr_of_cpus)

    pickle_files = [config['save_dir'] + "/" + f for f in pickle_files]

    star_list = zip(pickle_files, repeat(config))

    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ret_set = set()
    vocab = set()
    seq_length = 0

    ## put all partial results together
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        if seq_length1 > seq_length:
            seq_length = seq_length1


#     ret_set = set()
#     vocab = set()
#     seq_length = 0
#     counter = 1
#     pickle_count = len(pickle_files)
#
#     for file in pickle_files:
#         print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
#         counter += 1
#         cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
#         for item in cont:
#             #print(f'item-1 >{item[1]}<')
#             ## build ret-type-dict
#             ret_set.add(item[1])
#
#             ##build max-seq-length
#             if len(item[0]) > seq_length:
#                 seq_length = len(item[0])
#
#             ## build vocabulary
#             for word in item[0].split():
#                 vocab.add(word)

    print(
        f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<"
    )
    ## build ret-type-dict and save
    ret_type_dict = dict()
    counter = 0
    ret_set_list = sorted(ret_set)
    for elem in ret_set_list:
        ret_type_dict[elem] = counter
        counter += 1

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"nr-of-args >{key}<  label >{ret_type_dict[key]}<")

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run build_balanced_dataset.py next")
Пример #56
0
    global_list_minus_dif = manager.list()
    global_list_plus_difc = manager.list()
    global_list_minus_difc = manager.list()

    global_list_ezk_plus_sim = manager.list()
    global_list_ezk_minus_sim = manager.list()
    global_list_ezk_plus_simc = manager.list()
    global_list_ezk_minus_simc = manager.list()
    global_list_ezk_plus_dif = manager.list()
    global_list_ezk_minus_dif = manager.list()
    global_list_ezk_plus_difc = manager.list()
    global_list_ezk_minus_difc = manager.list()

    # main function for hypotheses generation:
    p = Pool(32)
    p.starmap(extensions_apply, arguments)
    p.close()
    p.join()

    # now we need only 8 processes: no extensions (4x), and we only process lists of each sign (8):
    arguments2 = [(x, y) for x in signs for y in strategies]

    p = Pool(8)
    p.starmap(lists_analysis, arguments2)
    p.close()
    p.join()

    # now to exclude overlapping hypotheses (only in strategies without counter-examples):
    p = Pool(2)
    async_eliminate_overap = p.map_async(eliminate_overap, ['sim', 'dif'])
    p.close()
Пример #57
0
def eval_mot(results,
             annotations,
             logger=None,
             classes=None,
             iou_thr=0.5,
             ignore_iof_thr=0.5,
             ignore_by_classes=False,
             nproc=4):
    """Evaluation CLEAR MOT metrics.

    Args:
        results (list[list[list[ndarray]]]): The first list indicates videos,
            the second list indicates images, and the third list indicates
            categories. Each ndarray holds the tracking results of one category.
        annotations (list[list[dict]]): The first list indicates videos and
            the second list indicates images. Each dict holds the annotations
            of one image. Keys of annotations are

            - `bboxes`: numpy array of shape (n, 4)
            - `labels`: numpy array of shape (n, )
            - `instance_ids`: numpy array of shape (n, )
            - `bboxes_ignore` (optional): numpy array of shape (k, 4)
            - `labels_ignore` (optional): numpy array of shape (k, )
        logger (logging.Logger | str | None, optional): The way to print the
            evaluation results. Defaults to None.
        classes (list, optional): Classes in the dataset. Defaults to None.
        iou_thr (float, optional): IoU threshold for evaluation.
            Defaults to 0.5.
        ignore_iof_thr (float, optional): Iof threshold to ignore results.
            Defaults to 0.5.
        ignore_by_classes (bool, optional): Whether ignore the results by
            classes or not. Defaults to False.
        nproc (int, optional): Number of the processes. Defaults to 4.

    Returns:
        dict[str, float]: Evaluation results.
    """
    print_log('---CLEAR MOT Evaluation---', logger)
    t = time.time()
    gts = annotations.copy()
    if classes is None:
        classes = [i + 1 for i in range(len(results[0]))]
    assert len(results) == len(gts)
    metrics = METRIC_MAPS.keys()

    print_log('Accumulating...', logger)

    pool = Pool(nproc)
    accs = pool.starmap(
        acc_single_video,
        zip(results, gts, [iou_thr for _ in range(len(gts))],
            [ignore_iof_thr for _ in range(len(gts))],
            [ignore_by_classes for _ in range(len(gts))]))
    names, accs, items = aggregate_accs(accs, classes)
    print_log('Evaluating...', logger)
    eval_results = pd.DataFrame(columns=metrics)
    summaries = pool.starmap(eval_single_class, zip(names, accs))
    pool.close()

    # category and overall results
    for i, item in enumerate(items):
        eval_results.loc[item] = summaries[i]

    dtypes = {m: type(d) for m, d in zip(metrics, summaries[0])}
    # average results
    avg_results = []
    for i, m in enumerate(metrics):
        v = np.array([s[i] for s in summaries[:len(classes)]])
        v = np.nan_to_num(v, nan=0)
        if dtypes[m] == int:
            avg_results.append(int(v.sum()))
        elif dtypes[m] == float:
            avg_results.append(float(v.mean()))
        else:
            raise TypeError()
    eval_results.loc['AVERAGE'] = avg_results
    eval_results = eval_results.astype(dtypes)

    print_log('Rendering...', logger)
    strsummary = mm.io.render_summary(
        eval_results,
        formatters=mm.metrics.create().formatters,
        namemap=METRIC_MAPS)

    print_log('\n' + strsummary, logger)
    print_log(f'Evaluation finishes with {(time.time() - t):.2f} s.', logger)

    eval_results = eval_results.to_dict()
    out = {METRIC_MAPS[k]: v['OVERALL'] for k, v in eval_results.items()}
    for k, v in out.items():
        out[k] = float(f'{(v):.3f}') if isinstance(v, float) else int(f'{v}')
    for m in ['OVERALL', 'AVERAGE']:
        out[f'track_{m}_copypaste'] = ''
        for k in METRIC_MAPS.keys():
            v = eval_results[k][m]
            v = f'{(v):.3f} ' if isinstance(v, float) else f'{v} '
            out[f'track_{m}_copypaste'] += v

    return out
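
For reference, a minimal hypothetical input pair matching the nesting described in the docstring above (one video, one image, one category, one track). The exact column layout of the result array is an assumption and ultimately depends on what acc_single_video expects:

import numpy as np

# One video -> one image -> one category.
# Assumed result row layout: [instance_id, x1, y1, x2, y2, score].
results = [            # videos
    [                  # images
        [np.array([[1, 10.0, 10.0, 50.0, 50.0, 0.9]])]   # categories
    ]
]

annotations = [        # videos
    [                  # images
        {
            'bboxes': np.array([[10.0, 10.0, 50.0, 50.0]]),
            'labels': np.array([0]),
            'instance_ids': np.array([1]),
        }
    ]
]
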
Пример #58
0
def eval_map(det_results,
             annotations,
             scale_ranges=None,
             iou_thr=0.5,
             dataset=None,
             logger=None,
             nproc=4):
    """Evaluate mAP of a dataset.

    Args:
        det_results (list[list]): [[cls1_det, cls2_det, ...], ...].
            The outer list indicates images, and the inner list indicates
            per-class detected bboxes.
        annotations (list[dict]): Ground truth annotations where each item of
            the list indicates an image. Keys of annotations are:

            - `bboxes`: numpy array of shape (n, 4)
            - `labels`: numpy array of shape (n, )
            - `bboxes_ignore` (optional): numpy array of shape (k, 4)
            - `labels_ignore` (optional): numpy array of shape (k, )
        scale_ranges (list[tuple] | None): Range of scales to be evaluated,
            in the format [(min1, max1), (min2, max2), ...]. A range of
            (32, 64) means the area range between (32**2, 64**2).
            Default: None.
        iou_thr (float): IoU threshold to be considered as matched.
            Default: 0.5.
        dataset (list[str] | str | None): Dataset name or dataset classes,
            there are minor differences in metrics for different datasets, e.g.
            "voc07", "imagenet_det", etc. Default: None.
        logger (logging.Logger | str | None): The way to print the mAP
            summary. See `mmdet.utils.print_log()` for details. Default: None.
        nproc (int): Processes used for computing TP and FP.
            Default: 4.

    Returns:
        tuple: (mAP, [dict, dict, ...])
    """
    assert len(det_results) == len(annotations)

    num_imgs = len(det_results)
    num_scales = len(scale_ranges) if scale_ranges is not None else 1
    num_classes = len(det_results[0])  # positive class num
    area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
                   if scale_ranges is not None else None)

    pool = Pool(nproc)
    eval_results = []
    for i in range(num_classes):
        # get gt and det bboxes of this class
        cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
            det_results, annotations, i)
        # choose proper function according to datasets to compute tp and fp
        if dataset in ['det', 'vid']:
            tpfp_func = tpfp_imagenet
        else:
            tpfp_func = tpfp_default
        # compute tp and fp for each image with multiple processes
        tpfp = pool.starmap(
            tpfp_func,
            zip(cls_dets, cls_gts, cls_gts_ignore,
                [iou_thr for _ in range(num_imgs)],
                [area_ranges for _ in range(num_imgs)]))
        tp, fp = tuple(zip(*tpfp))
        # calculate gt number of each scale
        # ignored gts or gts beyond the specific scale are not counted
        num_gts = np.zeros(num_scales, dtype=int)
        for j, bbox in enumerate(cls_gts):
            if area_ranges is None:
                num_gts[0] += bbox.shape[0]
            else:
                gt_areas = (bbox[:, 2] - bbox[:, 0]) * (bbox[:, 3] -
                                                        bbox[:, 1])
                for k, (min_area, max_area) in enumerate(area_ranges):
                    num_gts[k] += np.sum((gt_areas >= min_area)
                                         & (gt_areas < max_area))
        # sort all det bboxes by score, also sort tp and fp
        cls_dets = np.vstack(cls_dets)
        num_dets = cls_dets.shape[0]
        sort_inds = np.argsort(-cls_dets[:, -1])
        tp = np.hstack(tp)[:, sort_inds]
        fp = np.hstack(fp)[:, sort_inds]
        # calculate recall and precision with tp and fp
        tp = np.cumsum(tp, axis=1)
        fp = np.cumsum(fp, axis=1)
        eps = np.finfo(np.float32).eps
        recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
        precisions = tp / np.maximum((tp + fp), eps)
        # calculate AP
        if scale_ranges is None:
            recalls = recalls[0, :]
            precisions = precisions[0, :]
            num_gts = num_gts.item()
        mode = 'area' if dataset != 'voc07' else '11points'
        ap = average_precision(recalls, precisions, mode)
        eval_results.append({
            'num_gts': num_gts,
            'num_dets': num_dets,
            'recall': recalls,
            'precision': precisions,
            'ap': ap
        })
    pool.close()
    if scale_ranges is not None:
        # shape (num_classes, num_scales)
        all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
        all_num_gts = np.vstack(
            [cls_result['num_gts'] for cls_result in eval_results])
        mean_ap = []
        for i in range(num_scales):
            if np.any(all_num_gts[:, i] > 0):
                mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
            else:
                mean_ap.append(0.0)
    else:
        aps = []
        for cls_result in eval_results:
            if cls_result['num_gts'] > 0:
                aps.append(cls_result['ap'])
        mean_ap = np.array(aps).mean().item() if aps else 0.0

    print_map_summary(mean_ap,
                      eval_results,
                      dataset,
                      area_ranges,
                      logger=logger)

    return mean_ap, eval_results
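
A hypothetical end-to-end call of the function above, assuming eval_map and its helpers (get_cls_results, tpfp_default, average_precision, print_map_summary) are importable from the same module. The boxes are made up, and the 0-based label convention is an assumption that has to match get_cls_results:

import numpy as np

# One image, one class: detections are (n, 5) arrays [x1, y1, x2, y2, score];
# ground truth is a per-image dict with 'bboxes' (n, 4) and 'labels' (n,).
det_results = [
    [np.array([[10.0, 10.0, 50.0, 50.0, 0.95]])]   # per-class dets for image 0
]
annotations = [
    {'bboxes': np.array([[12.0, 12.0, 48.0, 48.0]]),
     'labels': np.array([0])}
]

mean_ap, per_class_results = eval_map(det_results, annotations, iou_thr=0.5, nproc=1)
print(mean_ap)
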
def validate(dataset, strategy, ranker, reader, params=None):
    if params is not None:
        global args
        args = params

    def dist_forward_ranker_step(input_ids):
        print("This function is tracing")

        def step_fn(input_ids):
            input_ids = tf.reshape(input_ids, [-1, args.max_sequence_length])
            attention_mask = tf.cast(input_ids > 0, dtype=tf.int32)

            rank_logits = ranker(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 training=False)

            return tf.reshape(rank_logits, [args.batch_size, -1])

        per_replica_logits = strategy.run(step_fn, args=(input_ids, ))
        return per_replica_logits

    def dist_forward_reader_step(input_ids):
        print("This function is tracing")

        def step_fn(input_ids):
            attention_mask = tf.cast(input_ids > 0, dtype=tf.int32)

            start_logits, end_logits = reader(input_ids=input_ids,
                                              attention_mask=attention_mask,
                                              training=False)

            return start_logits, end_logits

        per_replica_results = strategy.run(step_fn, args=(input_ids, ))
        return per_replica_results

    if not args.disable_tf_function:
        dist_forward_ranker_step = tf.function(dist_forward_ranker_step)
        dist_forward_reader_step = tf.function(dist_forward_reader_step)

    def value_fn_template(ctx, pool_tensors):
        return pool_tensors[ctx.replica_id_in_sync_group]

    processes = ProcessPool(processes=os.cpu_count())
    tokenizer = get_tokenizer(model_name=args.pretrained_model,
                              prefix=args.prefix)

    get_best_span_partial = partial(get_best_span,
                                    max_answer_length=args.max_answer_length,
                                    tokenizer=tokenizer)

    iterator = iter(dataset)
    em_hits = []
    match_stats = []
    for element in tqdm(iterator):
        answers_serialized = element['answers']
        question = element['question']
        input_ids = element[
            'passages/sequence_ids']  # bsz x num_passages x max_sequence_length
        passage_offsets = element[
            'passages/passage_offset']  # bsz x num_passages

        reduced_input_ids = tf.concat(input_ids.values, axis=0)
        per_replica_passage_offsets = strategy.experimental_local_results(
            passage_offsets)

        global_batch_size = reduced_input_ids.shape[0]
        if global_batch_size < args.batch_size * strategy.num_replicas_in_sync:
            # TODO: add code in case batch is not divisible
            aggregated_input_ids = tf.concat(input_ids.values, axis=0)
            padded_size = args.batch_size * strategy.num_replicas_in_sync - global_batch_size
            padded_input_ids = tf.zeros(
                [padded_size, args.max_passages, args.max_sequence_length],
                dtype=tf.int32)
            input_ids = tf.concat([aggregated_input_ids, padded_input_ids],
                                  axis=0)
            pool_input_ids = tf.split(
                input_ids,
                num_or_size_splits=strategy.num_replicas_in_sync,
                axis=0)
            value_fn_for_input_ids = partial(value_fn_template,
                                             pool_tensors=pool_input_ids)
            input_ids = strategy.experimental_distribute_values_from_function(
                value_fn_for_input_ids)

            aggregated_per_replica_passage_offsets = tf.concat(
                per_replica_passage_offsets, axis=0)
            lack_size = (args.batch_size * strategy.num_replicas_in_sync
                         - aggregated_per_replica_passage_offsets.shape[0])
            padded_per_replica_passage_offsets = tf.zeros(
                [lack_size, args.max_passages], dtype=tf.int32)
            per_replica_passage_offsets = tf.concat(
                [aggregated_per_replica_passage_offsets,
                 padded_per_replica_passage_offsets], axis=0)
            per_replica_passage_offsets = tf.split(
                per_replica_passage_offsets,
                num_or_size_splits=strategy.num_replicas_in_sync)

        rank_logits = dist_forward_ranker_step(input_ids)
        rank_logits = strategy.experimental_local_results(rank_logits)
        selected_passage_idxs = [
            tf.cast(tf.argmax(logits, axis=-1), dtype=tf.int32)
            for logits in rank_logits
        ]  # index of the top-ranked passage per example: num_replicas x batch_size

        selected_passage_offsets = []
        per_replica_input_ids = strategy.experimental_local_results(
            input_ids
        )  # num_replicas x batch_size x max_passages x max_sequence_length
        selected_input_ids = []

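        # For each replica, gather the token ids and passage offset of its
        # top-ranked passage for every example in the batch.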
        for sequence_ids, psg_offsets, passage_idxs in zip(
                per_replica_input_ids, per_replica_passage_offsets,
                selected_passage_idxs):
            range_idxs = tf.range(sequence_ids.shape[0], dtype=tf.int32)
            indices = tf.concat([
                tf.expand_dims(range_idxs, axis=1),
                tf.expand_dims(passage_idxs, axis=1)
            ], axis=1)
            selected_passage_offsets.append(tf.gather_nd(psg_offsets, indices))
            selected_input_ids.append(tf.gather_nd(sequence_ids, indices))

        value_fn = partial(value_fn_template, pool_tensors=selected_input_ids)
        dist_selected_input_ids = strategy.experimental_distribute_values_from_function(
            value_fn)

        start_logits, end_logits = dist_forward_reader_step(
            input_ids=dist_selected_input_ids)
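        # Gather the selected passages from all replicas, strip padding tokens,
        # and keep only the real (un-padded) examples.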
        sentence_ids = tf.concat(dist_selected_input_ids.values, axis=0)
        sentence_ids = tf.RaggedTensor.from_tensor(
            sentence_ids, padding=tokenizer.pad_token_id)
        sentence_ids = sentence_ids.to_list()
        sentence_ids = sentence_ids[:global_batch_size]
        selected_passage_offsets = tf.concat(selected_passage_offsets, axis=0)
        selected_passage_offsets = selected_passage_offsets[:global_batch_size]
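        # Slice off the tokens before each passage's offset so only the
        # passage (context) tokens remain.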
        ctx_ids = [
            ids[offset:]
            for ids, offset in zip(sentence_ids, selected_passage_offsets)
        ]

        start_logits = tf.concat(start_logits.values, axis=0)
        start_logits = start_logits.numpy().tolist()
        start_logits = start_logits[:global_batch_size]
        start_logits = [
            logits[offset:offset + len(ctx)] for logits, offset, ctx in zip(
                start_logits, selected_passage_offsets, ctx_ids)
        ]
        end_logits = tf.concat(end_logits.values, axis=0)
        end_logits = end_logits.numpy().tolist()
        end_logits = end_logits[:global_batch_size]
        end_logits = [
            logits[offset:offset + len(ctx)] for logits, offset, ctx in zip(
                end_logits, selected_passage_offsets, ctx_ids)
        ]

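        # Compute the best answer span for each example in parallel on the CPU process pool.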
        best_spans = processes.starmap(get_best_span_partial,
                                       zip(start_logits, end_logits, ctx_ids))

        answers_serialized = tf.concat(answers_serialized.values, axis=0)
        question = tf.concat(question.values, axis=0)

        answers = []
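        # Deserialize the ground-truth answer strings for each question.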
        for ans in answers_serialized:
            ans_sparse = tf.io.parse_tensor(ans, out_type=tf.string)
            ans_values = tf.io.parse_tensor(ans_sparse[1], out_type=tf.string)
            ans_values = [answer.numpy().decode() for answer in ans_values]
            answers.append(ans_values)

        question = question.numpy().tolist()
        question = [q.decode() for q in question]

        hits = processes.starmap(compare_spans, zip(answers, best_spans))
        passages = [tokenizer.decode(ids) for ids in ctx_ids]

        selected_passage_idxs = tf.concat(selected_passage_idxs, axis=0)
        selected_passage_idxs = selected_passage_idxs.numpy().tolist()

        stats = [{
            "question": q,
            "answers": ans,
            "passage": psg,
            "predicted": span,
            "retriever_rank": idx + 1,
            "hit": hit
        } for q, ans, span, idx, psg, hit in zip(
            question, answers, best_spans, selected_passage_idxs, passages,
            hits)]
        match_stats.extend(stats)
        em_hits.extend(hits)

    print("done")
    print("-----------------------------------------------------------")
    return em_hits, match_stats
Example #60
0
def main(gt_folder: Path, prediction_folder: Path, original_images: str,
         output_path: Path, no_visualization: bool, eval_tool: Path,
         processes: int):
    # Select the number of worker processes
    if processes == 0:
        pool = Pool(processes=cpu_count())
    else:
        pool = Pool(processes=processes)

    # Get the paths for all gt files
    gt_files_path = get_file_list(gt_folder, EXTENSIONS_PATTERNS)

    # Get the paths for all prediction images
    prediction_files_path = get_file_list(prediction_folder,
                                          EXTENSIONS_PATTERNS)

    # Check if we have the same amount of gt and prediction files
    assert len(gt_files_path) == len(prediction_files_path), \
        "Number of gt files and prediction files differ."

    # Get the paths of original images if set
    if original_images is not None:
        original_images = Path(original_images)
        original_files_path = get_file_list(Path(original_images),
                                            EXTENSIONS_PATTERNS)
    else:
        original_files_path = [None] * len(gt_files_path)

    # Timer
    tic = time.time()

    # Debugging purposes only!
    # input_images = [input_images[1]]
    # gt_xml = [gt_xml[1]]
    # gt_pxl = [gt_pxl[1]]

    # input_xml = input_xml[0:3]
    # gt_xml = gt_xml[0:3]
    # gt_pxl = gt_pxl[0:3]

    # Evaluate each prediction / ground-truth pair in parallel
    results = list(
        pool.starmap(
            evaluate,
            zip(prediction_files_path, gt_files_path,
                itertools.repeat(output_path), itertools.repeat(eval_tool),
                original_files_path, itertools.repeat(no_visualization))))
    pool.close()
    print("Pool closed)")

    scores = []
    errors = []

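    # Separate successful scores from failed evaluations.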
    for item in results:
        if item[0] is not None:
            scores.append(item[0])
        else:
            errors.append(item)

    if scores:
        score = np.mean(scores)
    else:
        score = -1

    # np.save(os.path.join(output_path, 'results.npy'), results)
    write_stats(results, errors, score)
    print('Total time taken: {:.2f}, avg_miou={}, nb_errors={}'.format(
        time.time() - tic, score, len(errors)))
    return score