Example #1
def cast_data(header, tablename, data):
    typedict = get_typedict(tablename)
    type_casters = []
    for i in range(len(header)):
        sql_type = typedict[header[i]]
        if sql_type == text_type:
            type_casters.append(lambda s: s.encode('UTF-8'))
            #type_casters.append(lambda passer: passer)
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)

    log('casting data for ' + str(len(data)) + " rows")

    def cast_line(dataln):
        cast_row = []
        for col_id in range(len(dataln)):
            cast_row.append(type_casters[col_id](dataln[col_id]))
        return cast_row

    tpool = Pool(processes=6)
    ret = tpool.map(cast_line, data)
    tpool.close()
    return ret
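Note that `cast_line` is defined inside `cast_data`; the standard-library `multiprocessing.Pool` cannot pickle nested functions, so this snippet presumably relies on a dill-based pool such as `multiprocess`/`pathos` (see Examples #21 and #24). Below is a minimal, self-contained sketch of the same per-row mapping with a module-level worker and made-up sample data, so it also works with the stock pool.

from multiprocessing import Pool

# Illustrative casters; in the snippet above they are built from the table's type dict.
CASTERS = (str, int, float)

def cast_row(row):
    # apply the caster for each column position to the matching field
    return [cast(value) for cast, value in zip(CASTERS, row)]

if __name__ == "__main__":
    data = [("a", "1", "2.5"), ("b", "3", "4.0")]
    with Pool(processes=4) as pool:
        print(pool.map(cast_row, data))  # [['a', 1, 2.5], ['b', 3, 4.0]]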
Example #2
    def __init__(self, observables: List[AgentHandler],
                 actionables: List[AgentHandler],
                 mission_handlers: List[AgentHandler], nsteps, gamma,
                 data_directory, num_workers, worker_batch_size,
                 min_size_to_dequeue):
        """
        Sets up a tensorflow dataset to load videos from a given data directory.
        :param data_directory: the directory of the data to be loaded, eg: 'minerl.herobraine_parse/output/rendered/'
        """

        self.data_dir = data_directory
        self.observables = observables
        self.actionables = actionables
        self.mission_handlers = mission_handlers
        # self.vectorizer = vectorizer

        self.number_of_workers = num_workers
        self.worker_batch_size = worker_batch_size
        self.size_to_dequeue = min_size_to_dequeue
        self.nsteps = nsteps
        self.gamma = gamma

        self.processing_pool = Pool(self.number_of_workers)
        self.m = multiprocessing.Manager()
        self.data_queue = self.m.Queue(maxsize=self.size_to_dequeue //
                                       self.worker_batch_size * 4)

        pool_size = self.size_to_dequeue * 4
        self.random_queue = PriorityQueue(maxsize=pool_size)
Example #3
def process_experiment(_experiment, _overwrite=False):
    _arguments = [(_experiment, int(_series.split('_')[1]), _overwrite)
                  for _series in paths.image_files(paths.serieses(_experiment))
                  ]
    _p = Pool(CPUS_TO_USE)
    _p.starmap(process_series, _arguments)
    _p.close()
Example #4
def compute_simulations_fiber_densities(_simulations):
    _arguments = []
    for _simulation in _simulations:
        for _direction in ['left', 'right', 'up', 'down']:
            _arguments.append({
                'simulation': _simulation,
                'length_x': config.QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER
                if _direction in ['up', 'down'] else config.QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER,
                'length_y': config.QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER
                if _direction in ['up', 'down'] else config.QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER,
                'offset_x': OFFSET_Y if _direction in ['up', 'down'] else OFFSET_X,
                'offset_y': OFFSET_X if _direction in ['up', 'down'] else OFFSET_Y,
                'cell_id': 'cell',
                'direction': _direction,
                'time_points': TIME_POINTS
            })

    _fiber_densities = {}
    with Pool(CPUS_TO_USE) as _p:
        for _keys, _value in tqdm(
                _p.imap_unordered(compute.window_fiber_density_by_time, _arguments),
                total=len(_arguments), desc='Computing windows & fiber densities'):
            _fiber_densities[(_keys['simulation'], _keys['direction'])] = _value
        _p.close()
        _p.join()

    return _fiber_densities
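This example streams results with `imap_unordered` so finished windows can update the `tqdm` progress bar immediately, and has the worker echo its input keys back so results can be matched up regardless of completion order. A minimal sketch of that pattern with an illustrative worker and argument list:

from multiprocessing import Pool
from tqdm import tqdm

def compute_window(args):
    # return the identifying keys along with the value, since arrival order is arbitrary
    return args, args["simulation"] * 10 + args["offset"]

if __name__ == "__main__":
    arguments = [{"simulation": s, "offset": o} for s in range(3) for o in range(2)]
    results = {}
    with Pool(processes=4) as pool:
        for keys, value in tqdm(pool.imap_unordered(compute_window, arguments),
                                total=len(arguments), desc="Computing windows"):
            results[(keys["simulation"], keys["offset"])] = value
    print(results)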
Example #5
    def synthesize_spectrograms(self,
                                texts: List[str],
                                embeddings: Union[np.ndarray,
                                                  List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 
        sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to be
            # loaded in a separate process to be able to release GPU memory (a simple workaround
            # to tensorflow's intricacies)
            specs, alignments = Pool(1).starmap(
                Synthesizer._one_shot_synthesize_spectrograms,
                [(self.checkpoint_fpath, embeddings, texts)])[0]

        return (specs, alignments) if return_alignments else specs
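The low-memory branch runs synthesis inside a single-use worker process so that the GPU memory claimed by the model is released when that process exits; `Pool(1).starmap(...)[0]` is simply a compact way to run one call in a child process and fetch its return value. A minimal sketch of the pattern with a placeholder workload (names here are illustrative, not the project's API):

from multiprocessing import Pool

def heavy_inference(checkpoint_path, inputs):
    # imagine loading a large model here; it lives only inside this child process
    return [f"{checkpoint_path}:{x}" for x in inputs]

if __name__ == "__main__":
    with Pool(processes=1) as pool:
        specs = pool.starmap(heavy_inference, [("model.ckpt", [1, 2, 3])])[0]
    print(specs)  # ['model.ckpt:1', 'model.ckpt:2', 'model.ckpt:3']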
Example #6
def main():
    global _logger
    parser = ArgumentParser(usage='prog [options]')
    parser.add_argument('-c', '--config',
                        dest='config_file', required=True,
                        metavar='CONFIG_FILE', help='The full path to the config file to open')
    parser.add_argument('-l', '--log-config',
                        default='logger.conf', dest='log_config_file',
                        metavar='LOG_CONFIG_FILE', help='The full path to the log config file to open')
    parser.add_argument('--start-frame-time', default=-1, type=int, dest='start_frame_pos',
                        help='Start frame time in seconds')
    parser.add_argument('--end-frame-time', default=-1, type=int, dest='end_frame_pos',
                        help='End frame time in seconds')
    parser.add_argument('--smooth-filter-size', default=3, type=int, dest='gaussian_filter_size',
                        help='Gaussian filter kernel size')
    parser.add_argument('--smooth-filter-sigma', default=0, type=int, dest='gaussian_filter_sigma',
                        help='Gaussian filter sigma')
    parser.add_argument('--nthreads', default=1, type=int, dest='nthreads')
    parser.add_argument('--nprocesses', default=1, type=int, dest='nprocesses',
                        help='Number of processes to run in parallel')

    args = parser.parse_args()

    # setup logger
    logging.config.fileConfig(args.log_config_file)
    _logger = logging.getLogger('tracker')

    if args.config_file is None:
        _logger.warning('Missing config file')
        parser.exit(1, 'Missing config file\n')

    # load config file
    config, errors = load_config(args.config_file)
    errors |= set(config.validate())

    if len(errors) == 0:
        if args.nprocesses > 1:
            source = MovieFile(config.get_source(),
                                     start_msecs=args.start_frame_pos * 1000,
                                     end_msecs=args.end_frame_pos * 1000,
                                     resolution=config.get_image_size())
            if not source.is_opened():
                _logger.error('Error opening %s' % config.get_source())
                return
            start_frame_pos = int(source.get_start_time_in_seconds())
            end_frame_pos = int(source.get_end_time_in_seconds())
            frame_interval = int((end_frame_pos - start_frame_pos) / args.nprocesses)
            tracker_args = [(config, s * 1000, (s + frame_interval) * 1000,
                             args.gaussian_filter_size, args.gaussian_filter_sigma,
                             args.nthreads, _get_run_interval(s, s + frame_interval)[1]) for s in
                            range(start_frame_pos, end_frame_pos, frame_interval)
                            ]
            with Pool(args.nprocesses) as p:
                p.starmap(_run_tracker, tracker_args)
        else:
            _run_tracker(config, args.start_frame_pos * 1000, args.end_frame_pos * 1000,
                         args.gaussian_filter_size, args.gaussian_filter_sigma, args.nthreads)
    else:
        _logger.error('Config load error: %r' % errors)
Example #7
def process_experiment(_experiment, _overwrite=False):
    _arguments = []
    for _tuple in load.experiment_groups_as_tuples(_experiment):
        _experiment, _series_id, _group = _tuple
        _arguments.append((_experiment, _series_id, _group, _overwrite))

    _p = Pool(CPUS_TO_USE)
    _p.starmap(process_group, _arguments)
    _p.close()
Example #8
 def runMultiProcessTrajectories(self, repeat):
     pool = Pool(processes=len(self.posIni))
     result = pool.map(partial(self.nTraj, repeat=repeat),
                       [(x, y) for x, y in self.posIni])
     pool.close()
     pool.join()
     meanCost, meanTraj = 0, 0
     for CostCMAES, traj in result:
         meanCost += CostCMAES
         meanTraj += traj
     size = len(result)
     return meanCost / size, meanTraj / size
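`functools.partial` pins the `repeat` keyword so that `pool.map` only has to supply each `(x, y)` position as the single positional argument. A standalone sketch of the partial-plus-map pattern with an illustrative worker:

from functools import partial
from multiprocessing import Pool

def simulate(position, repeat=1):
    # stand-in for an expensive trajectory simulation
    x, y = position
    return (x + y) * repeat

if __name__ == "__main__":
    positions = [(0, 1), (2, 3), (4, 5)]
    with Pool(processes=3) as pool:
        results = pool.map(partial(simulate, repeat=10), positions)
    print(results)  # [10, 50, 90]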
Example #9
 def run(self):
     nproc = PipeEnum.PARALLEL_N_PROCS.value
     nchunks = PipeEnum.PARALLEL_N_CHUNKS.value
     if nproc in self.kwargs:
         n_processes = self.kwargs[nproc]
         chunks = self.kwargs.get(nchunks, 1)
         pool = Pool(n_processes)
         self.output = [
             i for i in pool.map(
                 self.map_function, self.reader.data, chunksize=chunks)
         ]
     else:
         self.output = self.map_function(self.reader.data)
Example #10
 def __init__(self,
              chromosomes: Iterable[Any],
              eval_function: Callable,
              checkpoint_target: Optional[str] = None,
              concurrent_workers: Optional[int] = 1,
              maximize: bool = True,
              generation: int = 0,
              intended_size: Optional[int] = None,
              serializer=None):
     self.concurrent_workers = concurrent_workers
     self.documented_best = None
     self.eval_function = eval_function
     self.generation = generation
     self.id = str(uuid4())[:6]
     self.individuals = [Individual(chromosome=chromosome) for chromosome in chromosomes]
     self.intended_size = intended_size or len(self.individuals)
     self.maximize = maximize
     self.serializer = serializer or SimpleSerializer(target=checkpoint_target)
     self.pool = None if concurrent_workers == 1 else Pool(concurrent_workers)
Example #11
def _phase_coherence(
    signal_pair: Tuple[TimeSeries, TimeSeries], params: PCParams
) -> Tuple[Tuple[TimeSeries, TimeSeries], ndarray, ndarray, ndarray, ndarray]:
    """
    Function which uses `wpc` to calculate phase coherence for a single pair of signals. The signals must have
    their wavelet transforms attached in their `output_data` member variable.

    :param signal_pair: tuple containing 2 signals
    :param params: the params object with parameters for the function
    :return:
    [tuple] the pair of signals;
    [2D array] the time-localised phase coherence;
    [1D array] phase coherence;
    [1D array] phase difference;
    [1D array] time-localised phase coherence of surrogates
    """
    s1, s2 = signal_pair

    wt1 = s1.output_data.values
    wt2 = s2.output_data.values

    freq = s1.output_data.freq
    fs = s1.frequency

    # Calculate surrogates.
    surr_count = params.surr_count
    surr_method = params.surr_method
    surr_preproc = params.surr_preproc
    surrogates, _ = surrogate_calc(s1, surr_count, surr_method, surr_preproc,
                                   fs)

    # Calculate time-localised phase coherence for each surrogate.
    pool = Pool()
    args = [(wt1, surrogates[i], params) for i in range(surr_count)]
    tpc_surr = pool.starmap(_wt_surrogate_calc, args)

    if len(tpc_surr) > 0:
        tpc_surr = np.mean(tpc_surr, axis=0)

    # Calculate phase coherence.
    tpc, pc, pdiff = wpc(wt1, wt2, freq, fs)

    return signal_pair, tpc, pc, pdiff, tpc_surr
Example #12
    def __init__(self, observables: List[AgentHandler],
                 actionables: List[AgentHandler],
                 mission_handlers: List[AgentHandler], data_directory,
                 num_workers, worker_batch_size, min_size_to_dequeue):
        """
        Sets up a tensorflow dataset to load videos from a given data directory.
        :param data_directory: the directory of the data to be loaded, eg: 'minerl.herobraine_parse/output/rendered/'
        """

        self.data_dir = data_directory
        self.observables = observables
        self.actionables = actionables
        self.mission_handlers = mission_handlers
        # self.vectorizer = vectorizer

        self.number_of_workers = num_workers
        self.worker_batch_size = worker_batch_size
        self.size_to_dequeue = min_size_to_dequeue
        self.processing_pool = Pool(self.number_of_workers)
Example #13
def clean_data():
    rows_per_loop = 100000
    log("")
    log("starting")

    dirty_db_path = ROOTDIR + dir_sep + "stage_2_clean.db"
    clean_db_path = ROOTDIR + dir_sep + "stage_3_cleaner.db"
    dirty_db_cursor = create_connection(dirty_db_path).cursor()
    clean_db = create_connection(clean_db_path)
    clean_db_cursor = clean_db.cursor()

    clean_db_cursor.execute("DELETE FROM bodies")
    clean_db_cursor.execute("delete from sqlite_sequence where name='bodies'")

    dirty_db_cursor.execute("select bodies from bodies")
    data = dirty_db_cursor.fetchmany(rows_per_loop)

    tpool = Pool(processes=4)
    locp_n = 1
    log("detected " + str(cpus) + " as cpu count")
    inserted = 0
    more_data = True
    while more_data:
        log("cleaning data")
        data = tpool.map(clean_line, data)
        data = list(map(lambda line: (line, ), data))

        log("inserting 100k rows")
        query = "insert into bodies (bodies) values (?)"
        clean_db_cursor.executemany(query, data)
        clean_db.commit()

        log("done loop, getting more data.")
        inserted += len(data)
        data = dirty_db_cursor.fetchmany(rows_per_loop)
        #more_data = False
        if len(data) < 1:
            more_data = False
            log("end of data")
        log("done " + str(locp_n) + " loops")
        locp_n += 1
    log("done")
    log("inserted " + str(inserted) + " rows")
Example #14
 def test_concurrent_download_and_prepare(self):
     with tempfile.TemporaryDirectory() as tmp_dir:
         processes = 2
         with Pool(processes=processes) as pool:
             jobs = [
                 pool.apply_async(_run_concurrent_download_and_prepare, kwds={"tmp_dir": tmp_dir})
                 for _ in range(processes)
             ]
             dummy_builders = [job.get() for job in jobs]
             for dummy_builder in dummy_builders:
                 self.assertTrue(
                     os.path.exists(
                         os.path.join(tmp_dir, "dummy_builder", "dummy", "0.0.0", "dummy_builder-train.arrow")
                     )
                 )
                 self.assertDictEqual(dummy_builder.info.features, Features({"text": Value("string")}))
                 self.assertEqual(dummy_builder.info.splits["train"].num_examples, 100)
                 self.assertTrue(
                     os.path.exists(os.path.join(tmp_dir, "dummy_builder", "dummy", "0.0.0", "dataset_info.json"))
                 )
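`apply_async` submits each job immediately and hands back an `AsyncResult`; calling `.get()` afterwards blocks until that particular job has finished, which is how the test collects one builder per process. A minimal sketch of the submit-then-collect pattern (the worker is illustrative):

from multiprocessing import Pool

def build(tag):
    return f"built-{tag}"

if __name__ == "__main__":
    with Pool(processes=2) as pool:
        jobs = [pool.apply_async(build, kwds={"tag": i}) for i in range(2)]
        results = [job.get() for job in jobs]
    print(results)  # ['built-0', 'built-1']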
Example #15
    def filter(dirty_data):
        log("starting filter")
        tpool = Pool(processes=cpus)
        ret = []
        log("filtering deleted and not english")
        for line in tpool.map(Filter.__is_not_deleted_or_not_non_english,
                              dirty_data):
            if line[1]:
                ret.append(line[0])

        def clean_links_and_punctuation(comment):
            words = comment.split(" ")
            words = list(map(Filter.__filter_links, words))
            comment = reduce(lambda x, y: x + " " + y, words)
            return comment

        log("filtering links and punctuation")
        ret = tpool.map(clean_links_and_punctuation, ret)
        tpool.close()
        log("filter done")
        return ret
Example #16
def classify_multiproc(model, stack_data, result, array_outfile=None, mask=None):
    d = Classifier()
    d.get_stack(stack_data, outfile=array_outfile, mask_path=mask)
    stack_data = d.masked_data_stack

    cores = cpu_count()
    a = ArrayDisAssembly(stack_data)
    arrays = a.disassemble(n_sections=cores)
    classifiers = [Classifier(idx=i, arr=a, model=model) for i, a in enumerate(arrays)]
    time = datetime.now()
    pool = Pool(processes=cores)
    with pool as p:
        pool_results = [p.apply_async(get_classifier, (c, a)) for a, c in zip(arrays, classifiers)]
        classified_arrays = [res.get() for res in pool_results]
        a.assemble(classified_arrays)
        final = a.assembled.reshape(d.final_shape)
    td = (datetime.now() - time)

    d.write_raster(out_file=result, new_array=final)

    return None
Example #17
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root,
                             out_dir, extension, skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." %
          (dataset_name, len(speaker_dirs)))

    # Preprocess all of this speaker's utterances
    def preprocess_speaker(speaker_dir: Path):
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")

            # Preprocess a single utterance
            wav = audio.preprocess_wav(in_fpath)

            # Discard utterances that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()

    # Process the utterances for each speaker
    with Pool(32) as pool:
        list(
            tqdm(pool.imap(preprocess_speaker, speaker_dirs),
                 dataset_name,
                 len(speaker_dirs),
                 unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
Example #18
def connect_structures_find_saddles(sequence, structure_list):
    pairs = {}
    #fc = RNA.fold_compound(sequence)
    fp_pool = Pool(Max_Threads)
    res_list=[]
    for i, se_1 in enumerate(structure_list):
        for j in range(i+1, len(structure_list)):
            se_2 = structure_list[j]
            a = fp_pool.apply_async(find_saddle, args=(se_1.Structure, se_2.Structure, i, j))
            res_list.append(a)
            #saddle_energy_dcal = fc.path_findpath_saddle(se_1.Structure, se_2.Structure)
            #saddle_energy_kcal = saddle_energy_dcal / 100.0
            #pairs[(i,j)] = saddle_energy_kcal
            #pairs[(j,i)] = saddle_energy_kcal
    fp_pool.close()
    
    for a in res_list:
        i,j, saddle_energy_kcal = a.get()
        pairs[(i,j)] = saddle_energy_kcal
        pairs[(j,i)] = saddle_energy_kcal
        
        
    # get lowest saddle for each structure that ends in a structure with lower energy than the first structure.
    minimal_saddle_list = []
    for i in range(0, len(structure_list)):
        se_1 = structure_list[i]
        min_saddle_energy = sys.maxsize
        tree_neighbor = None
        for j in range(0, len(structure_list)):
            if i == j:
                continue
            se_2 = structure_list[j]
            saddle_energy = pairs[(i,j)]
            if saddle_energy <= min_saddle_energy and se_2.Energy < se_1.Energy:
                min_saddle_energy = saddle_energy
                tree_neighbor = j
        if tree_neighbor is None:  # it could be the root.
            tree_neighbor = -1
        minimal_saddle_list.append((i, tree_neighbor, min_saddle_energy))
    return minimal_saddle_list
Example #19
def save_word_error_parallel(results_file, output_file, X_test, y_test):
    file = open(results_file, 'r')
    lines = file.readlines()
    file.close()

    pool = Pool(4)  # uses up 4 cores per call (for 8 core machine), so change this to match half your cpu count
    f_vals = []

    # packs a tuple - line, index, X_test, y_test, verbose (should print or not)
    data = [(l, i, X_test, y_test, True) for i, l in enumerate(lines)]

    # process it in correct order
    accuracies = pool.imap(process_line, data)  # maintains order when processing in parallel

    # wait while processing the data
    pool.close()

    for acc in accuracies:
        f_vals.append(str(acc) + "\n")

    file = open(output_file, 'w')
    file.writelines(f_vals)
    file.close()
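As the comments note, `Pool.imap` yields results in submission order even though the lines are processed in parallel, so the accuracies can be written out directly; `imap_unordered` would instead yield them in completion order. A small sketch contrasting the two with an illustrative worker:

import time
from multiprocessing import Pool

def score(i):
    # later items finish sooner, to make the ordering difference visible
    time.sleep(0.01 * (5 - i))
    return i

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        print(list(pool.imap(score, range(5))))            # always [0, 1, 2, 3, 4]
        print(list(pool.imap_unordered(score, range(5))))  # yields in completion order, not submission order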
Example #20
def main():
    parser = argparse.ArgumentParser(
        description='Space Time Action Unit R-CNN training example:')
    parser.add_argument('--pid', '-pp', default='/tmp/SpaceTime_AU_R_CNN/')
    parser.add_argument('--gpu',
                        '-g',
                        nargs='+',
                        type=int,
                        help='GPU ID, multiple GPU split by space')
    parser.add_argument('--lr', '-l', type=float, default=0.001)
    parser.add_argument('--out',
                        '-o',
                        default='end_to_end_result',
                        help='Output directory')
    parser.add_argument('--trainval', default='train', help='train/test')
    parser.add_argument('--database',
                        default='BP4D',
                        help='Output directory: BP4D/DISFA/BP4D_DISFA')
    parser.add_argument('--iteration', '-i', type=int, default=70000)
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--batch_size', '-bs', type=int, default=1)
    parser.add_argument('--snapshot', '-snap', type=int, default=1000)
    parser.add_argument('--need_validate',
                        action='store_true',
                        help='do or not validate during training')
    parser.add_argument('--mean',
                        default=config.ROOT_PATH +
                        "BP4D/idx/mean_no_enhance.npy",
                        help='image mean .npy file')
    parser.add_argument('--backbone',
                        default="mobilenet_v1",
                        help="vgg/resnet101/mobilenet_v1 for train")
    parser.add_argument('--optimizer',
                        default='RMSprop',
                        help='optimizer: RMSprop/AdaGrad/Adam/SGD/AdaDelta')
    parser.add_argument('--pretrained_model',
                        default='mobilenet_v1',
                        help='imagenet/mobilenet_v1/resnet101/*.npz')
    parser.add_argument('--pretrained_model_args',
                        nargs='+',
                        type=float,
                        help='you can pass in "1.0 224" or "0.75 224"')
    parser.add_argument('--spatial_edge_mode',
                        type=SpatialEdgeMode,
                        choices=list(SpatialEdgeMode),
                        help='1:all_edge, 2:configure_edge, 3:no_edge')
    parser.add_argument(
        '--temporal_edge_mode',
        type=TemporalEdgeMode,
        choices=list(TemporalEdgeMode),
        help='1:rnn, 2:attention_block, 3.point-wise feed forward(no temporal)'
    )
    parser.add_argument("--bi_lstm",
                        action="store_true",
                        help="whether to use bi-lstm as Edge/Node RNN")
    parser.add_argument(
        '--use_memcached',
        action='store_true',
        help='whether use memcached to boost speed of fetch crop&mask')  #
    parser.add_argument('--memcached_host', default='127.0.0.1')
    parser.add_argument("--fold", '-fd', type=int, default=3)
    parser.add_argument("--layers", type=int, default=1)
    parser.add_argument("--split_idx", '-sp', type=int, default=1)
    parser.add_argument("--use_paper_num_label",
                        action="store_true",
                        help="only to use paper reported number of labels"
                        " to train")
    parser.add_argument("--previous_frame", type=int, default=50)
    parser.add_argument("--sample_frame", '-sample', type=int, default=25)
    parser.add_argument(
        "--snap_individual",
        action="store_true",
        help="whether to snapshot each individual epoch/iteration")
    parser.add_argument("--proc_num", "-proc", type=int, default=1)
    parser.add_argument('--eval_mode',
                        action='store_true',
                        help='Use test datasets for evaluation metric')
    args = parser.parse_args()
    os.makedirs(args.pid, exist_ok=True)
    os.makedirs(args.out, exist_ok=True)
    pid = str(os.getpid())
    pid_file_path = args.pid + os.sep + "{0}_{1}_fold_{2}.pid".format(
        args.database, args.fold, args.split_idx)
    with open(pid_file_path, "w") as file_obj:
        file_obj.write(pid)
        file_obj.flush()

    print('GPU: {}'.format(",".join(list(map(str, args.gpu)))))

    adaptive_AU_database(args.database)
    mc_manager = None
    if args.use_memcached:
        from collections_toolkit.memcached_manager import PyLibmcManager
        mc_manager = PyLibmcManager(args.memcached_host)
        if mc_manager is None:
            raise IOError("no memcached found listen in {}".format(
                args.memcached_host))

    train_data = AUDataset(
        database=args.database,
        fold=args.fold,
        split_name=args.trainval,
        split_index=args.split_idx,
        mc_manager=mc_manager,
        train_all_data=False,
    )
    result_data = [
        img_path
        for img_path, AU_set, current_database_name in train_data.result_data
        if args.database + "|" + img_path not in mc_manager
    ]
    sub_list = split_list(result_data, len(result_data) // 100)

    for img_path_lst in sub_list:
        with Pool(processes=50) as pool:
            input_list = [(img_path, None, None) for img_path in img_path_lst]
            result =\
                pool.starmap(parallel_landmark_and_conn_component, input_list)
            pool.close()
            pool.join()
            for img_path, AU_box_dict, landmark_dict, box_is_whole_image in result:
                key_prefix = args.database + "|"
                key = key_prefix + img_path
                orig_img = cv2.imread(img_path, cv2.IMREAD_COLOR)
                new_face, rect = FaceMaskCropper.dlib_face_crop(
                    orig_img, landmark_dict)

                print("write {}".format(key))
                if mc_manager is not None and key not in mc_manager:
                    save_dict = {
                        "landmark_dict": landmark_dict,
                        "AU_box_dict": AU_box_dict,
                        "crop_rect": rect
                    }
                    mc_manager.set(key, save_dict)
Example #21
def Pool(processes=None, initializer=None, initargs=()):
    '''
    Returns a process pool object
    '''
    from multiprocess.pool import Pool
    return Pool(processes, initializer, initargs)
Example #22
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root,
                             out_dir, extension, skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." %
          (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {
                        line.split(",")[0]
                        for line in sources_file
                    }
            except Exception:
                existing_fnames = {}
        else:
            existing_fnames = {}

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()

    # Process the utterances for each speaker
    # for speaker_dir in speaker_dirs: # DEBUG
    #     preprocess_speaker(speaker_dir)
    with Pool(56) as pool:
        list(
            tqdm(pool.imap(preprocess_speaker, speaker_dirs),
                 dataset_name,
                 len(speaker_dirs),
                 unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
Example #23
File: cli.py Project: Ohmarinus/vsketch
def save(
    target: Optional[str] = typer.Argument(default=None,
                                           help="sketch directory or file"),
    name: Optional[str] = typer.Option(None,
                                       "-n",
                                       "--name",
                                       help="output name (without extension)"),
    config: Optional[str] = typer.Option(
        None,
        "-c",
        "--config",
        help=
        ("path to the config file to use (may be a path to JSON file or the name of the "
         "configuration)"),
    ),
    seed: Optional[str] = typer.Option(None,
                                       "-s",
                                       "--seed",
                                       help="seed or seed range to use"),
    destination: Optional[str] = typer.Option(None,
                                              "-d",
                                              "--destination",
                                              help="destination path"),
    multiprocessing: bool = typer.Option(True,
                                         envvar="VSK_MULTIPROCESSING",
                                         help="enable multiprocessing"),
) -> None:
    """Save the sketch to a SVG file.

    By default, the output is named after the sketch and the provided options. An alternative
    name may be provided with the --name option.

    If the sketch has parameters, their default values are used. Alternatively, a pre-existing
    configuration can be used instead with the --config option.

    By default, a random seed is used for vsketch's random number generator. If --config is
    used, the seed saved in the configuration is used instead. A seed may also be provided with
    the --seed option, in which case it will override the configuration's seed.

    The --seed option also accepts a seed range in the form of FIRST..LAST, e.g. 0..100. In this
    case, one output file per seed is generated.

    If the number of files to generate is greater than 4, all available cores are used for the
    process. This behaviour can be disabled with --no-multiprocessing or the
    VSK_MULTIPROCESSING variable.

    By default, all SVG are saved in the sketch's "output" sub-directory. This can be
    overridden using the --destination option.
    """

    try:
        path = _find_sketch_script(target)
    except ValueError as err:
        print_error("Sketch could not be found: ", str(err))
        raise typer.Exit(code=1)

    # load configuration
    param_set: Dict[str, vsketch.ParamType] = {}
    config_postfix = ""
    if config is not None:
        config_path = pathlib.Path(config)
        if not config_path.exists():
            config_path = get_config_path(path) / (config + ".json")

        if config_path.exists():
            param_set = load_config(config_path)
            config_postfix = "_" + config_path.stem
        else:
            print_error("Config file not found: ", str(config_path))

    # compute name
    if name is None:
        name = canonical_name(path) + config_postfix
    seed_in_name = seed is not None

    if seed is None:
        if param_set is not None and "__seed__" in param_set:
            seed_start = seed_end = int(param_set["__seed__"])
        else:
            seed_start = seed_end = random.randint(0, 2**31 - 1)
    else:
        try:
            seed_start, seed_end = _parse_seed(seed)
        except ValueError as err:
            print_error(f"Could not parse seed {seed}: ", str(err))
            raise typer.Exit(code=1)

    # prepare output path
    if destination is not None:
        output_path = pathlib.Path(destination)
        if not output_path.exists():
            print_error("Provided output path does not exist: ",
                        str(output_path.absolute()))
            raise typer.Exit(code=1)
        if not output_path.is_dir():
            print_error("Provided output path is not a directory: ",
                        str(output_path.absolute()))
            raise typer.Exit(code=1)
    else:
        output_path = path.parent / "output"
        if not output_path.exists():
            output_path.mkdir()
        elif not output_path.is_dir():
            print_error("Could not create output directory: ",
                        str(output_path))
            raise typer.Exit(code=1)

    # noinspection PyShadowingNames
    def _write_output(seed: int) -> None:
        # this needs to be there because the sketch class cannot be pickled apparently
        sketch_class = load_sketch_class(path)
        if sketch_class is None:
            print_error("Could not load script: ", str(path))
            raise typer.Exit(code=1)

        sketch_class.set_param_set(param_set)

        output_name = name
        if seed_in_name:
            output_name += "_s" + str(seed)  # type: ignore
        output_name += ".svg"  # type: ignore

        output_file = output_path / output_name

        sketch = sketch_class.execute(finalize=True, seed=seed)

        if sketch is None:
            print_error("Could not execute script: ", str(path))
            raise typer.Exit(code=1)

        doc = sketch.vsk.document
        with open(output_file, "w") as fp:
            print_info("Exporting SVG: ", str(output_file))
            vp.write_svg(fp,
                         doc,
                         source_string=f"vsketch save -s {seed} {path}",
                         color_mode="layer")

    seed_range = range(seed_start, seed_end + 1)

    if len(seed_range) < 4 or not multiprocessing:
        for s in seed_range:
            _write_output(s)
    else:
        with Pool() as p:
            list(p.imap(_write_output, seed_range))
Example #24
def Pool(processes=None, initializer=None, initargs=(), maxtasksperchild=None):
    '''
    Returns a process pool object
    '''
    from multiprocess.pool import Pool
    return Pool(processes, initializer, initargs, maxtasksperchild)
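Compared with Example #21, this wrapper also forwards `maxtasksperchild`, which makes the pool retire and replace each worker process after it has executed that many tasks, a common way to bound memory growth in long-lived workers. A standalone sketch of the same knob using the standard-library pool (the behaviour is the same; the dill-based `multiprocess` pool just pickles more kinds of callables):

import os
from multiprocessing import Pool

def show_pid(_):
    return os.getpid()

if __name__ == "__main__":
    # with maxtasksperchild=1, workers are replaced after every task chunk,
    # so more distinct PIDs than pool slots typically show up
    with Pool(processes=2, maxtasksperchild=1) as pool:
        pids = pool.map(show_pid, range(10))
    print(len(set(pids)), "distinct worker processes handled 10 items")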
Example #25
c4 = Dataset.load_from_disk("/home/ahemf/processed/c4_extended")
cpu_count = os.cpu_count()
overall_counts = Counter()
# dsets_tokenized = dset.map(lambda x: dict(tokens=tokenizer.batch_encode_plus(x["text"], add_special_tokens=False, max_length=4096, padding=False)["input_ids"]), batched=True, batch_size=1024)


def term_frequency_builder(tokens):
    raw_counts = Counter(tokens)
    mc = raw_counts.most_common()[0]
    most_common_count = mc[1]
    tf = [[str(k), str(0.5 + 0.5 * (v / most_common_count))]
          for k, v in raw_counts.items()]
    return tf


with Pool(cpu_count) as p:

    def mapping(x):
        tokens = [[
            re.sub(r'[^\s0-9a-zA-Z]', ' ', w).strip() for w in t.split()
        ] for t in x["text"]]
        tokens = [[w for w in t if len(w) >= 2] for t in tokens]
        tf = [
            term_frequency_builder(tk) if len(tk) > 0 else [["", str(1.0)]]
            for tk in tokens
        ]
        return tf

    def batch_term_frequency_builder(x):
        texts = x["text"]
        csz = int(np.ceil(len(texts) / cpu_count))
Example #26
        # Load the wav from disk if needed
        if isinstance(fpath_or_wav, str):
            wav = librosa.core.load(fpath_or_wav, sr=sampling_rate)[0]
        else:
            wav = fpath_or_wav

        return wav

    def local_mel2samp(filepath):
        print("start", filepath)
        filepath = filepath.split("|")[0]
        audio = preprocess_wav(filepath, sampling_rate=args.sampling_rate)
        filename = os.path.basename(filepath)
        new_filepath = args.output_dir + '/' + filename + '.npy'
        print("finish", new_filepath)
        np.save(new_filepath, audio)

    filepaths = files_to_list(args.filelist_path)

    with Pool(args.num_processes) as pool:  # ThreadPool(8) as pool:
        # list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
        list(pool.map(local_mel2samp, filepaths))

    # for filepath in filepaths:
    #     filepath = filepath.split("|")[0]
    #     audio = preprocess_wav(filepath, sampling_rate=args.sampling_rate)
    #     filename = os.path.basename(filepath)
    #     new_filepath = args.output_dir + '/' + filename + '.npy'
    #     print(new_filepath)
    #     np.save(new_filepath, audio)
Example #27
def main():
    _simulations = load.structured()
    _simulations = filtering.by_categories(
        _simulations,
        _is_single_cell=False,
        _is_heterogeneity=None,
        _is_low_connectivity=False,
        _is_causality=CAUSALITY,
        _is_dominant_passive=DOMINANT_PASSIVE,
        _is_fibrin=False)
    _simulations = filtering.by_pair_distances(_simulations,
                                               _distances=PAIR_DISTANCE)
    print('Total simulations:', len(_simulations))

    _arguments = []
    for _simulation in _simulations:
        for _cell_id in ['left_cell', 'right_cell']:
            _arguments.append({
                'simulation': _simulation,
                'length_x':
                config.QUANTIFICATION_WINDOW_WIDTH_IN_CELL_DIAMETER,
                'length_y':
                config.QUANTIFICATION_WINDOW_HEIGHT_IN_CELL_DIAMETER,
                'offset_x': 0,
                'offset_y': 0,
                'cell_id': _cell_id,
                'direction': 'inside',
                'time_points': TIME_POINTS
            })

    _fiber_densities = {}
    with Pool(CPUS_TO_USE) as _p:
        for _keys, _value in tqdm(_p.imap_unordered(
                compute.window_fiber_density_by_time, _arguments),
                                  total=len(_arguments),
                                  desc='Computing windows & fiber densities'):
            _fiber_densities[(_keys['simulation'], _keys['cell_id'])] = _value
        _p.close()
        _p.join()

    _simulations_by_heterogeneity = organize.by_heterogeneity(_simulations)

    _headers = [
        'time_point',
        'simulation',
        'pair_distance_in_cell_diameter',
        'heterogeneity_std',
        'left_cell_fiber_density',
        'left_cell_fiber_density_z_score',
        'right_cell_fiber_density',
        'right_cell_fiber_density_z_score',
    ]

    _csv_path = os.path.join(paths.OUTPUTS,
                             'simulations_density_cell_pairs.csv')
    with open(_csv_path, 'w', newline='') as _csv_file:
        _csv_writer = csv.writer(_csv_file)
        _csv_writer.writerow(_headers)
        for _simulation_std in _simulations_by_heterogeneity:
            print('STD:', _simulation_std)
            _std_simulations = _simulations_by_heterogeneity[_simulation_std]
            for _simulation in tqdm(_std_simulations, desc='Simulations loop'):
                _properties = load.properties(_simulation)
                _pair_distance = compute.pair_distance(_properties)
                _average, _std = load.normalization(_simulation).values()
                _left_cell_fiber_densities = _fiber_densities[(_simulation,
                                                               'left_cell')]
                _right_cell_fiber_densities = _fiber_densities[(_simulation,
                                                                'right_cell')]
                for _time_point, (_left_cell_fiber_density, _right_cell_fiber_density) in \
                        enumerate(zip(_left_cell_fiber_densities, _right_cell_fiber_densities)):
                    _left_cell_fiber_density_z_score = compute_lib.z_score(
                        _left_cell_fiber_density, _average, _std)
                    _right_cell_fiber_density_z_score = compute_lib.z_score(
                        _right_cell_fiber_density, _average, _std)
                    _csv_writer.writerow([
                        _time_point, _simulation, _pair_distance,
                        _simulation_std, _left_cell_fiber_density,
                        _left_cell_fiber_density_z_score,
                        _right_cell_fiber_density,
                        _right_cell_fiber_density_z_score
                    ])
Example #28
    def __call__(self, inet, return_last_net=False):
        # local vars for efficiency
        args = self.args
        nettype = args.nettype
        first_inf_nodes = self.first_inf_nodes
        no_exposed = self.no_exposed
        is_covid = self.is_covid
        tr_rate = self.tr_rate
        trans_true_items = self.trans_true_items
        trans_know_items = self.trans_know_items

        # will hold all network events
        net_events = defaultdict()
        # whether return of netstate is needed from this step (only in the case of predefined networks)
        netstate_return = False
        # initialize the true network seed either randomly, or based on what has been supplied already + net index
        net_seed = random.randint(
            0, 2e9) if args.netseed is None else args.netseed + inet

        args_dict = vars(args)

        # the infection net is either predefined in the first element of args.nettype or created at random based on the model name specified by the same param
        # if the net is predefined, we also seed here the first_inf_nodes (this is because we did not have access to the list of nodes prior to this point)
        try:
            true_net = network.get_from_predef(nettype['0'][0],
                                               inet=inet,
                                               W_factor=nettype.get('Wi', 0),
                                               **args_dict)
            # if we didn't set the netsize, it means that 'nid' was not supplied in the dict
            # so we assume only the nodes in the edges supplied are nodes in this network
            if args.netsize == -1:
                args.netsize = len(true_net.nodes)
                args.k = true_net.avg_degree()
                netstate_return = True
        # TypeError: nettype is a str, KeyError: key '0' is not in the dynamic dict, IndexError: element 0 (inf net) not in list
        except (TypeError, KeyError, IndexError):
            # args_dict should also contain the netsize and the average degree
            true_net = network.get_random(typ=nettype,
                                          nseed=net_seed,
                                          inet=inet,
                                          weighted=args.use_weights,
                                          **args_dict)

        # if NO compute distribution was enabled, we can use args (since it is not deepcopied) to track all average weights
        if not args.multip:
            args.k_i = {
                '0':
                (true_net.avg_degree(), true_net.avg_degree(use_weights=True))
            }

        # first_inf_nodes could have been calculated by this point if an infseed was supplied, and
        # we deal with random network OR 'nid' key was supplied in the predefined network of args.nettype
        if first_inf_nodes is None:
            # turn first_inf into an absolute number if it's a percentage by this point (happens only if predefined net with no nid)
            args.first_inf = int(
                args.first_inf if args.first_inf >= 1 else args.first_inf *
                args.netsize)
            # Random first infected across simulations - seed random locally
            first_inf_nodes = random.Random(net_seed).sample(
                true_net.nodes, args.first_inf)

        # Change the state of the first_inf_nodes to 'I' to root the simulation
        true_net.change_state(first_inf_nodes, state='I', update=True)
        # Placeholder for the dual network (will be initialized only if args.dual)
        know_net = None

        ### Good place to debug whether edge weights are present and how they are normalized
        #         print(true_net.node_counts[list(true_net.neighbors(first_inf_nodes[0]))[0]])
        #         print(1, true_net.node_counts[1])
        #         print(2, true_net.node_counts[2])

        if args.dual:
            # the dual network is either predefined in the second element of args.nettype or initialized at random
            try:
                know_net = network.get_dual_from_predef(
                    true_net,
                    nettype['0'][1],
                    count_importance=tr_rate,
                    W_factor=nettype.get('Wt', 0),
                    **args_dict)
            except (TypeError, KeyError, IndexError):
                # First dual net depends on both overlap and uptake (this is usually the digital contact tracing net)
                # Note this will also copy over the states, so no need to call change_state
                know_net = network.get_dual(true_net,
                                            args.overlap,
                                            args.zadd,
                                            args.zrem,
                                            args.uptake,
                                            args.maintain_overlap,
                                            nseed=net_seed + 1,
                                            inet=inet,
                                            count_importance=tr_rate,
                                            **args_dict)

            # if 2 dual networks selected, create the second network and add both to a ListDelegator
            if args.dual == 2:
                try:
                    know_net_two = network.get_dual_from_predef(
                        true_net,
                        nettype['0'][2],
                        count_importance=args.taut_two,
                        W_factor=nettype.get('Wt2', 0),
                        **args_dict)
                except (TypeError, KeyError, IndexError):
                    # Second tracing net attempt to maintain overlap_two (this is usually the manual tracing net,
                    # uptake may not make sense for manual tracing
                    # Note this will also copy over the states, so no need to call change_state
                    know_net_two = network.get_dual(
                        true_net,
                        args.overlap_two,
                        args.zadd_two,
                        args.zrem_two,
                        args.uptake_two,
                        args.maintain_overlap_two,
                        nseed=net_seed + 2,
                        inet=inet,
                        count_importance=args.taut_two,
                        **args_dict)

                # know_net becomes a ListDelegator of the 2 networks
                know_net = ListDelegator(know_net, know_net_two)

            # Object used during Multiprocessing of Network simulation events
            engine = EngineDual(args=args,
                                no_exposed=no_exposed,
                                is_covid=is_covid,
                                true_net=true_net,
                                know_net=know_net,
                                trans_true=trans_true_items,
                                trans_know=trans_know_items)

        else:
            # Object used during Multiprocessing of Network simulation events
            engine = EngineOne(
                args=args,
                no_exposed=no_exposed,
                is_covid=is_covid,
                true_net=true_net,
                trans_true=trans_true_items,
            )

        niters = args.niters
        iters_range = range(niters)

        if args.multip == 2 or args.multip == 3:
            # allocate EITHER half or all cpus to pathos.multiprocess for parallelizing simulations for different iterations of 1 init
            # multip == 2 parallelize only iterations; multip == 3 parallelize both net and iters
            jobs = int(cpu_count() / (args.multip - 1))
            with Pool(jobs) as pool:
                for itr, stats_events in enumerate(
                        tqdm_redirect(pool.imap(engine, iters_range),
                                      total=niters,
                                      desc='Iterations simulation progress')):
                    # Record sim results
                    net_events[itr] = stats_events

        else:
            with no_std_context(enabled=args.animate):
                for itr in tqdm_redirect(
                        iters_range, desc='Iterations simulation progress'):
                    print('Running iteration ' + str(itr) + ':')

                    # Reinitialize network + Random first infected at the beginning of each run BUT the first one
                    # This is needed only in sequential processing since in multiprocessing the nets are deepcopied anyway
                    if itr:
                        engine.reinit_net(first_inf_nodes)

                    # Run simulation
                    stats_events = engine(itr)
                    # Record sim results
                    net_events[itr] = stats_events
                    # A situation in which there is NO event can arise when all first infected nodes are orphans, and rem_orphans=True
                    total_inf = stats_events[-1][
                        'totalInfected'] if stats_events else args.first_inf
                    print('---> Result:' + str(total_inf) +
                          ' total infected persons over time.')

        if return_last_net:
            return net_events, (true_net, know_net)

        if netstate_return:
            # we may not know about netsize and k by this point, and if Nets are distributed, args will be deepcopied
            # therefore we need to return the inferred netsize and k here from the local args
            return net_events, (args.netsize, args.k)

        return net_events, None
Example #29
 def run(self):
     pool = Pool(40)
     return dict(pool.imap(self._run, self.runners))
Example #30
    def __call__(
        self,
        dp: DataPanel,
        columns: List[str],
        num_proc: int = None,
        *args,
        **kwargs,
    ):

        if not num_proc or num_proc == 1:
            slices = []
            slice_membership = []
            # Apply each slicebuilder in sequence
            for i, slicebuilder in tqdm(enumerate(self.subpopulations)):
                # Apply the slicebuilder
                slices_i, slice_membership_i = slicebuilder(
                    dp=dp,
                    columns=columns,
                    *args,
                    **kwargs,
                )

                # Add in the slices and slice membership
                slices.extend(slices_i)
                slice_membership.append(slice_membership_i)

        else:
            # TODO(karan): cleanup, make mp.Pool support simpler across the library
            with Pool(num_proc) as pool:
                slices, slice_membership = zip(
                    *pool.map(
                        lambda sb: sb(
                            dp=dp,
                            columns=columns,
                            *args,
                            **kwargs,
                        ),
                        [slicebuilder for slicebuilder in self.subpopulations],
                    )
                )

                # Combine all the slices
                slices = list(tz.concat(slices))

            def _store_updates(batch, indices):

                # Each Subpopulation will generate slices
                for i, subpopulation in enumerate(self.subpopulations):
                    updates = subpopulation.construct_updates(
                        slice_membership=slice_membership[i][indices],
                        columns=columns,
                    )

                    batch = subpopulation.store(
                        batch=batch,
                        updates=updates,
                    )

                return batch

            if isinstance(dp, DataPanel):
                dp = dp.map(
                    _store_updates,
                    with_indices=True,
                    batched=True,
                )

                for subpopulation in self.subpopulations:
                    # Update the DataPanel's history
                    dp.update_tape(
                        path=[SLICEBUILDERS, subpopulation.category],
                        identifiers=subpopulation.identifiers,
                        columns=columns,
                    )

        # Combine all the slice membership matrices
        slice_membership = np.concatenate(slice_membership, axis=1)

        return slices, slice_membership