def lauchCMAESForListOfPoints(target_size, rs, save, points):
    p = ThreadPool(processes=len(points))
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESForSpecificTargetSizeAndSpecificPointMulti,
                  target_size, rs, save),
          [[i, posIni[i]] for i in points])
    p.close()
    p.join()
def augment(args):
    aug = Augmenter(sampling_rate=sampling_rate)
    data_path = args.data
    inn = data_path.split('/')[-1]
    if inn == '':
        inn = data_path.split('/')[-2]
    outn = inn + '_aug'
    data_path = Path(data_path)
    filepaths = list(data_path.glob("**/*.flac"))

    def augment_file(filepath):
        filename = filepath.stem
        save_filename = str(filename) + '_aug'
        save_orig_path = str(filepath).replace(inn, outn)
        save_filepath = save_orig_path.replace(filename, save_filename)
        save_filepath = Path(save_filepath)
        save_filepath.parent.mkdir(parents=True, exist_ok=True)
        data, sr = aug.load(filepath)
        data = aug.resample(data, sr)
        augmented = aug.augment(data)
        aug.save(save_filepath, augmented)
        if args.add:
            aug.save(save_orig_path, data)

    # Multi-threading
    with ThreadPool(8) as pool:
        list(tqdm(pool.imap(augment_file, filepaths), 'Aug', len(filepaths),
                  unit="files"))
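For context, a minimal sketch of the command-line wiring augment() expects; the --data and --add flags mirror the args.data and args.add attributes read above, and everything else here is an assumption.

import argparse

# Hypothetical CLI entry point; only .data and .add are actually read by augment()
parser = argparse.ArgumentParser(description="Augment a folder of .flac files")
parser.add_argument("--data", required=True,
                    help="root folder containing the input .flac files")
parser.add_argument("--add", action="store_true",
                    help="also save the resampled originals next to the augmented files")
augment(parser.parse_args())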
def _calculate_feature(self, input_gs: gpd.GeoSeries):
    if self.intersect_tbl_name_dict is None or self.input_geom_table is None:
        raise Exception("Must use an OSM feature factory before extracting the feature")
    eng = get_sqlalchemy_engine()
    # input_table = save_geo_series_to_tmp_table(input_gs, eng)
    # calculate the feature
    conn = connect_to_db()
    query = self._build_postgres_query()
    res = get_df(query, conn=conn)
    routes = res[['source_point', 'dest_point']].values

    def f(route):
        # each worker opens its own connection; DB-API connections are
        # generally not safe to share across threads
        conn = connect_to_db()
        rout_query = f"""
        with source as (select st_setsrid(st_astext({"'" + route[0] + "'"}), 4326)),
        target as (select st_setsrid(st_astext({"'" + route[1] + "'"}), 4326)),
        node1 as (select way, id::integer from ways_vertices_pgr
                  order by way <#> (select * from source) limit 1),
        node2 as (select way, id::integer from ways_vertices_pgr
                  order by way <#> (select * from target) limit 1),
        route as (select * from pgr_dijkstra(
            'select gid::integer as id, source::integer, target::integer,
             length_m::float as cost from ways',
            (select id from node1), (select id from node2), false))
        select sum(route.cost)::float as distance,
               sum(((route.cost/1000)/ways.maxspeed_forward)*60*60)::float as duration
        from route join ways on route.edge = ways.gid;
        """
        shortest_route = get_df(rout_query, conn=conn)
        distance = shortest_route.distance[0]
        # the query returns duration in seconds; convert to minutes
        duration = shortest_route.duration[0] / 60.0
        conn.close()
        return [distance, duration]

    # results_list = [f(r) for r in routes]
    with ThreadPool(4) as p:
        results_list = p.map(f, routes)
    results_df = pd.DataFrame(results_list)
    results_df = pd.concat([results_df, res['geom_id']],
                           names=[self.feature_names, 'geom_id'], axis=1)
    # edit the df
    full_df = pd.DataFrame(index=range(len(input_gs)), columns=self.feature_names)
    if len(res['geom_id']) != 0:
        full_df.iloc[results_df['geom_id']] = results_df.drop('geom_id', axis=1).values
    full_df.fillna(self.default_value, inplace=True)
    full_df['geom'] = input_gs
    # close up the business
    conn.close()
    eng.dispose()
    return full_df
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                             extension, skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier; check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()

    # Process the utterances for each speaker
    with ThreadPool(8) as pool:
        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name,
                  len(speaker_dirs), unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
def _calculate_feature(self, input_gs: gpd.GeoSeries):
    if self.intersect_tbl_name_dict is None or self.input_geom_table is None:
        raise Exception("Must use an OSM feature factory before extracting the feature")
    eng = get_sqlalchemy_engine()
    # input_table = save_geo_series_to_tmp_table(input_gs, eng)
    # calculate the feature
    conn = connect_to_db()
    query = self._build_postgres_query()
    res = get_df(query, conn=conn)
    routes = res[['source_point', 'dest_point']].values

    def f(route_start_dest):
        route_coords = wkt_to_centers(route_start_dest)
        # distance in meters, time in seconds
        shortest_route = self.client.directions(route_coords,
                                                profile=self.transportation_type,
                                                preference='recommended',
                                                instructions=False,
                                                geometry=False)
        summary = shortest_route['routes'][0]['summary']
        if not summary:
            # for source and target points that are very close to each other
            distance = 0
            duration = 0
        else:
            distance = summary['distance']
            # we want duration in minutes
            duration = summary['duration'] / 60.0 if 'duration' in summary else 0
        return [distance, duration]

    with ThreadPool(4) as p:
        results_list = p.map(f, routes)
    results_df = pd.DataFrame(results_list)
    results_df = pd.concat([results_df, res['geom_id']],
                           names=[self.feature_names, 'geom_id'], axis=1)
    # edit the df
    full_df = pd.DataFrame(index=range(len(input_gs)), columns=self.feature_names)
    if len(res['geom_id']) != 0:
        full_df.iloc[results_df['geom_id']] = results_df.drop('geom_id', axis=1).values
    full_df.fillna(self.default_value, inplace=True)
    full_df['geom'] = input_gs
    # close up the business
    conn.close()
    eng.dispose()
    return full_df
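The wkt_to_centers helper is not defined in this snippet. A plausible shapely-based reconstruction, under the assumption that source_point and dest_point are WKT strings and that client.directions() expects (lon, lat) coordinate pairs:

from shapely import wkt

# Hypothetical reconstruction of the missing helper: parse both WKT
# geometries and return their centroid coordinates as [lon, lat] pairs,
# the coordinate order openrouteservice expects.
def wkt_to_centers(route_start_dest):
    return [list(wkt.loads(g).centroid.coords[0]) for g in route_start_dest]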
def launchCMAESForAllTargetSizesMulti(rs):
    '''
    Launch the CMA-ES optimizations for all target sizes in parallel,
    one worker per target size.
    '''
    # initialize a pool of workers
    p = ThreadPool(processes=4)
    # run cmaes on each target size on a separate worker
    p.map(partial(launchCMAESForSpecificTargetSize, rs=rs, save=False),
          rs.target_size)
    p.close()
    p.join()
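The ThreadPool/partial/map pattern above recurs throughout these launchers. A minimal self-contained sketch, with a hypothetical worker standing in for launchCMAESForSpecificTargetSize:

from functools import partial
from multiprocessing.pool import ThreadPool

# Hypothetical worker: first positional argument comes from the map iterable,
# the keyword arguments are frozen by partial()
def run_optimization(target_size, rs, save):
    print("optimizing target size %s (save=%s, rs=%s)" % (target_size, save, rs))

p = ThreadPool(processes=4)
# partial() fixes the shared arguments; map() feeds one target size per call,
# so the four sizes run concurrently on worker threads
p.map(partial(run_optimization, rs="setup.cfg", save=False),
      [0.005, 0.01, 0.02, 0.04])
p.close()
p.join()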
def launch(rs):
    all_points = []
    for target_size in [0.005, 0.01, 0.02, 0.04]:
        for i in range(15):
            if not check_if_theta_file_exists(target_size, i):
                all_points.append([i, target_size])
    if not all_points:
        return  # nothing missing; ThreadPool requires at least one worker
    p = ThreadPool(processes=len(all_points))
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESMissing, rs, True, 6),
          [[point[0], posIni[point[0]], point[1]] for point in all_points])
    p.close()
    p.join()
def launchCMAESForAllPoint(rs, target_size, save, noise=None):
    """
    Launch the CMA-ES optimization for each starting point in parallel.

    input:
        rs: setup file
        target_size: size of the target
        save: whether to save the experiment log
        noise: noise on the muscles; if None, the default noise from the
            muscle setup is used
    """
    p = ThreadPool(processes=15)
    # run cmaes on each starting point on a separate worker
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESForSpecificTargetSizeAndSpecificPoint,
                  target_size, rs, save, noise=noise),
          enumerate(posIni))
    p.close()
    p.join()
def Pool(processes=None, initializer=None, initargs=()):
    from multiprocess.pool import ThreadPool
    return ThreadPool(processes, initializer, initargs)
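A short usage sketch: the factory returns a thread-backed pool that exposes the familiar Pool API, so it works as a drop-in replacement where process isolation is not needed.

# Usage sketch for the factory above
p = Pool(processes=2)
print(p.map(len, ["thread", "pool"]))  # -> [6, 4]
p.close()
p.join()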
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                             extension, skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier; check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Instead of a mel spectrogram, extract raw audio patches for the
            # fCNN: 20 ms Hamming windows with 50% overlap
            # frames = audio.wav_to_mel_spectrogram(wav)
            win = np.hamming(int(sampling_rate * 0.02))
            inc = int(win.shape[0] / 2)
            frames = get_frame_from_file(wav, win=win, inc=inc, sr=sampling_rate,
                                         n_channels=1, duration=None)
            frames = np.transpose(frames)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()

    # Process the utterances for each speaker.
    # NOTE: lowering the pool size from 8 to 1 reportedly helped preprocess the
    # VoxCeleb2 dataset, for unknown reasons; see
    # https://www.gitmemory.com/issue/CorentinJ/Real-Time-Voice-Cloning/76/529013562
    with ThreadPool(8) as pool:
        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name,
                  len(speaker_dirs), unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
    './data/raw/data-sample_data-nyctaxi-trips-2012-json_corrigido.json'
]

import threading

# total_sent is updated from several worker threads, so guard it with a lock;
# += on a shared int is not atomic in Python
_total_lock = threading.Lock()

def send_register(registers):
    global total_sent
    length = len(registers)
    with _total_lock:
        total_sent += length
        print(f"Total: {total_sent}/4MI")
    records = [{'Data': data.encode(), 'PartitionKey': '1'} for data in registers]
    client.put_records(
        Records=records,
        StreamName=STREAM_NAME,
    )

def process_file(file):
    with open(file, 'r') as buffer:
        registers = buffer.readlines(max_bytes)
        while len(registers):
            send_register(registers)
            registers = buffer.readlines(max_bytes)

pool = ThreadPool(4)
pool.map(process_file, files)
pool.close()
pool.join()
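One caveat: the Kinesis PutRecords API accepts at most 500 records per request, so a batch read with readlines(max_bytes) may exceed that limit. A hypothetical chunking wrapper around the send_register function above:

# Hypothetical helper: split a batch so each put_records call stays within
# the 500-records-per-request limit of Kinesis PutRecords
def send_in_chunks(registers, chunk_size=500):
    for i in range(0, len(registers), chunk_size):
        send_register(registers[i:i + chunk_size])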