Example #1
def lauchCMAESForListOfPoints(target_size, rs, save, points):
    p = ThreadPool(processes=len(points))
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(
        partial(launchCMAESForSpecificTargetSizeAndSpecificPointMulti,
                target_size, rs, save), [[i, posIni[i]] for i in points])
    p.close()
    p.join()
Example #2
def augment(args):
    aug = Augmenter(sampling_rate=sampling_rate)
    data_path = args.data
    inn = data_path.split('/')[-1]
    if inn == '':
        inn = data_path.split('/')[-2]
    outn = inn + '_aug'
    data_path = Path(data_path)
    filepaths = list(data_path.glob("**/*.flac"))

    def augment_file(filepath):
        filename = filepath.stem
        save_filename = str(filename) + '_aug'
        save_orig_path = str(filepath).replace(inn, outn)
        save_filepath = save_orig_path.replace(filename, save_filename)
        save_filepath = Path(save_filepath)
        save_filepath.parent.mkdir(parents=True, exist_ok=True)
        data, sr = aug.load(filepath)
        data = aug.resample(data, sr)
        augmented = aug.augment(data)
        aug.save(save_filepath, augmented)
        if args.add:
            aug.save(save_orig_path, data)

    # Multi-threading
    with ThreadPool(8) as pool:
        list(
            tqdm(pool.imap(augment_file, filepaths),
                 'Aug',
                 len(filepaths),
                 unit="files"))
Example #3
    def _calculate_feature(self, input_gs: gpd.GeoSeries):
        if self.intersect_tbl_name_dict is None or self.input_geom_table is None:
            raise Exception("Must use an OSM feature factory before extracting the feature")

        eng = get_sqlalchemy_engine()
        # input_table = save_geo_series_to_tmp_table(input_gs, eng)

        # calculate the feature
        conn = connect_to_db()
        query = self._build_postgres_query()
        res = get_df(query, conn=conn)
        routes = res[['source_point', 'dest_point']].values

        def f(route):
            conn = connect_to_db()
            rout_query = f"""
            with source as (select st_setsrid(st_astext({"'" + route[0] + "'"}), 4326)),
            
                 target as (select st_setsrid(st_astext({"'" + route[1] + "'"}), 4326)),
            
                 node1 as(select way, id::integer from ways_vertices_pgr order by way <#> (select * from source) limit 1),

                 node2 as(select way, id::integer from ways_vertices_pgr order by way <#> (select * from target) limit 1),

                 route as (select * from pgr_dijkstra('select gid::integer as id, 
                                                   source::integer, target::integer, 
                                                   length_m::float as cost from ways',
                                                   (select id from node1),
                                                   (select id from node2), false))
            select sum(route.cost)::float as distance, 
                   sum(((route.cost/1000)/ways.maxspeed_forward)*60*60)::float as duration 
            from route join ways on route.edge=ways.gid;
            """
            shortest_route = get_df(rout_query, conn=conn)
            distance = shortest_route.distance[0]
            duration = shortest_route.duration[0] / 60.0  # shortest_route.duration[0] is in seconds !
            conn.close()
            return [distance, duration]

        # results_list = [f(r) for r in routes]
        with ThreadPool(4) as p:
            results_list = p.map(f, routes)

        results_df = pd.DataFrame(results_list)
        results_df = pd.concat([results_df, res['geom_id']], names=[self.feature_names, 'geom_id'], axis=1)

        # edit the df
        full_df = pd.DataFrame(index=range(len(input_gs)), columns=self.feature_names)
        if len(res['geom_id']) != 0:
            full_df.iloc[results_df['geom_id']] = results_df.drop('geom_id', axis=1).values
        full_df.fillna(self.default_value, inplace=True)
        full_df['geom'] = input_gs

        # close up the business
        conn.close()
        eng.dispose()

        return full_df
Example #4
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                             skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
    
    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
        
        # Create an output directory with that name, as well as a txt file containing a 
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
        
        # There's a possibility that the preprocessing was interrupted earlier, check if 
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()
        
        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue
                
            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue
            
            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue
            
            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
        
        sources_file.close()
    
    # Process the utterances for each speaker
    with ThreadPool(8) as pool:
        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
                  unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
Example #5
    def _calculate_feature(self, input_gs: gpd.GeoSeries):
        if self.intersect_tbl_name_dict is None or self.input_geom_table is None:
            raise Exception(
                "Must use an OSM feature factory before extracting the feature"
            )

        eng = get_sqlalchemy_engine()
        # input_table = save_geo_series_to_tmp_table(input_gs, eng)

        # calculate the feature
        conn = connect_to_db()
        query = self._build_postgres_query()
        res = get_df(query, conn=conn)
        routes = res[['source_point', 'dest_point']].values

        def f(route_start_dest):
            route_coords = wkt_to_centers(route_start_dest)
            # distance in meters, time in seconds
            shortest_route = self.client.directions(
                route_coords,
                profile=self.transportation_type,
                preference='recommended',
                instructions=False,
                geometry=False)

            summary = shortest_route['routes'][0]['summary']
            if not summary:
                # for source and target points that are very close to each other
                distance = 0
                duration = 0
            else:
                distance = summary['distance']
                # we want duration in minutes
                duration = summary['duration'] / 60.0 if 'duration' in summary else 0
            return [distance, duration]

        with ThreadPool(4) as p:
            results_list = p.map(f, routes)

        results_df = pd.DataFrame(results_list)
        results_df = pd.concat([results_df, res['geom_id']],
                               names=[self.feature_names, 'geom_id'],
                               axis=1)

        # edit the df
        full_df = pd.DataFrame(index=range(len(input_gs)),
                               columns=self.feature_names)
        if len(res['geom_id']) != 0:
            full_df.iloc[results_df['geom_id']] = results_df.drop(
                'geom_id', axis=1).values
        full_df.fillna(self.default_value, inplace=True)
        full_df['geom'] = input_gs

        # close up the business
        conn.close()
        eng.dispose()

        return full_df
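`self.client` in this example appears to be an openrouteservice client (the `directions` call and its `profile`, `preference`, `instructions`, and `geometry` arguments match that package). A hedged sketch of creating and calling such a client, with a placeholder API key and illustrative coordinates:

# Assumes the openrouteservice Python package; the key and coordinates are placeholders.
import openrouteservice

client = openrouteservice.Client(key="YOUR_ORS_API_KEY")
route = client.directions([(8.681495, 49.41461), (8.687872, 49.420318)],
                          profile="driving-car",
                          preference="recommended",
                          instructions=False,
                          geometry=False)
print(route["routes"][0]["summary"])  # e.g. {'distance': ..., 'duration': ...}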
Example #6
def launchCMAESForAllTargetSizesMulti(rs):
    '''
    Launch the CMA-ES optimization for each target size in parallel (on different processors).
    '''
    # initialize a pool of workers (thread-based parallelism)
    p = ThreadPool(processes=4)
    # run CMA-ES on each target size on a separate worker
    p.map(partial(launchCMAESForSpecificTargetSize, rs=rs, save=False),
          rs.target_size)
    p.close()
    p.join()
Example #7
def launch(rs):
    all_points = []
    for target_size in [0.005, 0.01, 0.02, 0.04]:
        for i in range(15):
            if not check_if_theta_file_exists(target_size, i):
                all_points.append([i, target_size])

    p = ThreadPool(processes=len(all_points))
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESMissing, rs, True, 6),
          [[point[0], posIni[point[0]], point[1]] for point in all_points])
    p.close()
    p.join()
Example #8
def launchCMAESForAllTargetSizesMulti(rs):
    '''
    Launch the CMA-ES optimization for each target size in parallel (on different processors).
    '''
    # initialize a pool of workers (thread-based parallelism)
    p = ThreadPool(processes=4)
    # run CMA-ES on each target size on a separate worker
    p.map(partial(launchCMAESForSpecificTargetSize, rs=rs, save=False), rs.target_size)
    p.close()
    p.join()
Example #9
def launch(rs):
    all_points = []
    for target_size in [0.005, 0.01, 0.02, 0.04]:
        for i in range(15):
            if not check_if_theta_file_exists(target_size, i):
                all_points.append([i, target_size])

    p = ThreadPool(processes=len(all_points))
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESMissing, rs, True, 6), [[point[0], posIni[point[0]], point[1]] for point in all_points])
    p.close()
    p.join()
Example #10
def launchCMAESForAllPoint(rs, target_size, save, noise=None):
    """
        Launch in parallel (on differents processor) the cmaes optimization for each point
        input:
                    rs: setup file
                    target_size: size of the target
                    save: for save experience log
                    noise: noise on muscle, if None, defalt noise from muscle setup
    
    """
    p = ThreadPool(processes=15)
    #run cmaes on each targets size on separate processor
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESForSpecificTargetSizeAndSpecificPoint, target_size, rs, save, noise=noise), enumerate(posIni))
    p.close()
    p.join()
Example #11
def launchCMAESForAllPoint(rs, target_size, save, noise=None):
    """
        Launch in parallel (on differents processor) the cmaes optimization for each point
        input:
                    rs: setup file
                    target_size: size of the target
                    save: for save experience log
                    noise: noise on muscle, if None, defalt noise from muscle setup
    
    """
    p = ThreadPool(processes=15)
    #run cmaes on each targets size on separate processor
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(
        partial(launchCMAESForSpecificTargetSizeAndSpecificPoint,
                target_size,
                rs,
                save,
                noise=noise), enumerate(posIni))
    p.close()
    p.join()
Example #12
def Pool(processes=None, initializer=None, initargs=()):
    from multiprocess.pool import ThreadPool
    return ThreadPool(processes, initializer, initargs)
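This is just a thin factory around `multiprocess.pool.ThreadPool` (from the `multiprocess` fork of `multiprocessing`). A short usage sketch, with an illustrative worker function:

# Illustrative use of the Pool() factory above; square() is a made-up worker.
def square(x):
    return x * x

pool = Pool(processes=4)
print(pool.map(square, range(10)))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
pool.close()
pool.join()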
Example #13
def lauchCMAESForListOfPoints(target_size, rs, save, points):
    p = ThreadPool(processes=len(points))
    posIni = np.loadtxt(pathDataFolder + rs.experimentFilePosIni)
    p.map(partial(launchCMAESForSpecificTargetSizeAndSpecificPointMulti, target_size, rs, save), [[i, posIni[i]] for i in points])
    p.close()
    p.join()
Example #14
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root,
                             out_dir, extension, skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." %
          (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {
                        line.split(",")[0]
                        for line in sources_file
                    }
            except Exception:
                existing_fnames = set()
        else:
            existing_fnames = set()

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            # frames = audio.wav_to_mel_spectrogram(wav)

            # Extract raw audio patches for fCNN
            win = np.hamming(int(sampling_rate * 0.02))
            inc = int(win.shape[0] / 2)
            frames = get_frame_from_file(wav,
                                         win=win,
                                         inc=inc,
                                         sr=sampling_rate,
                                         n_channels=1,
                                         duration=None)
            frames = np.transpose(frames)

            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()

    # Process the utterances for each speaker
    ## Changed from 8 to 1: this helped preprocess the VoxCeleb2 dataset, though the reason is unclear.
    ## See issue: https://www.gitmemory.com/issue/CorentinJ/Real-Time-Voice-Cloning/76/529013562
    with ThreadPool(8) as pool:
        list(
            tqdm(pool.imap(preprocess_speaker, speaker_dirs),
                 dataset_name,
                 len(speaker_dirs),
                 unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
Example #15
    './data/raw/data-sample_data-nyctaxi-trips-2012-json_corrigido.json'
]


def send_register(registers):
    global total_sent

    length = len(registers)
    total_sent += length
    records = [{
        'Data': data.encode(),
        'PartitionKey': '1'
    } for data in registers]
    print(f"Total: {total_sent}/4MI")
    client.put_records(
        Records=records,
        StreamName=STREAM_NAME,
    )


def process_file(file):
    with open(file, 'r') as buffer:
        registers = buffer.readlines(max_bytes)
        while len(registers):
            send_register(registers)
            registers = buffer.readlines(max_bytes)


pool = ThreadPool(4)
pool.map(process_file, files)
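The excerpt starts mid-way through the `files` list and does not show how `client`, `STREAM_NAME`, `max_bytes`, or `total_sent` are defined. A minimal setup sketch for those globals, with all names, values, and the ThreadPool import assumed:

# Assumed setup for the Kinesis example above; the stream name, region and batch size are illustrative.
import boto3
from multiprocessing.pool import ThreadPool

STREAM_NAME = "example-stream"
max_bytes = 500 * 1024   # read-ahead hint passed to readlines()
total_sent = 0
client = boto3.client("kinesis", region_name="us-east-1")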