# requires: numpy as np, pandas as pd, multiprocessing.Pool, collections.defaultdict
def get_dict_features_from_df_parallel(self, df, nworkers=8):
    print("extracting features...")

    # split the data frame into nworkers chunks and extract features from each chunk in its own process
    df_split = np.array_split(df, nworkers)
    pool = Pool(nworkers)
    res_dicts = pool.map(self.get_dict_features_from_df, df_split)
    pool.close()  # informs the pool that no new tasks will be submitted
    pool.join()   # waits for all worker processes to finish before proceeding with the rest of the code

    # merge feature dictionaries created for the data frame splits into one big dictionary
    big_dic = defaultdict(lambda: defaultdict(int))
    for dic in res_dicts:
        for k, v in dic.items():
            big_dic[k] = v

    # one-hot encode month/weekday, drop the raw event/venue columns, and attach the
    # extracted features, aligning everything on the original data frame index
    return pd.concat([
        pd.get_dummies(df[df.columns.difference(["event", "venue"])],
                       prefix="@", columns=["month", "weekday"]),
        pd.DataFrame.from_dict(big_dic, orient="index")
    ], axis=1, join_axes=[df.index]).fillna(0.)
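# Compatibility note: the join_axes argument of pd.concat was deprecated in pandas 0.25 and
# removed in pandas 1.0. On newer pandas the same row alignment can be obtained by reindexing
# the concatenated frame instead, e.g.:
#
#     pd.concat([
#         pd.get_dummies(df[df.columns.difference(["event", "venue"])],
#                        prefix="@", columns=["month", "weekday"]),
#         pd.DataFrame.from_dict(big_dic, orient="index")
#     ], axis=1).reindex(df.index).fillna(0.)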
def parallelize_dataframe(self, df, func):
    # generic helper: split the data frame, apply func to each chunk in a worker process, and
    # concatenate the partial results back into a single data frame; note that the split count
    # and pool size are hard-coded to 1 here, so the call effectively runs serially
    df_split = np.array_split(df, 1)
    pool = Pool(1)
    rr = pool.map(func, df_split)
    df = pd.concat(rr)
    pool.close()
    pool.join()
    return df
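# Usage sketch (hypothetical names): `fe` stands for an instance of the class these methods
# belong to, and `normalize_chunk` for any function that maps a DataFrame chunk to a DataFrame,
# which is what pd.concat needs to stitch the pieces back together.
#
#     def normalize_chunk(chunk):
#         chunk = chunk.copy()
#         num = chunk.select_dtypes("number")
#         chunk[num.columns] = (num - num.mean()) / num.std()
#         return chunk
#
#     df = fe.parallelize_dataframe(df, normalize_chunk)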
def parallel_launcher(data_dir, data, worker, pool_size, files_num):
    # collect the input files and group them into batches of files_num files,
    # tagging each batch with its index j so the worker can tell batches apart
    files = modified_get_files(data_dir)
    batches = [(files[i:i + files_num], data, j)
               for j, i in enumerate(range(0, len(files), files_num))]

    # starmap unpacks each (file_batch, data, batch_idx) tuple into the worker's arguments
    pool = Pool(pool_size)
    output = pool.starmap(worker, batches)
    pool.close()
    pool.join()
    return output
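# A hypothetical worker with the signature parallel_launcher expects: starmap unpacks each
# (file_batch, data, batch_idx) tuple from `batches` into the three arguments below. The body
# is only a sketch of the expected shape, not the project's actual worker.
def count_lines_worker(file_batch, data, batch_idx):
    counts = {}
    for path in file_batch:
        with open(path) as fh:
            counts[path] = sum(1 for _ in fh)  # `data` stays available for shared lookups if needed
    return counts

# example call (assumes data_dir, data and pool_size are defined by the caller):
#     results = parallel_launcher(data_dir, data, count_lines_worker, pool_size=8, files_num=100)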